xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsddi.c (revision fc80c0dfb0c877aee828d778ea32b77fcf7b1ef4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/conf.h>
30 #include <sys/modctl.h>
31 #include <sys/stat.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/priv_names.h>
37 #include <inet/common.h>
38 
39 #define	_SUN_TPI_VERSION 2
40 #include <sys/tihdr.h>
41 #include <sys/timod.h>
42 #include <sys/tiuser.h>
43 #include <sys/suntpi.h>
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/mi.h>
47 #include <sys/ib/clients/rds/rds.h>
48 #include <sys/policy.h>
49 #include <inet/ipclassifier.h>
50 #include <sys/ib/clients/rds/rds_kstat.h>
51 #include "sys/random.h"
52 #include <sys/ib/clients/rds/rds_transport.h>
53 #include <sys/ib/ibtl/ibti.h>
54 
55 
/* Identity of the driver: minor node name, streamtab and description. */
#define	RDS_NAME	"rds"
#define	RDS_STRTAB	rdsinfo
#define	RDS_DEVDESC	"RDS STREAMS driver %I%"
#define	RDS_DEVMINOR	0
/*
 * Parenthesized so the OR-ed flags stay a single value no matter which
 * operators surround the macro at its point of use.
 */
#define	RDS_DEVMTFLAGS	(D_MP | D_SYNCSTR)
#define	RDS_DEFAULT_PRIV_MODE	0666

/* Inclusive range of port numbers handed out by rds_bind(). */
#define	rds_smallest_port	1
#define	rds_largest_port	65535

/* Default queue watermarks, in bytes. */
#define	RDS_RECV_HIWATER	(56 * 1024)
#define	RDS_RECV_LOWATER	128
#define	RDS_XMIT_HIWATER	(56 * 1024)
#define	RDS_XMIT_LOWATER	1024

/* Expands to "0 &&" so that a following parenthesized call is never run. */
#define	RDS_DPRINTF2	0 &&
/* Prefix used in the driver's debug messages. */
#define	LABEL	"RDS"
73 
/* A pair of port numbers: where a message came from and where it goes. */
struct rdsahdr_s {
	in_port_t	uha_src_port;	/* Source port */
	in_port_t	uha_dst_port;	/* Destination port */
};
typedef struct rdsahdr_s rdsha_t;
78 
/* Presumably the byte size of rdsha_t (two in_port_t fields) - TODO confirm */
#define	RDSH_SIZE	4

/*
 * Queue watermarks applied to every stream in rds_open(). They are plain
 * globals initialized from the RDS_*_HIWATER / RDS_*_LOWATER defaults.
 */
int rds_recv_hiwat = RDS_RECV_HIWATER;
int rds_recv_lowat = RDS_RECV_LOWATER;
int rds_xmit_hiwat = RDS_XMIT_HIWATER;
int rds_xmit_lowat = RDS_XMIT_LOWATER;

/* Not referenced in this file; presumably the level consulted by dprint(). */
int rdsdebug;

/* dev_info of the driver instance; set by rds_attach(), read by rds_info(). */
static dev_info_t *rds_dev_info;

/* Hint not protected by any lock */
static	in_port_t	rds_next_port_to_try;

/* LDI identity obtained in _init(); NULL when it could not be obtained. */
ldi_ident_t rds_li;
/* Number of ports in the valid range; rds_bind() gives up after this many tries. */
static int loopmax = rds_largest_port - rds_smallest_port + 1;

/* global configuration variables */
uint_t  UserBufferSize;
uint_t  rds_rx_pkts_pending_hwm;
99 
100 extern void rds_ioctl(queue_t *, mblk_t *);
101 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp);
102 
103 int rds_open_transport_driver();
104 int rds_close_transport_driver();
105 
106 #define	RDS_CURRENT_PORT_QUOTA()					\
107 	(rds_rx_pkts_pending_hwm/RDS_GET_NPORT())
108 
109 krwlock_t	rds_transport_lock;
110 ldi_handle_t	rds_transport_handle = NULL;
111 rds_transport_ops_t *rds_transport_ops = NULL;
112 
113 static int
114 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
115 {
116 	int	ret;
117 
118 	if (cmd != DDI_ATTACH)
119 		return (DDI_FAILURE);
120 
121 	rds_dev_info = devi;
122 
123 	ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR,
124 	    RDS_DEVMINOR, DDI_PSEUDO, 0);
125 	if (ret != DDI_SUCCESS) {
126 		return (ret);
127 	}
128 
129 	return (DDI_SUCCESS);
130 }
131 
132 static int
133 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
134 {
135 	if (cmd != DDI_DETACH)
136 		return (DDI_FAILURE);
137 
138 	ASSERT(devi == rds_dev_info);
139 
140 	ddi_remove_minor_node(devi, NULL);
141 
142 	return (DDI_SUCCESS);
143 }
144 
145 /* ARGSUSED */
146 static int
147 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
148 {
149 	int error = DDI_FAILURE;
150 
151 	switch (cmd) {
152 	case DDI_INFO_DEVT2DEVINFO:
153 		if (rds_dev_info != NULL) {
154 			*result = (void *)rds_dev_info;
155 			error = DDI_SUCCESS;
156 		}
157 		break;
158 
159 	case DDI_INFO_DEVT2INSTANCE:
160 		*result = NULL;
161 		error = DDI_SUCCESS;
162 		break;
163 
164 	default:
165 		break;
166 	}
167 
168 	return (error);
169 }
170 
171 
172 /*ARGSUSED*/
173 static int
174 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
175 {
176 	rds_t	*rds;
177 	int	ret;
178 
179 	if (is_system_labeled()) {
180 		/*
181 		 * RDS socket is not supported on labeled systems
182 		 */
183 		return (ESOCKTNOSUPPORT);
184 	}
185 
186 	/* Open the transport driver if IB HW is present */
187 	rw_enter(&rds_transport_lock, RW_READER);
188 	if (rds_transport_handle == NULL) {
189 		rw_exit(&rds_transport_lock);
190 		ret = rds_open_transport_driver();
191 		rw_enter(&rds_transport_lock, RW_READER);
192 
193 		if (ret != 0) {
194 			/* Transport driver failed to load */
195 			rw_exit(&rds_transport_lock);
196 			return (ret);
197 		}
198 	}
199 	rw_exit(&rds_transport_lock);
200 
201 	if (sflag == MODOPEN) {
202 		return (EINVAL);
203 	}
204 
205 	/* Reopen not supported */
206 	if (q->q_ptr != NULL) {
207 		dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr));
208 		return (0);
209 	}
210 
211 	rds = rds_create(q, credp);
212 	if (rds == NULL) {
213 		dprint(2, ("%s: rds_create failed", LABEL));
214 		return (0);
215 	}
216 
217 	q->q_ptr = WR(q)->q_ptr = rds;
218 	rds->rds_state = TS_UNBND;
219 	rds->rds_family = AF_INET_OFFLOAD;
220 
221 	q->q_hiwat = rds_recv_hiwat;
222 	q->q_lowat = rds_recv_lowat;
223 
224 	qprocson(q);
225 
226 	WR(q)->q_hiwat = rds_xmit_hiwat;
227 	WR(q)->q_lowat = rds_xmit_lowat;
228 
229 	/* Set the Stream head watermarks */
230 	(void) mi_set_sth_hiwat(q, rds_recv_hiwat);
231 	(void) mi_set_sth_lowat(q, rds_recv_lowat);
232 
233 	return (0);
234 }
235 
236 static int
237 rds_close(queue_t *q)
238 {
239 	rds_t *rdsp = (rds_t *)q->q_ptr;
240 
241 	qprocsoff(q);
242 
243 	/*
244 	 * NPORT should be decremented only if this socket was previously
245 	 * bound to an RDS port.
246 	 */
247 	if (rdsp->rds_state >= TS_IDLE) {
248 		RDS_DECR_NPORT();
249 		RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA());
250 		rds_transport_ops->
251 		    rds_transport_resume_port(ntohs(rdsp->rds_port));
252 	}
253 
254 	/* close the transport driver if this is the last socket */
255 	if (RDS_GET_NPORT() == 1) {
256 		(void) rds_close_transport_driver();
257 	}
258 
259 	/*
260 	 * We set the flags without holding a lock as this is
261 	 * just a hint for the fanout lookup to skip this rds.
262 	 * We dont free the struct until it's out of the hash and
263 	 * the ref count goes down.
264 	 */
265 	rdsp->rds_flags |= RDS_CLOSING;
266 	rds_bind_hash_remove(rdsp, B_FALSE);
267 	mutex_enter(&rdsp->rds_lock);
268 	ASSERT(rdsp->rds_refcnt > 0);
269 	if (rdsp->rds_refcnt != 1) {
270 		cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock);
271 	}
272 	mutex_exit(&rdsp->rds_lock);
273 	RDS_DEC_REF_CNT(rdsp);
274 	RD(q)->q_ptr = NULL;
275 	WR(q)->q_ptr = NULL;
276 	return (0);
277 }
278 
279 /*
280  * Add a new message to the socket
281  */
282 int
283 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr,
284     in_port_t local_port, in_port_t rem_port, zoneid_t zoneid)
285 {
286 	rds_t *rds;
287 	struct  T_unitdata_ind  *tudi;
288 	int	udi_size;	/* Size of T_unitdata_ind */
289 	mblk_t *mp1;
290 	sin_t	*sin;
291 	int error = 0;
292 
293 	local_port = htons(local_port);
294 	rem_port = htons(rem_port);
295 
296 	ASSERT(mp->b_datap->db_type == M_DATA);
297 	rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid);
298 	if (rds == NULL) {
299 		dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL,
300 		    local_addr, rem_addr, ntohs(local_port), ntohs(rem_port)));
301 		freemsg(mp);
302 		return (error);
303 	}
304 
305 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
306 
307 	/* Allocate a message block for the T_UNITDATA_IND structure. */
308 	mp1 = allocb(udi_size, BPRI_MED);
309 	if (mp1 == NULL) {
310 		dprint(2, ("%s: allocb failed", LABEL));
311 		freemsg(mp);
312 		return (ENOMEM);
313 	}
314 
315 	mp1->b_cont = mp;
316 	mp = mp1;
317 	mp->b_datap->db_type = M_PROTO;
318 	tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr;
319 	mp->b_wptr = (uchar_t *)tudi + udi_size;
320 	tudi->PRIM_type = T_UNITDATA_IND;
321 	tudi->SRC_length = sizeof (sin_t);
322 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
323 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
324 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
325 	tudi->OPT_length = udi_size;
326 	sin = (sin_t *)&tudi[1];
327 	sin->sin_addr.s_addr = rem_addr;
328 	sin->sin_port = ntohs(rem_port);
329 	sin->sin_family = rds->rds_family;
330 	*(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0;
331 	*(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0;
332 
333 	putnext(rds->rds_ulpd, mp);
334 
335 	/* check port quota */
336 	if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) {
337 		ulong_t current_port_quota = RDS_GET_PORT_QUOTA();
338 		if (rds->rds_port_quota > current_port_quota) {
339 			/* this may result in stalling the port */
340 			rds->rds_port_quota = current_port_quota;
341 			(void) mi_set_sth_hiwat(rds->rds_ulpd,
342 			    rds->rds_port_quota * UserBufferSize);
343 			RDS_INCR_PORT_QUOTA_ADJUSTED();
344 		}
345 	}
346 
347 	/*
348 	 * canputnext() check is done after putnext as the protocol does
349 	 * not allow dropping any received packet.
350 	 */
351 	if (!canputnext(rds->rds_ulpd)) {
352 		error = ENOSPC;
353 	}
354 
355 	RDS_DEC_REF_CNT(rds);
356 	return (error);
357 }
358 
359 
/*
 * Default structure copied into T_INFO_ACK messages
 *
 * rds_info_req() copies it and then overrides CURRENT_state and OPT_size;
 * rds_capability_req() copies it unchanged.
 */
static struct T_info_ack rds_g_t_info_ack_ipv4 = {
	T_INFO_ACK,
	65535,	/* TSDU_size. Excl. headers */
	T_INVALID,	/* ETSDU_size.  rds does not support expedited data. */
	T_INVALID,	/* CDATA_size. rds does not support connect data. */
	T_INVALID,	/* DDATA_size. rds does not support disconnect data. */
	sizeof (sin_t),	/* ADDR_size. */
	0,		/* OPT_size - not initialized here */
	65535,		/* TIDU_size.  Excl. headers */
	T_CLTS,		/* SERV_type.  rds supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from rds_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
374 
375 static in_port_t
376 rds_update_next_port(in_port_t port)
377 {
378 	(void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t));
379 	if (port < rds_smallest_port)
380 		port = rds_smallest_port;
381 	return (port);
382 }
383 
384 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
385 static void
386 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
387 {
388 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
389 		qreply(q, mp);
390 }
391 
392 static void
393 rds_capability_req(queue_t *q, mblk_t *mp)
394 {
395 	t_uscalar_t	cap_bits1;
396 	struct T_capability_ack *tcap;
397 
398 	cap_bits1 =
399 	    ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1;
400 
401 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
402 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
403 	if (mp == NULL)
404 		return;
405 	tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr;
406 	tcap->CAP_bits1 = 0;
407 
408 	if (cap_bits1 & TC1_INFO) {
409 		tcap->CAP_bits1 |= TC1_INFO;
410 		*(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4;
411 	}
412 
413 	qreply(q, mp);
414 }
415 
416 static void
417 rds_info_req(queue_t *q, mblk_t *omp)
418 {
419 	rds_t *rds = (rds_t *)q->q_ptr;
420 	struct T_info_ack *tap;
421 	mblk_t *mp;
422 
423 	/* Create a T_INFO_ACK message. */
424 	mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO,
425 	    T_INFO_ACK);
426 	if (mp == NULL)
427 		return;
428 	tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr;
429 	*tap = rds_g_t_info_ack_ipv4;
430 	tap->CURRENT_state = rds->rds_state;
431 	tap->OPT_size = 128;
432 	qreply(q, mp);
433 }
434 
435 /*
436  * NO locking protection here as sockfs will only send down
437  * one bind operation at a time.
438  */
439 static void
440 rds_bind(queue_t *q, mblk_t *mp)
441 {
442 	sin_t		*sin;
443 	rds_t *rds;
444 	struct T_bind_req *tbr;
445 	in_port_t	port;	/* Host byte order */
446 	in_port_t	requested_port; /* Host byte order */
447 	struct T_bind_ack *tba;
448 	int		count;
449 	rds_bf_t	*rdsbf;
450 	in_port_t	lport;	/* Network byte order */
451 
452 	rds = (rds_t *)q->q_ptr;
453 	if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) {
454 		rds_err_ack(q, mp, TPROTO, 0);
455 		return;
456 	}
457 
458 	/*
459 	 * We don't allow multiple binds
460 	 */
461 	if (rds->rds_state != TS_UNBND) {
462 		rds_err_ack(q, mp, TOUTSTATE, 0);
463 		return;
464 	}
465 
466 	tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr;
467 	switch (tbr->ADDR_length) {
468 	case sizeof (sin_t):    /* Complete IPv4 address */
469 		sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset,
470 		    sizeof (sin_t));
471 		if (sin == NULL || !OK_32PTR((char *)sin)) {
472 			rds_err_ack(q, mp, TSYSERR, EINVAL);
473 			return;
474 		}
475 		if (rds->rds_family != AF_INET_OFFLOAD ||
476 		    sin->sin_family != AF_INET_OFFLOAD) {
477 			rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
478 			return;
479 		}
480 		if (sin->sin_addr.s_addr == INADDR_ANY) {
481 			rds_err_ack(q, mp, TBADADDR, 0);
482 			return;
483 		}
484 
485 		/*
486 		 * verify that the address is hosted on IB
487 		 * only exception is the loopback address.
488 		 */
489 		if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) &&
490 		    !rds_verify_bind_address(sin->sin_addr.s_addr)) {
491 			rds_err_ack(q, mp, TBADADDR, 0);
492 			return;
493 		}
494 
495 		port = ntohs(sin->sin_port);
496 		break;
497 	default:	/* Invalid request */
498 		rds_err_ack(q, mp, TBADADDR, 0);
499 		return;
500 	}
501 
502 	requested_port = port;
503 
504 	/*
505 	 * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6
506 	 * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ
507 	 * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ
508 	 */
509 
510 	if (requested_port == 0) {
511 		/*
512 		 * If the application passed in zero for the port number, it
513 		 * doesn't care which port number we bind to. Get one in the
514 		 * valid range.
515 		 */
516 		port = rds_update_next_port(rds_next_port_to_try);
517 	}
518 
519 	ASSERT(port != 0);
520 	count = 0;
521 	for (;;) {
522 		rds_t		*rds1;
523 		ASSERT(sin->sin_addr.s_addr != INADDR_ANY);
524 		/*
525 		 * Walk through the list of rds streams bound to
526 		 * requested port with the same IP address.
527 		 */
528 		lport = htons(port);
529 		rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)];
530 		mutex_enter(&rdsbf->rds_bf_lock);
531 		for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL;
532 		    rds1 = rds1->rds_bind_hash) {
533 			if (lport != rds1->rds_port ||
534 			    rds1->rds_src != sin->sin_addr.s_addr ||
535 			    rds1->rds_zoneid != rds->rds_zoneid)
536 
537 				continue;
538 			break;
539 		}
540 
541 		if (rds1 == NULL) {
542 			/*
543 			 * No other stream has this IP address
544 			 * and port number. We can use it.
545 			 */
546 			break;
547 		}
548 		mutex_exit(&rdsbf->rds_bf_lock);
549 		if (requested_port != 0) {
550 			/*
551 			 * We get here only when requested port
552 			 * is bound (and only first  of the for()
553 			 * loop iteration).
554 			 *
555 			 * The semantics of this bind request
556 			 * require it to fail so we return from
557 			 * the routine (and exit the loop).
558 			 *
559 			 */
560 			rds_err_ack(q, mp, TADDRBUSY, 0);
561 			return;
562 		}
563 
564 		port = rds_update_next_port(port + 1);
565 
566 		if (++count >= loopmax) {
567 			/*
568 			 * We've tried every possible port number and
569 			 * there are none available, so send an error
570 			 * to the user.
571 			 */
572 			rds_err_ack(q, mp, TNOADDR, 0);
573 			return;
574 		}
575 	}
576 
577 	/*
578 	 * Copy the source address into our rds structure.
579 	 */
580 	rds->rds_src = sin->sin_addr.s_addr;
581 	rds->rds_port = lport;
582 
583 	/*
584 	 * reset the next port if we choose the port
585 	 */
586 	if (requested_port == 0) {
587 		rds_next_port_to_try = port + 1;
588 	}
589 
590 	rds->rds_state = TS_IDLE;
591 	rds_bind_hash_insert(rdsbf, rds);
592 	mutex_exit(&rdsbf->rds_bf_lock);
593 
594 	/* Reset the message type in preparation for shipping it back. */
595 	mp->b_datap->db_type = M_PCPROTO;
596 	tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr;
597 	tba->PRIM_type = T_BIND_ACK;
598 
599 	/* Increment the number of ports and set the port quota */
600 	RDS_INCR_NPORT();
601 	rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA();
602 	RDS_SET_PORT_QUOTA(rds->rds_port_quota);
603 	(void) mi_set_sth_hiwat(RD(q), rds->rds_port_quota * UserBufferSize);
604 
605 	qreply(q, mp);
606 }
607 
608 static void
609 rds_wput_other(queue_t *q, mblk_t *mp)
610 {
611 	rds_t *rds = (rds_t *)q->q_ptr;
612 	uchar_t *rptr = mp->b_rptr;
613 	struct datab *db;
614 	cred_t *cr;
615 
616 	cr = DB_CREDDEF(mp, rds->rds_cred);
617 	db = mp->b_datap;
618 	switch (db->db_type) {
619 	case M_DATA:
620 		/* Not connected */
621 		freemsg(mp);
622 		return;
623 	case M_PROTO:
624 	case M_PCPROTO:
625 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr <
626 		    sizeof (t_scalar_t)) {
627 			freemsg(mp);
628 			return;
629 		}
630 		switch (((union T_primitives *)(uintptr_t)rptr)->type) {
631 		case T_CAPABILITY_REQ:
632 			rds_capability_req(q, mp);
633 			return;
634 
635 		case T_INFO_REQ:
636 			rds_info_req(q, mp);
637 			return;
638 		case O_T_BIND_REQ:
639 		case T_BIND_REQ:
640 			rds_bind(q, mp);
641 			return;
642 		case T_SVR4_OPTMGMT_REQ:
643 			(void) svr4_optcom_req(q, mp, cr, &rds_opt_obj,
644 			    B_FALSE);
645 			return;
646 		case T_OPTMGMT_REQ:
647 			(void) tpi_optcom_req(q, mp, cr, &rds_opt_obj, B_FALSE);
648 			return;
649 		case T_CONN_REQ:
650 			/*
651 			 * We should not receive T_CONN_REQ as sockfs only
652 			 * sends down T_CONN_REQ if family == AF_INET/AF_INET6
653 			 * and type == SOCK_DGRAM/SOCK_RAW. For all others
654 			 * it simply calls soisconnected. see sotpi_connect()
655 			 * for details.
656 			 */
657 		/* FALLTHRU */
658 		default:
659 			cmn_err(CE_PANIC, "type %d \n",
660 			    ((union T_primitives *)(uintptr_t)rptr)->type);
661 		}
662 		break;
663 	case M_FLUSH:
664 		if (*rptr & FLUSHW)
665 			flushq(q, FLUSHDATA);
666 		break;
667 	case M_IOCTL:
668 		rds_ioctl(q, mp);
669 		break;
670 	case M_IOCDATA:
671 		/* IOCTL continuation following copyin or copyout. */
672 		if (mi_copy_state(q, mp, NULL) == -1) {
673 			/*
674 			 * The copy operation failed.  mi_copy_state already
675 			 * cleaned up, so we're out of here.
676 			 */
677 			return;
678 		}
679 		/*
680 		 * If we just completed a copy in, continue processing
681 		 * in rds_ioctl_copyin_done. If it was a copy out, we call
682 		 * mi_copyout again.  If there is nothing more to copy out,
683 		 * it will complete the IOCTL.
684 		 */
685 
686 		if (MI_COPY_DIRECTION(mp) == MI_COPY_IN)
687 			rds_ioctl_copyin_done(q, mp);
688 		else
689 			mi_copyout(q, mp);
690 		return;
691 
692 	default:
693 		cmn_err(CE_PANIC, "types %d \n", db->db_type);
694 	}
695 }
696 
697 static int
698 rds_wput(queue_t *q, mblk_t *mp)
699 {
700 	struct	datab	*db;
701 	uchar_t	*rptr = mp->b_rptr;
702 
703 	db = mp->b_datap;
704 	switch (db->db_type) {
705 	case M_PROTO:
706 	case M_PCPROTO:
707 		ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
708 		    (uintptr_t)INT_MAX);
709 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
710 		    sizeof (struct T_unitdata_req)) {
711 			if (((union T_primitives *)(uintptr_t)rptr)->type
712 			    == T_UNITDATA_REQ) {
713 				/*
714 				 *  We should never come here for T_UNITDATA_REQ
715 				 */
716 				cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n");
717 			}
718 		}
719 		/* FALLTHRU */
720 	default:
721 		rds_wput_other(q, mp);
722 		return (0);
723 	}
724 }
725 
726 static int
727 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop)
728 {
729 	uchar_t	*rptr = mp->b_rptr;
730 	rds_t	*rds;
731 	mblk_t	*mp1;
732 	sin_t	*sin;
733 	ipaddr_t dst;
734 	uint16_t port;
735 	int ret = 0;
736 
737 #define	tudr	((struct T_unitdata_req *)(uintptr_t)rptr)
738 
739 	rds = (rds_t *)q->q_ptr;
740 	/* Handle UNITDATA_REQ messages here */
741 	if (rds->rds_state == TS_UNBND) {
742 		/* If a port has not been bound to the stream, fail. */
743 		dprint(2, ("%s: socket is not bound to a port", LABEL));
744 		freemsg(mp);
745 		return (EPROTO);
746 	}
747 
748 	mp1 = mp->b_cont;
749 	mp->b_cont = NULL;
750 	if (mp1 == NULL) {
751 		dprint(2, ("%s: No message to send", LABEL));
752 		freemsg(mp);
753 		return (EPROTO);
754 	}
755 
756 	/*
757 	 * No options allowed
758 	 */
759 	if (tudr->OPT_length != 0) {
760 		ret = EINVAL;
761 		goto done;
762 	}
763 
764 	ASSERT(mp1->b_datap->db_ref == 1);
765 
766 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) >
767 	    mp->b_wptr) {
768 		ret = EDESTADDRREQ;
769 		goto done;
770 	}
771 
772 	sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset];
773 	if (!OK_32PTR((char *)sin) || tudr->DEST_length !=
774 	    sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) {
775 		ret = EDESTADDRREQ;
776 		goto done;
777 	}
778 	/* Extract port and ipaddr */
779 	port = sin->sin_port;
780 	dst = sin->sin_addr.s_addr;
781 
782 	if (port == 0 || dst == INADDR_ANY) {
783 		ret = EDESTADDRREQ;
784 		goto done;
785 	}
786 
787 	ASSERT(rds_transport_ops != NULL);
788 	ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst,
789 	    ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid);
790 	if (ret != 0) {
791 		if ((ret != ENOBUFS) && (ret != ENOMEM)) {
792 			/* ENOMEM is actually EWOULDBLOCK */
793 			dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret));
794 			goto done;
795 		}
796 	}
797 done:
798 	freemsg(mp1);
799 	freemsg(mp);
800 	return (ret);
801 }
802 
803 /*
804  * Make sure we dont return EINVAL and EWOULDBLOCK as it has
805  * special meanings for the synchronous streams (rwnext()).
806  * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg()
807  */
808 static int
809 rds_wrw(queue_t *q, struiod_t *dp)
810 {
811 	mblk_t  *mp = dp->d_mp;
812 	int error = 0;
813 	struct  datab   *db;
814 	uchar_t *rptr;
815 
816 	db = mp->b_datap;
817 	rptr = mp->b_rptr;
818 	switch (db->db_type) {
819 	case M_PROTO:
820 	case M_PCPROTO:
821 		ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
822 		    (uintptr_t)INT_MAX);
823 		if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
824 		    sizeof (struct T_unitdata_req)) {
825 			/* Detect valid T_UNITDATA_REQ here */
826 			if (((union T_primitives *)(uintptr_t)rptr)->type
827 			    == T_UNITDATA_REQ)
828 			break;
829 		}
830 		/* FALLTHRU */
831 	default:
832 
833 		if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
834 		/*
835 		 * Uio error of some sort, so just return the error.
836 		 */
837 			goto done;
838 		}
839 		dp->d_mp = 0;
840 		rds_wput_other(q, mp);
841 		return (0);
842 	}
843 
844 	dp->d_mp = 0;
845 	error = rds_wput_data(q, mp, &dp->d_uio);
846 done:
847 	if (error == EWOULDBLOCK || error == EINVAL)
848 		error = EIO;
849 
850 	return (error);
851 }
852 
853 static void
854 rds_rsrv(queue_t *q)
855 {
856 	rds_t	*rds = (rds_t *)q->q_ptr;
857 	ulong_t current_port_quota;
858 
859 	/* update the port quota to the current level */
860 	current_port_quota = RDS_GET_PORT_QUOTA();
861 	if (rds->rds_port_quota != current_port_quota) {
862 		rds->rds_port_quota = current_port_quota;
863 		(void) mi_set_sth_hiwat(q,
864 		    rds->rds_port_quota * UserBufferSize);
865 	}
866 
867 	/* No more messages in the q, unstall the socket */
868 	rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port));
869 }
870 
871 int
872 rds_close_transport_driver()
873 {
874 	ASSERT(rds_transport_ops != NULL);
875 
876 	rw_enter(&rds_transport_lock, RW_WRITER);
877 	if (rds_transport_handle != NULL) {
878 		rds_transport_ops->rds_transport_close_ib();
879 		(void) ldi_close(rds_transport_handle, FNDELAY, kcred);
880 		rds_transport_handle = NULL;
881 	}
882 	rw_exit(&rds_transport_lock);
883 
884 	return (0);
885 }
886 
887 
888 int
889 rds_open_transport_driver()
890 {
891 	int ret = 0;
892 
893 	rw_enter(&rds_transport_lock, RW_WRITER);
894 	if (rds_transport_handle != NULL) {
895 		/*
896 		 * Someone beat us to it.
897 		 */
898 		goto done;
899 	}
900 
901 	if (ibt_hw_is_present() == 0) {
902 		ret = ENODEV;
903 		goto done;
904 	}
905 
906 	if (rds_li == NULL) {
907 		ret = EPROTONOSUPPORT;
908 		goto done;
909 	}
910 
911 	ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib",
912 	    FREAD | FWRITE, kcred, &rds_transport_handle, rds_li);
913 	if (ret != 0) {
914 		ret = EPROTONOSUPPORT;
915 		rds_transport_handle = NULL;
916 		goto done;
917 	}
918 
919 	ret = rds_transport_ops->rds_transport_open_ib();
920 	if (ret != 0) {
921 		(void) ldi_close(rds_transport_handle, FNDELAY, kcred);
922 		rds_transport_handle = NULL;
923 	}
924 done:
925 	rw_exit(&rds_transport_lock);
926 	return (ret);
927 }
928 
/*
 * Module information shared by the read and write queues. Assuming the
 * usual module_info field order (confirm against <sys/stream.h>), the
 * values are: id number, name, minimum and maximum packet size, high and
 * low water mark.
 */
static struct module_info info = {
	0, "rds", 1, INFPSZ, 65536, 1024
};

/*
 * Read side: no put procedure, rds_rsrv() as the service procedure, plus
 * the open and close routines.
 */
static struct qinit rinit = {
	NULL, (pfi_t)rds_rsrv, rds_open, rds_close, NULL, &info
};

/*
 * Write side: rds_wput() as the put procedure, no service procedure, and
 * rds_wrw() with STRUIOT_STANDARD for synchronous-stream writes.
 */
static struct qinit winit = {
	(pfi_t)rds_wput, NULL, rds_open, rds_close, NULL, &info,
	NULL, rds_wrw, NULL, STRUIOT_STANDARD
};

/* The streamtab tying the two queue definitions together (RDS_STRTAB). */
struct streamtab rdsinfo = {
	&rinit, &winit, NULL, NULL
};
945 
/*
 * Device operations of the driver: attach/detach/getinfo are provided by
 * this file, the remaining entry points are nulldev, and the STREAMS
 * entry points come from RDS_STRTAB.
 */
DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach,
    nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB);

/*
 * Module linkage information for the kernel.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	RDS_DEVDESC,
	&rds_devops
};

/* Passed to mod_install(), mod_remove() and mod_info() below. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
963 
964 int
965 _init(void)
966 {
967 	int	ret;
968 
969 	rds_init();
970 
971 	ret = mod_install(&modlinkage);
972 	if (ret != 0)
973 		goto done;
974 	ret = ldi_ident_from_mod(&modlinkage, &rds_li);
975 	if (ret != 0)
976 		rds_li = NULL;
977 done:
978 	return (ret);
979 }
980 
981 int
982 _fini(void)
983 {
984 	int	ret;
985 
986 	ret = mod_remove(&modlinkage);
987 	if (ret != 0) {
988 		return (ret);
989 	}
990 
991 	rds_fini();
992 
993 	ldi_ident_release(rds_li);
994 	return (0);
995 }
996 
997 int
998 _info(struct modinfo *modinfop)
999 {
1000 	return (mod_info(&modlinkage, modinfop));
1001 }
1002