xref: /illumos-gate/usr/src/uts/common/io/idm/idm_so.c (revision cf8c0eba)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/stat.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 #include <sys/priv.h>
33 #include <sys/cpuvar.h>
34 #include <sys/socket.h>
35 #include <sys/strsubr.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 #include <netinet/tcp.h>
39 #include <inet/tcp.h>
40 #include <sys/socketvar.h>
41 #include <sys/pathname.h>
42 #include <sys/fs/snode.h>
43 #include <sys/fs/dv_node.h>
44 #include <sys/vnode.h>
45 #include <netinet/in.h>
46 #include <net/if.h>
47 #include <sys/sockio.h>
48 #include <sys/ksocket.h>
49 #include <sys/idm/idm.h>
50 #include <sys/idm/idm_so.h>
51 #include <sys/idm/idm_text.h>
52 
/*
 * Definition of the IPv6 wildcard ("any") address.  in6addr_any is
 * currently all zeroes, but use the macro in case this ever changes.
 * Within this file it is used as the bind address of the target
 * listener socket in idm_so_tgt_svc_online().
 */
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
58 
59 static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
60 static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
61 static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
62 
63 static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
64 static void idm_so_conn_destroy_common(idm_conn_t *ic);
65 static void idm_so_conn_connect_common(idm_conn_t *ic);
66 
67 static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
68 static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
69 static void idm_set_tgt_connect_options(ksocket_t so);
70 static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);
71 
72 static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
73 static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
74     idm_buf_t *idb, uint32_t offset, uint32_t length);
75 static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
76 static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
77     idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);
78 
79 static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
80     uint32_t ro, uint32_t dlength);
81 
82 static idm_status_t idm_so_handle_digest(idm_conn_t *it,
83     nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);
84 
85 /*
86  * Transport ops prototypes
87  */
88 static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
89 static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
90 static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
91 static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
92 static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
93 static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
94 static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
95 static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
96     nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
97 static void idm_so_notice_key_values(idm_conn_t *it,
98     nvlist_t *negotiated_nvl);
99 static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
100     idm_transport_caps_t *caps);
101 static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
102 static void idm_so_buf_free(idm_buf_t *idb);
103 static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
104 static void idm_so_buf_teardown(idm_buf_t *idb);
105 static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
106 static void idm_so_tgt_svc_destroy(idm_svc_t *is);
107 static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
108 static void idm_so_tgt_svc_offline(idm_svc_t *is);
109 static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
110 static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
111 static void idm_so_conn_disconnect(idm_conn_t *ic);
112 static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
113 static void idm_so_ini_conn_destroy(idm_conn_t *ic);
114 static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
115 
/*
 * IDM Native Sockets transport operations
 *
 * The initializer is positional, so the order of the entries below must
 * track the member order of idm_transport_ops_t; the trailing comment on
 * each entry names the slot being filled.  Slots set to NULL have no
 * sockets-specific handler in this table.  idm_so_conn_disconnect() is
 * shared by the target and initiator disconnect slots.  idm_so_init()
 * installs this table into the transport via it->it_ops.
 */
static
idm_transport_ops_t idm_so_transport_ops = {
	idm_so_tx,			/* it_tx_pdu */
	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
	idm_so_rx_datain,		/* it_rx_datain */
	idm_so_rx_rtt,			/* it_rx_rtt */
	idm_so_rx_dataout,		/* it_rx_dataout */
	NULL,				/* it_alloc_conn_rsrc */
	NULL,				/* it_free_conn_rsrc */
	NULL,				/* it_tgt_enable_datamover */
	NULL,				/* it_ini_enable_datamover */
	NULL,				/* it_conn_terminate */
	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
	idm_so_notice_key_values,	/* it_notice_key_values */
	idm_so_conn_is_capable,		/* it_conn_is_capable */
	idm_so_buf_alloc,		/* it_buf_alloc */
	idm_so_buf_free,		/* it_buf_free */
	idm_so_buf_setup,		/* it_buf_setup */
	idm_so_buf_teardown,		/* it_buf_teardown */
	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
	idm_so_ini_conn_create,		/* it_ini_conn_create */
	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
	idm_so_conn_disconnect		/* it_ini_conn_disconnect */
};
152 
153 /*
154  * idm_so_init()
155  * Sockets transport initialization
156  */
157 void
158 idm_so_init(idm_transport_t *it)
159 {
160 	/* Cache for IDM Data and R2T Transmit PDU's */
161 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
162 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
163 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
164 
165 	/* Cache for IDM Receive PDU's */
166 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
167 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
168 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
169 
170 	/* 128k buffer cache */
171 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
172 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
173 
174 	/* Set the sockets transport ops */
175 	it->it_ops = &idm_so_transport_ops;
176 }
177 
178 /*
179  * idm_so_fini()
180  * Sockets transport teardown
181  */
182 void
183 idm_so_fini(void)
184 {
185 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
186 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
187 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
188 }
189 
190 ksocket_t
191 idm_socreate(int domain, int type, int protocol)
192 {
193 	ksocket_t ks;
194 
195 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
196 	    CRED())) {
197 		return (ks);
198 	} else {
199 		return (NULL);
200 	}
201 }
202 
203 /*
204  * idm_soshutdown will disconnect the socket and prevent subsequent PDU
205  * reception and transmission.  The sonode still exists but its state
206  * gets modified to indicate it is no longer connected.  Calls to
207  * idm_sorecv/idm_iov_sorecv will return so idm_soshutdown can be used
208  * regain control of a thread stuck in idm_sorecv.
209  */
210 void
211 idm_soshutdown(ksocket_t so)
212 {
213 	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
214 }
215 
216 /*
217  * idm_sodestroy releases all resources associated with a socket previously
218  * created with idm_socreate.  The socket must be shutdown using
219  * idm_soshutdown before the socket is destroyed with idm_sodestroy,
220  * otherwise undefined behavior will result.
221  */
222 void
223 idm_sodestroy(ksocket_t ks)
224 {
225 	(void) ksocket_close(ks, CRED());
226 }
227 
228 /*
229  * IP address filter functions to flag addresses that should not
230  * go out to initiators through discovery.
231  */
232 static boolean_t
233 idm_v4_addr_okay(struct in_addr *in_addr)
234 {
235 	in_addr_t addr = ntohl(in_addr->s_addr);
236 
237 	if ((INADDR_NONE == addr) ||
238 	    (IN_MULTICAST(addr)) ||
239 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
240 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
241 		return (B_FALSE);
242 	}
243 	return (B_TRUE);
244 }
245 
246 static boolean_t
247 idm_v6_addr_okay(struct in6_addr *addr6)
248 {
249 
250 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
251 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
252 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
253 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
254 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
255 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
256 		return (B_FALSE);
257 	}
258 	return (B_TRUE);
259 }
260 
261 /*
262  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
263  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
264  */
265 int
266 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
267 {
268 	ksocket_t 		so4, so6;
269 	struct lifnum		lifn;
270 	struct lifconf		lifc;
271 	struct lifreq		*lp;
272 	int			rval;
273 	int			numifs;
274 	int			bufsize;
275 	void			*buf;
276 	int			i, j, n, rc;
277 	struct sockaddr_storage	ss;
278 	struct sockaddr_in	*sin;
279 	struct sockaddr_in6	*sin6;
280 	idm_addr_t		*ip;
281 	idm_addr_list_t		*ipaddr;
282 	int			size_ipaddr;
283 
284 	*ipaddr_p = NULL;
285 	size_ipaddr = 0;
286 	buf = NULL;
287 
288 	/* create an ipv4 and ipv6 UDP socket */
289 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
290 		return (0);
291 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
292 		idm_sodestroy(so6);
293 		return (0);
294 	}
295 
296 
297 retry_count:
298 	/* snapshot the current number of interfaces */
299 	lifn.lifn_family = PF_UNSPEC;
300 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
301 	lifn.lifn_count = 0;
302 	/* use vp6 for ioctls with unspecified families by default */
303 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
304 	    != 0) {
305 		goto cleanup;
306 	}
307 
308 	numifs = lifn.lifn_count;
309 	if (numifs <= 0) {
310 		goto cleanup;
311 	}
312 
313 	/* allocate extra room in case more interfaces appear */
314 	numifs += 10;
315 
316 	/* get the interface names and ip addresses */
317 	bufsize = numifs * sizeof (struct lifreq);
318 	buf = kmem_alloc(bufsize, KM_SLEEP);
319 
320 	lifc.lifc_family = AF_UNSPEC;
321 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
322 	lifc.lifc_len = bufsize;
323 	lifc.lifc_buf = buf;
324 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
325 	if (rc != 0) {
326 		goto cleanup;
327 	}
328 	/* if our extra room is used up, try again */
329 	if (bufsize <= lifc.lifc_len) {
330 		kmem_free(buf, bufsize);
331 		buf = NULL;
332 		goto retry_count;
333 	}
334 	/* calc actual number of ifconfs */
335 	n = lifc.lifc_len / sizeof (struct lifreq);
336 
337 	/* get ip address */
338 	if (n > 0) {
339 		size_ipaddr = sizeof (idm_addr_list_t) +
340 		    (n - 1) * sizeof (idm_addr_t);
341 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
342 	} else {
343 		goto cleanup;
344 	}
345 
346 	/*
347 	 * Examine the array of interfaces and filter uninteresting ones
348 	 */
349 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
350 
351 		/*
352 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
353 		 */
354 		ss = lp->lifr_addr;
355 		/*
356 		 * fetch the flags using the socket of the correct family
357 		 */
358 		switch (ss.ss_family) {
359 		case AF_INET:
360 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
361 			    &rval, CRED());
362 			break;
363 		case AF_INET6:
364 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
365 			    &rval, CRED());
366 			break;
367 		default:
368 			continue;
369 		}
370 		if (rc == 0) {
371 			/*
372 			 * If we got the flags, skip uninteresting
373 			 * interfaces based on flags
374 			 */
375 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
376 				continue;
377 			if (lp->lifr_flags &
378 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
379 				continue;
380 		}
381 
382 		/* save ip address */
383 		ip = &ipaddr->al_addrs[j];
384 		switch (ss.ss_family) {
385 		case AF_INET:
386 			sin = (struct sockaddr_in *)&ss;
387 			if (!idm_v4_addr_okay(&sin->sin_addr))
388 				continue;
389 			ip->a_addr.i_addr.in4 = sin->sin_addr;
390 			ip->a_addr.i_insize = sizeof (struct in_addr);
391 			break;
392 		case AF_INET6:
393 			sin6 = (struct sockaddr_in6 *)&ss;
394 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
395 				continue;
396 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
397 			ip->a_addr.i_insize = sizeof (struct in6_addr);
398 			break;
399 		default:
400 			continue;
401 		}
402 		j++;
403 	}
404 
405 	if (j == 0) {
406 		/* no valid ifaddr */
407 		kmem_free(ipaddr, size_ipaddr);
408 		size_ipaddr = 0;
409 		ipaddr = NULL;
410 	} else {
411 		ipaddr->al_out_cnt = j;
412 	}
413 
414 
415 cleanup:
416 	idm_sodestroy(so6);
417 	idm_sodestroy(so4);
418 
419 	if (buf != NULL)
420 		kmem_free(buf, bufsize);
421 
422 	*ipaddr_p = ipaddr;
423 	return (size_ipaddr);
424 }
425 
426 int
427 idm_sorecv(ksocket_t so, void *msg, size_t len)
428 {
429 	iovec_t iov;
430 
431 	ASSERT(so != NULL);
432 	ASSERT(len != 0);
433 
434 	/*
435 	 * Fill in iovec and receive data
436 	 */
437 	iov.iov_base = msg;
438 	iov.iov_len = len;
439 
440 	return (idm_iov_sorecv(so, &iov, 1, len));
441 }
442 
443 /*
444  * idm_sosendto - Sends a buffered data on a non-connected socket.
445  *
446  * This function puts the data provided on the wire by calling sosendmsg.
447  * It will return only when all the data has been sent or if an error
448  * occurs.
449  *
450  * Returns 0 for success, the socket errno value if sosendmsg fails, and
451  * -1 if sosendmsg returns success but uio_resid != 0
452  */
453 int
454 idm_sosendto(ksocket_t so, void *buff, size_t len,
455     struct sockaddr *name, socklen_t namelen)
456 {
457 	struct msghdr		msg;
458 	struct iovec		iov[1];
459 	int			error;
460 	size_t			sent = 0;
461 
462 	iov[0].iov_base	= buff;
463 	iov[0].iov_len	= len;
464 
465 	/* Initialization of the message header. */
466 	bzero(&msg, sizeof (msg));
467 	msg.msg_iov	= iov;
468 	msg.msg_iovlen	= 1;
469 	msg.msg_name	= name;
470 	msg.msg_namelen	= namelen;
471 
472 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
473 		/* Data sent */
474 		if (sent == len) {
475 			/* All data sent.  Success. */
476 			return (0);
477 		} else {
478 			/* Not all data was sent.  Failure */
479 			return (-1);
480 		}
481 	}
482 
483 	/* Send failed */
484 	return (error);
485 }
486 
487 /*
488  * idm_iov_sosend - Sends an iovec on a connection.
489  *
490  * This function puts the data provided on the wire by calling sosendmsg.
491  * It will return only when all the data has been sent or if an error
492  * occurs.
493  *
494  * Returns 0 for success, the socket errno value if sosendmsg fails, and
495  * -1 if sosendmsg returns success but uio_resid != 0
496  */
497 int
498 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
499 {
500 	struct msghdr		msg;
501 	int			error;
502 	size_t 			sent = 0;
503 
504 	ASSERT(iop != NULL);
505 
506 	/* Initialization of the message header. */
507 	bzero(&msg, sizeof (msg));
508 	msg.msg_iov	= iop;
509 	msg.msg_iovlen	= iovlen;
510 
511 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
512 	    == 0) {
513 		/* Data sent */
514 		if (sent == total_len) {
515 			/* All data sent.  Success. */
516 			return (0);
517 		} else {
518 			/* Not all data was sent.  Failure */
519 			return (-1);
520 		}
521 	}
522 
523 	/* Send failed */
524 	return (error);
525 }
526 
527 /*
528  * idm_iov_sorecv - Receives an iovec from a connection
529  *
530  * This function gets the data asked for from the socket.  It will return
531  * only when all the requested data has been retrieved or if an error
532  * occurs.
533  *
534  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
535  * -1 if sorecvmsg returns success but uio_resid != 0
536  */
537 int
538 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
539 {
540 	struct msghdr		msg;
541 	int			error;
542 	size_t			recv;
543 	int 			flags;
544 
545 	ASSERT(iop != NULL);
546 
547 	/* Initialization of the message header. */
548 	bzero(&msg, sizeof (msg));
549 	msg.msg_iov	= iop;
550 	msg.msg_iovlen	= iovlen;
551 	flags		= MSG_WAITALL;
552 
553 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
554 	    == 0) {
555 		/* Received data */
556 		if (recv == total_len) {
557 			/* All requested data received.  Success */
558 			return (0);
559 		} else {
560 			/*
561 			 * Not all data was received.  The connection has
562 			 * probably failed.
563 			 */
564 			return (-1);
565 		}
566 	}
567 
568 	/* Receive failed */
569 	return (error);
570 }
571 
572 static void
573 idm_set_ini_preconnect_options(idm_so_conn_t *sc)
574 {
575 	int	conn_abort = 10000;
576 	int	conn_notify = 2000;
577 	int	abort = 30000;
578 
579 	/* Pre-connect socket options */
580 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
581 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
582 	    CRED());
583 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
584 	    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
585 	    CRED());
586 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
587 	    (char *)&abort, sizeof (int), CRED());
588 }
589 
590 static void
591 idm_set_ini_postconnect_options(idm_so_conn_t *sc)
592 {
593 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
594 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
595 	const int	on = 1;
596 
597 	/* Set postconnect options */
598 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
599 	    (char *)&on, sizeof (int), CRED());
600 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
601 	    (char *)&rcvbuf, sizeof (int), CRED());
602 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
603 	    (char *)&sndbuf, sizeof (int), CRED());
604 }
605 
606 static void
607 idm_set_tgt_connect_options(ksocket_t ks)
608 {
609 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
610 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
611 	const int	on = 1;
612 
613 	/* Set connect options */
614 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
615 	    (char *)&rcvbuf, sizeof (int), CRED());
616 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
617 	    (char *)&sndbuf, sizeof (int), CRED());
618 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
619 	    (char *)&on, sizeof (on), CRED());
620 }
621 
/*
 * n2h24()
 *
 * Assemble a 24-bit value from the three bytes at ptr, most significant
 * byte first, and return it in host order.
 */
static uint32_t
n2h24(const uchar_t *ptr)
{
	int	high = ptr[0] << 16;
	int	mid = ptr[1] << 8;
	int	low = ptr[2];

	return (high | mid | low);
}
627 
628 
629 static idm_status_t
630 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
631 {
632 	iscsi_hdr_t	*bhs;
633 	uint32_t	hdr_digest_crc;
634 	uint32_t	crc_calculated;
635 	void		*new_hdr;
636 	int		ahslen = 0;
637 	int		total_len = 0;
638 	int		iovlen = 0;
639 	struct iovec	iov[2];
640 	idm_so_conn_t	*so_conn;
641 	int		rc;
642 
643 	so_conn = ic->ic_transport_private;
644 
645 	/*
646 	 * Read BHS
647 	 */
648 	bhs = pdu->isp_hdr;
649 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
650 	if (rc != IDM_STATUS_SUCCESS) {
651 		return (IDM_STATUS_FAIL);
652 	}
653 
654 	/*
655 	 * Check actual AHS length against the amount available in the buffer
656 	 */
657 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
658 	    (bhs->hlength * sizeof (uint32_t));
659 	pdu->isp_datalen = n2h24(bhs->dlength);
660 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
661 		/* Allocate a new header segment and change the callback */
662 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
663 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
664 		pdu->isp_hdr = new_hdr;
665 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
666 
667 		/*
668 		 * This callback will restore the expected values after
669 		 * the RX PDU has been processed.
670 		 */
671 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
672 	}
673 
674 	/*
675 	 * Setup receipt of additional header and header digest (if enabled).
676 	 */
677 	if (bhs->hlength > 0) {
678 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
679 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
680 		iov[iovlen].iov_len = ahslen;
681 		total_len += iov[iovlen].iov_len;
682 		iovlen++;
683 	}
684 
685 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
686 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
687 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
688 		total_len += iov[iovlen].iov_len;
689 		iovlen++;
690 	}
691 
692 	if ((iovlen != 0) &&
693 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
694 	    total_len) != 0)) {
695 		return (IDM_STATUS_FAIL);
696 	}
697 
698 	/*
699 	 * Validate header digest if enabled
700 	 */
701 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
702 		crc_calculated = idm_crc32c(pdu->isp_hdr,
703 		    sizeof (iscsi_hdr_t) + ahslen);
704 		if (crc_calculated != hdr_digest_crc) {
705 			/* Invalid Header Digest */
706 			return (IDM_STATUS_HEADER_DIGEST);
707 		}
708 	}
709 
710 	return (0);
711 }
712 
713 /*
714  * idm_so_ini_conn_create()
715  * Allocate the sockets transport connection resources.
716  */
717 static idm_status_t
718 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
719 {
720 	ksocket_t	so;
721 	idm_so_conn_t	*so_conn;
722 	idm_status_t	idmrc;
723 
724 	so = idm_socreate(cr->cr_domain, cr->cr_type,
725 	    cr->cr_protocol);
726 	if (so == NULL) {
727 		return (IDM_STATUS_FAIL);
728 	}
729 
730 	/* Bind the socket if configured to do so */
731 	if (cr->cr_bound) {
732 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
733 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
734 			idm_sodestroy(so);
735 			return (IDM_STATUS_FAIL);
736 		}
737 	}
738 
739 	idmrc = idm_so_conn_create_common(ic, so);
740 	if (idmrc != IDM_STATUS_SUCCESS) {
741 		idm_soshutdown(so);
742 		idm_sodestroy(so);
743 		return (IDM_STATUS_FAIL);
744 	}
745 
746 	so_conn = ic->ic_transport_private;
747 	/* Set up socket options */
748 	idm_set_ini_preconnect_options(so_conn);
749 
750 	return (IDM_STATUS_SUCCESS);
751 }
752 
753 /*
754  * idm_so_ini_conn_destroy()
755  * Tear down the sockets transport connection resources.
756  */
757 static void
758 idm_so_ini_conn_destroy(idm_conn_t *ic)
759 {
760 	idm_so_conn_destroy_common(ic);
761 }
762 
763 /*
764  * idm_so_ini_conn_connect()
765  * Establish the connection referred to by the handle previously allocated via
766  * idm_so_ini_conn_create().
767  */
768 static idm_status_t
769 idm_so_ini_conn_connect(idm_conn_t *ic)
770 {
771 	idm_so_conn_t	*so_conn;
772 
773 	so_conn = ic->ic_transport_private;
774 
775 	if (ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
776 	    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED()) != 0) {
777 		idm_soshutdown(so_conn->ic_so);
778 		return (IDM_STATUS_FAIL);
779 	}
780 
781 	idm_so_conn_connect_common(ic);
782 
783 	idm_set_ini_postconnect_options(so_conn);
784 
785 	return (IDM_STATUS_SUCCESS);
786 }
787 
788 idm_status_t
789 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
790 {
791 	idm_status_t	idmrc;
792 
793 	idmrc = idm_so_conn_create_common(ic, new_so);
794 
795 	return (idmrc);
796 }
797 
798 static void
799 idm_so_tgt_conn_destroy(idm_conn_t *ic)
800 {
801 	idm_so_conn_destroy_common(ic);
802 }
803 
804 /*
805  * idm_so_tgt_conn_connect()
806  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
807  * is invoked from the SM as a result of an inbound connection request.
808  */
809 static idm_status_t
810 idm_so_tgt_conn_connect(idm_conn_t *ic)
811 {
812 	idm_so_conn_connect_common(ic);
813 
814 	return (IDM_STATUS_SUCCESS);
815 }
816 
817 static idm_status_t
818 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
819 {
820 	idm_so_conn_t	*so_conn;
821 
822 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
823 	so_conn->ic_so = new_so;
824 
825 	ic->ic_transport_private = so_conn;
826 	ic->ic_transport_hdrlen = 0;
827 
828 	/* Set the scoreboarding flag on this connection */
829 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
830 
831 	/*
832 	 * Initialize tx thread mutex and list
833 	 */
834 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
835 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
836 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
837 	    offsetof(idm_pdu_t, idm_tx_link));
838 
839 	return (IDM_STATUS_SUCCESS);
840 }
841 
842 static void
843 idm_so_conn_destroy_common(idm_conn_t *ic)
844 {
845 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
846 
847 	ic->ic_transport_private = NULL;
848 	idm_sodestroy(so_conn->ic_so);
849 	list_destroy(&so_conn->ic_tx_list);
850 	mutex_destroy(&so_conn->ic_tx_mutex);
851 	cv_destroy(&so_conn->ic_tx_cv);
852 
853 	kmem_free(so_conn, sizeof (idm_so_conn_t));
854 }
855 
856 static void
857 idm_so_conn_connect_common(idm_conn_t *ic)
858 {
859 	idm_so_conn_t	*so_conn;
860 	struct sockaddr_in6	t_addr;
861 	socklen_t	t_addrlen = 0;
862 
863 	so_conn = ic->ic_transport_private;
864 	bzero(&t_addr, sizeof (struct sockaddr_in6));
865 	t_addrlen = sizeof (struct sockaddr_in6);
866 
867 	/* Set the local and remote addresses in the idm conn handle */
868 	ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
869 	    &t_addrlen, CRED());
870 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
871 	ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
872 	    &t_addrlen, CRED());
873 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
874 
875 	mutex_enter(&ic->ic_mutex);
876 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
877 	    &p0, TS_RUN, minclsyspri);
878 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
879 	    &p0, TS_RUN, minclsyspri);
880 
881 	while (!so_conn->ic_rx_thread_running || !so_conn->ic_tx_thread_running)
882 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
883 	mutex_exit(&ic->ic_mutex);
884 }
885 
886 /*
887  * idm_so_conn_disconnect()
888  * Shutdown the socket connection and stop the thread
889  */
890 static void
891 idm_so_conn_disconnect(idm_conn_t *ic)
892 {
893 	idm_so_conn_t	*so_conn;
894 
895 	so_conn = ic->ic_transport_private;
896 
897 	mutex_enter(&ic->ic_mutex);
898 	so_conn->ic_rx_thread_running = B_FALSE;
899 	so_conn->ic_tx_thread_running = B_FALSE;
900 	/* We need to wakeup the TX thread */
901 	mutex_enter(&so_conn->ic_tx_mutex);
902 	cv_signal(&so_conn->ic_tx_cv);
903 	mutex_exit(&so_conn->ic_tx_mutex);
904 	mutex_exit(&ic->ic_mutex);
905 
906 	/* This should wakeup the RX thread if it is sleeping */
907 	idm_soshutdown(so_conn->ic_so);
908 
909 	thread_join(so_conn->ic_tx_thread_did);
910 	thread_join(so_conn->ic_rx_thread_did);
911 }
912 
913 /*
914  * idm_so_tgt_svc_create()
915  * Establish a service on an IP address and port.  idm_svc_req_t contains
916  * the service parameters.
917  */
918 /*ARGSUSED*/
919 static idm_status_t
920 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
921 {
922 	idm_so_svc_t		*so_svc;
923 
924 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
925 
926 	/* Set the new sockets service in svc handle */
927 	is->is_so_svc = (void *)so_svc;
928 
929 	return (IDM_STATUS_SUCCESS);
930 }
931 
932 /*
933  * idm_so_tgt_svc_destroy()
934  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
935  */
936 static void
937 idm_so_tgt_svc_destroy(idm_svc_t *is)
938 {
939 	/* the socket will have been torn down; free the service */
940 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
941 }
942 
943 /*
944  * idm_so_tgt_svc_online()
945  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
946  */
947 
948 static idm_status_t
949 idm_so_tgt_svc_online(idm_svc_t *is)
950 {
951 	idm_so_svc_t		*so_svc;
952 	idm_svc_req_t		*sr = &is->is_svc_req;
953 	struct sockaddr_in6	sin6_ip;
954 	const uint32_t		on = 1;
955 	const uint32_t		off = 0;
956 
957 	mutex_enter(&is->is_mutex);
958 	so_svc = (idm_so_svc_t *)is->is_so_svc;
959 
960 	/*
961 	 * Try creating an IPv6 socket first
962 	 */
963 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
964 		mutex_exit(&is->is_mutex);
965 		return (IDM_STATUS_FAIL);
966 	} else {
967 		bzero(&sin6_ip, sizeof (sin6_ip));
968 		sin6_ip.sin6_family = AF_INET6;
969 		sin6_ip.sin6_port = htons(sr->sr_port);
970 		sin6_ip.sin6_addr = in6addr_any;
971 
972 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
973 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
974 		/*
975 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
976 		 */
977 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
978 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
979 
980 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
981 		    sizeof (sin6_ip), CRED()) != 0) {
982 			mutex_exit(&is->is_mutex);
983 			idm_sodestroy(so_svc->is_so);
984 			return (IDM_STATUS_FAIL);
985 		}
986 	}
987 
988 	idm_set_tgt_connect_options(so_svc->is_so);
989 
990 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
991 		mutex_exit(&is->is_mutex);
992 		idm_soshutdown(so_svc->is_so);
993 		idm_sodestroy(so_svc->is_so);
994 		return (IDM_STATUS_FAIL);
995 	}
996 
997 	/* Launch a watch thread */
998 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
999 	    is, 0, &p0, TS_RUN, minclsyspri);
1000 
1001 	if (so_svc->is_thread == NULL) {
1002 		/* Failure to launch; teardown the socket */
1003 		mutex_exit(&is->is_mutex);
1004 		idm_soshutdown(so_svc->is_so);
1005 		idm_sodestroy(so_svc->is_so);
1006 		return (IDM_STATUS_FAIL);
1007 	}
1008 	ksocket_hold(so_svc->is_so);
1009 	/* Wait for the port watcher thread to start */
1010 	while (!so_svc->is_thread_running)
1011 		cv_wait(&is->is_cv, &is->is_mutex);
1012 	mutex_exit(&is->is_mutex);
1013 
1014 	return (IDM_STATUS_SUCCESS);
1015 }
1016 
1017 /*
1018  * idm_so_tgt_svc_offline
1019  *
1020  * Stop listening on the IP address and port identified by idm_svc_t.
1021  */
1022 static void
1023 idm_so_tgt_svc_offline(idm_svc_t *is)
1024 {
1025 	idm_so_svc_t		*so_svc;
1026 	mutex_enter(&is->is_mutex);
1027 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1028 	so_svc->is_thread_running = B_FALSE;
1029 	mutex_exit(&is->is_mutex);
1030 
1031 	/*
1032 	 * Teardown socket
1033 	 */
1034 	idm_sodestroy(so_svc->is_so);
1035 
1036 	/*
1037 	 * Now we expect the port watcher thread to terminate
1038 	 */
1039 	thread_join(so_svc->is_thread_did);
1040 }
1041 
1042 /*
1043  * Watch thread for target service connection establishment.
1044  */
1045 void
1046 idm_so_svc_port_watcher(void *arg)
1047 {
1048 	idm_svc_t		*svc = arg;
1049 	ksocket_t		new_so;
1050 	idm_conn_t		*ic;
1051 	idm_status_t		idmrc;
1052 	idm_so_svc_t		*so_svc;
1053 	int			rc;
1054 	const uint32_t		off = 0;
1055 	struct sockaddr_in6 	t_addr;
1056 	socklen_t		t_addrlen;
1057 
1058 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1059 	t_addrlen = sizeof (struct sockaddr_in6);
1060 	mutex_enter(&svc->is_mutex);
1061 
1062 	so_svc = svc->is_so_svc;
1063 	so_svc->is_thread_running = B_TRUE;
1064 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1065 
1066 	cv_signal(&svc->is_cv);
1067 
1068 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1069 	    svc->is_svc_req.sr_port);
1070 
1071 	while (so_svc->is_thread_running) {
1072 		mutex_exit(&svc->is_mutex);
1073 
1074 		if ((rc = ksocket_accept(so_svc->is_so,
1075 		    (struct sockaddr *)&t_addr, &t_addrlen,
1076 		    &new_so, CRED())) != 0) {
1077 			mutex_enter(&svc->is_mutex);
1078 			if (rc == ECONNABORTED)
1079 				continue;
1080 			/* Connection problem */
1081 			break;
1082 		}
1083 		/*
1084 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1085 		 */
1086 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1087 		    (char *)&off, sizeof (off), CRED());
1088 
1089 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1090 		    &ic);
1091 		if (idmrc != IDM_STATUS_SUCCESS) {
1092 			/* Drop connection */
1093 			idm_soshutdown(new_so);
1094 			idm_sodestroy(new_so);
1095 			mutex_enter(&svc->is_mutex);
1096 			continue;
1097 		}
1098 
1099 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1100 		if (idmrc != IDM_STATUS_SUCCESS) {
1101 			idm_svc_conn_destroy(ic);
1102 			idm_soshutdown(new_so);
1103 			idm_sodestroy(new_so);
1104 			mutex_enter(&svc->is_mutex);
1105 			continue;
1106 		}
1107 
1108 		/*
1109 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1110 		 * will notify the client (target) about the new connection.
1111 		 */
1112 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1113 
1114 		mutex_enter(&svc->is_mutex);
1115 	}
1116 	ksocket_rele(so_svc->is_so);
1117 	so_svc->is_thread_running = B_FALSE;
1118 	mutex_exit(&svc->is_mutex);
1119 
1120 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1121 	    svc->is_svc_req.sr_port);
1122 
1123 	thread_exit();
1124 }
1125 
1126 /*
1127  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1128  * frees resources associated with the task.
1129  *
1130  * It's not clear that this should return idm_status_t.  What do we do
1131  * if it fails?
1132  */
1133 static idm_status_t
1134 idm_so_free_task_rsrc(idm_task_t *idt)
1135 {
1136 	idm_buf_t	*idb;
1137 
1138 	/*
1139 	 * There is nothing to cleanup on initiator connections
1140 	 */
1141 	if (IDM_CONN_ISINI(idt->idt_ic))
1142 		return (IDM_STATUS_SUCCESS);
1143 
1144 	/*
1145 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1146 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1147 	 *
1148 	 * In addition, remove any buffers associated with this task from
1149 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1150 	 * items don't actually get removed from that list (and completion
1151 	 * routines called) until idm_task_cleanup.
1152 	 */
1153 	mutex_enter(&idt->idt_mutex);
1154 
1155 	for (idb = list_head(&idt->idt_outbufv); idb != NULL;
1156 	    idb = list_next(&idt->idt_outbufv, idb)) {
1157 		if (idb->idb_in_transport) {
1158 			/*
1159 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1160 			 */
1161 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1162 			mutex_enter(&idt->idt_mutex);
1163 		}
1164 	}
1165 
1166 	for (idb = list_head(&idt->idt_inbufv); idb != NULL;
1167 	    idb = list_next(&idt->idt_inbufv, idb)) {
1168 		/*
1169 		 * We want to remove these items from the tx_list as well,
1170 		 * but knowing it's in the idt_inbufv list is not a guarantee
1171 		 * that it's in the tx_list.  If it's on the tx list then
1172 		 * let idm_sotx_thread() clean it up.
1173 		 */
1174 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1175 			/*
1176 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1177 			 */
1178 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1179 			mutex_enter(&idt->idt_mutex);
1180 		}
1181 	}
1182 
1183 	mutex_exit(&idt->idt_mutex);
1184 
1185 	return (IDM_STATUS_SUCCESS);
1186 }
1187 
1188 /*
1189  * idm_so_negotiate_key_values() validates the key values for this connection
1190  */
1191 /* ARGSUSED */
1192 static kv_status_t
1193 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1194     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1195 {
1196 	/* All parameters are negotiated at the iscsit level */
1197 	return (KV_HANDLED);
1198 }
1199 
1200 /*
1201  * idm_so_notice_key_values() activates the negotiated key values for
1202  * this connection.
1203  */
1204 static void
1205 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1206 {
1207 	char			*nvp_name;
1208 	nvpair_t		*nvp;
1209 	nvpair_t		*next_nvp;
1210 	int			nvrc;
1211 	idm_status_t		idm_status;
1212 	const idm_kv_xlate_t	*ikvx;
1213 
1214 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1215 	    nvp != NULL; nvp = next_nvp) {
1216 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1217 		nvp_name = nvpair_name(nvp);
1218 
1219 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1220 		switch (ikvx->ik_key_id) {
1221 		case KI_HEADER_DIGEST:
1222 		case KI_DATA_DIGEST:
1223 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1224 			ASSERT(idm_status == 0);
1225 
1226 			/* Remove processed item from negotiated_nvl list */
1227 			nvrc = nvlist_remove_all(
1228 			    negotiated_nvl, ikvx->ik_key_name);
1229 			ASSERT(nvrc == 0);
1230 			break;
1231 		default:
1232 			break;
1233 		}
1234 	}
1235 }
1236 
1237 
1238 static idm_status_t
1239 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1240     const idm_kv_xlate_t *ikvx)
1241 {
1242 	int			nvrc;
1243 	char			*digest_choice_string;
1244 
1245 	nvrc = nvpair_value_string(digest_choice,
1246 	    &digest_choice_string);
1247 	ASSERT(nvrc == 0);
1248 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1249 		switch (ikvx->ik_key_id) {
1250 		case KI_HEADER_DIGEST:
1251 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1252 			break;
1253 		case KI_DATA_DIGEST:
1254 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1255 			break;
1256 		default:
1257 			ASSERT(0);
1258 			break;
1259 		}
1260 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1261 		switch (ikvx->ik_key_id) {
1262 		case KI_HEADER_DIGEST:
1263 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1264 			break;
1265 		case KI_DATA_DIGEST:
1266 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1267 			break;
1268 		default:
1269 			ASSERT(0);
1270 			break;
1271 		}
1272 	} else {
1273 		ASSERT(0);
1274 	}
1275 
1276 	return (IDM_STATUS_SUCCESS);
1277 }
1278 
1279 
1280 /*
1281  * idm_so_conn_is_capable() verifies that the passed connection is provided
1282  * for by the sockets interface.
1283  */
1284 /* ARGSUSED */
1285 static boolean_t
1286 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1287 {
1288 	return (B_TRUE);
1289 }
1290 
1291 /*
1292  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1293  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1294  * off the socket into the appropriate buffers.
1295  */
1296 static void
1297 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1298 {
1299 	iscsi_data_hdr_t	*bhs;
1300 	idm_task_t		*idt;
1301 	idm_buf_t		*idb;
1302 	uint32_t		datasn;
1303 	size_t			offset;
1304 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1305 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1306 
1307 	ASSERT(ic != NULL);
1308 	ASSERT(pdu != NULL);
1309 
1310 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1311 	datasn	= ntohl(bhs->datasn);
1312 	offset	= ntohl(bhs->offset);
1313 
1314 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA_RSP);
1315 
1316 	/*
1317 	 * Look up the task corresponding to the initiator task tag
1318 	 * to get the buffers affiliated with the task.
1319 	 */
1320 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1321 	if (idt == NULL) {
1322 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1323 		idm_pdu_rx_protocol_error(ic, pdu);
1324 		return;
1325 	}
1326 
1327 	idb = pdu->isp_sorx_buf;
1328 	if (idb == NULL) {
1329 		IDM_CONN_LOG(CE_WARN,
1330 		    "idm_so_rx_datain: failed to find buffer");
1331 		idm_task_rele(idt);
1332 		idm_pdu_rx_protocol_error(ic, pdu);
1333 		return;
1334 	}
1335 
1336 	/*
1337 	 * DataSN values should be sequential and should not have any gaps or
1338 	 * repetitions. Check the DataSN with the one stored in the task.
1339 	 */
1340 	if (datasn == idt->idt_exp_datasn) {
1341 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1342 	} else {
1343 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1344 		idm_task_rele(idt);
1345 		idm_pdu_rx_protocol_error(ic, pdu);
1346 		return;
1347 	}
1348 
1349 	/*
1350 	 * PDUs in a sequence should be in continuously increasing
1351 	 * address offset
1352 	 */
1353 	if (offset != idb->idb_exp_offset) {
1354 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1355 		idm_task_rele(idt);
1356 		idm_pdu_rx_protocol_error(ic, pdu);
1357 		return;
1358 	}
1359 	/* Expected next relative buffer offset */
1360 	idb->idb_exp_offset += n2h24(bhs->dlength);
1361 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1362 
1363 	idm_task_rele(idt);
1364 
1365 	/*
1366 	 * For now call scsi_rsp which will process the data rsp
1367 	 * Revisit, need to provide an explicit client entry point for
1368 	 * phase collapse completions.
1369 	 */
1370 	if (((ihp->opcode & ISCSI_OPCODE_MASK) == ISCSI_OP_SCSI_DATA_RSP) &&
1371 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1372 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1373 	}
1374 
1375 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1376 }
1377 
1378 /*
1379  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1380  * data from the Data-Out PDU sent by the iSCSI initiator.
1381  *
1382  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1383  * task to get the buffers associated with the PDU. A PDU might span buffers.
1384  * The data is then read into the respective buffer.
1385  */
1386 static void
1387 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1388 {
1389 
1390 	iscsi_data_hdr_t	*bhs;
1391 	idm_task_t		*idt;
1392 	idm_buf_t		*idb;
1393 	size_t			offset;
1394 
1395 	ASSERT(ic != NULL);
1396 	ASSERT(pdu != NULL);
1397 
1398 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1399 	offset = ntohl(bhs->offset);
1400 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA);
1401 
1402 	/*
1403 	 * Look up the task corresponding to the initiator task tag
1404 	 * to get the buffers affiliated with the task.
1405 	 */
1406 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1407 	if (idt == NULL) {
1408 		IDM_CONN_LOG(CE_WARN,
1409 		    "idm_so_rx_dataout: failed to find task");
1410 		idm_pdu_rx_protocol_error(ic, pdu);
1411 		return;
1412 	}
1413 
1414 	idb = pdu->isp_sorx_buf;
1415 	if (idb == NULL) {
1416 		IDM_CONN_LOG(CE_WARN,
1417 		    "idm_so_rx_dataout: failed to find buffer");
1418 		idm_task_rele(idt);
1419 		idm_pdu_rx_protocol_error(ic, pdu);
1420 		return;
1421 	}
1422 
1423 	/* Keep track of data transferred - check data offsets */
1424 	if (offset != idb->idb_exp_offset) {
1425 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1426 		    "%ld, %d", offset, idb->idb_exp_offset);
1427 		idm_task_rele(idt);
1428 		idm_pdu_rx_protocol_error(ic, pdu);
1429 		return;
1430 	}
1431 	/* Expected next relative offset */
1432 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1433 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1434 
1435 	/*
1436 	 * Call the buffer callback when the transfer is complete
1437 	 *
1438 	 * The connection state machine should only abort tasks after
1439 	 * shutting down the connection so we are assured that there
1440 	 * won't be a simultaneous attempt to abort this task at the
1441 	 * same time as we are processing this PDU (due to a connection
1442 	 * state change).
1443 	 */
1444 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1445 		/*
1446 		 * We only want to call idm_buf_rx_from_ini_done once
1447 		 * per transfer.  It's possible that this task has
1448 		 * already been aborted in which case
1449 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1450 		 * for each buffer with idb_in_transport==B_TRUE.  To
1451 		 * close this window and ensure that this doesn't happen,
1452 		 * we'll clear idb->idb_in_transport now while holding
1453 		 * the task mutex.   This is only really an issue for
1454 		 * SCSI task abort -- if tasks were being aborted because
1455 		 * of a connection state change the state machine would
1456 		 * have already stopped the receive thread.
1457 		 */
1458 		mutex_enter(&idt->idt_mutex);
1459 
1460 		/*
1461 		 * Release the task hold here (obtained in idm_task_find)
1462 		 * because the task may complete synchronously during
1463 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1464 		 * buffer we know there is at least one additional hold on idt.
1465 		 */
1466 		idm_task_rele(idt);
1467 
1468 		/*
1469 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1470 		 */
1471 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1472 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1473 		return;
1474 	}
1475 
1476 	idm_task_rele(idt);
1477 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1478 }
1479 
1480 /*
1481  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1482  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1483  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1484  * and looks up the task in the task tree using the itt to get the output
1485  * buffers associated the task. The R2T PDU contains the offset of the
1486  * requested data and the data length. This function then constructs a
1487  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1488  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1489  */
1490 
1491 static void
1492 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1493 {
1494 	idm_task_t		*idt;
1495 	idm_buf_t		*idb;
1496 	iscsi_rtt_hdr_t		*rtt_hdr;
1497 	uint32_t		data_offset;
1498 	uint32_t		data_length;
1499 
1500 	ASSERT(ic != NULL);
1501 	ASSERT(pdu != NULL);
1502 
1503 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1504 	data_offset = ntohl(rtt_hdr->data_offset);
1505 	data_length = ntohl(rtt_hdr->data_length);
1506 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1507 
1508 	if (idt == NULL) {
1509 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1510 		idm_pdu_rx_protocol_error(ic, pdu);
1511 		return;
1512 	}
1513 
1514 	/* Find the buffer bound to the task by the iSCSI initiator */
1515 	mutex_enter(&idt->idt_mutex);
1516 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1517 	if (idb == NULL) {
1518 		mutex_exit(&idt->idt_mutex);
1519 		idm_task_rele(idt);
1520 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1521 		idm_pdu_rx_protocol_error(ic, pdu);
1522 		return;
1523 	}
1524 
1525 	/* return buffer contains this data */
1526 	if (data_offset + data_length > idb->idb_buflen) {
1527 		/* Overflow */
1528 		mutex_exit(&idt->idt_mutex);
1529 		idm_task_rele(idt);
1530 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1531 		    "buffer");
1532 		idm_pdu_rx_protocol_error(ic, pdu);
1533 		return;
1534 	}
1535 
1536 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1537 	idt->idt_exp_datasn = 0;
1538 
1539 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1540 	    ntohl(rtt_hdr->data_length));
1541 	mutex_exit(&idt->idt_mutex);
1542 
1543 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1544 	idm_task_rele(idt);
1545 
1546 }
1547 
1548 idm_status_t
1549 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1550 {
1551 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1552 	int		pad_len;
1553 	uint32_t	data_digest_crc;
1554 	uint32_t	crc_calculated;
1555 	int		total_len;
1556 	idm_so_conn_t	*so_conn;
1557 
1558 	so_conn = ic->ic_transport_private;
1559 
1560 	pad_len = ((ISCSI_PAD_WORD_LEN -
1561 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1562 	    (ISCSI_PAD_WORD_LEN - 1));
1563 
1564 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1565 
1566 	total_len = pdu->isp_datalen;
1567 
1568 	if (pad_len) {
1569 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1570 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1571 		total_len		+= pad_len;
1572 		pdu->isp_iovlen++;
1573 	}
1574 
1575 	/* setup data digest */
1576 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1577 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1578 		    (char *)&data_digest_crc;
1579 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1580 		    sizeof (data_digest_crc);
1581 		total_len		+= sizeof (data_digest_crc);
1582 		pdu->isp_iovlen++;
1583 	}
1584 
1585 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1586 
1587 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1588 	    pdu->isp_iovlen, total_len) != 0) {
1589 		return (IDM_STATUS_IO);
1590 	}
1591 
1592 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1593 		crc_calculated = idm_crc32c(pdu->isp_data,
1594 		    pdu->isp_datalen);
1595 		if (pad_len) {
1596 			crc_calculated = idm_crc32c_continued((char *)&pad,
1597 			    pad_len, crc_calculated);
1598 		}
1599 		if (crc_calculated != data_digest_crc) {
1600 			IDM_CONN_LOG(CE_WARN,
1601 			    "idm_sorecvdata: "
1602 			    "CRC error: actual 0x%x, calc 0x%x",
1603 			    data_digest_crc, crc_calculated);
1604 
1605 			/* Invalid Data Digest */
1606 			return (IDM_STATUS_DATA_DIGEST);
1607 		}
1608 	}
1609 
1610 	return (IDM_STATUS_SUCCESS);
1611 }
1612 
1613 /*
1614  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1615  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1616  * calling this function.
1617  */
1618 idm_status_t
1619 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1620 {
1621 	iscsi_data_hdr_t	*bhs;
1622 	idm_task_t		*task;
1623 	uint32_t		offset;
1624 	uint8_t			opcode;
1625 	uint32_t		dlength;
1626 	list_t			*buflst;
1627 	uint32_t		xfer_bytes;
1628 	idm_status_t		status;
1629 
1630 	ASSERT(ic != NULL);
1631 	ASSERT(pdu != NULL);
1632 
1633 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1634 
1635 	offset	= ntohl(bhs->offset);
1636 	opcode	= bhs->opcode;
1637 	dlength = n2h24(bhs->dlength);
1638 
1639 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
1640 	    (opcode == ISCSI_OP_SCSI_DATA));
1641 
1642 	/*
1643 	 * Successful lookup implicitly gets a "hold" on the task.  This
1644 	 * hold must be released before leaving this function.  At one
1645 	 * point we were caching this task context and retaining the hold
1646 	 * but it turned out to be very difficult to release the hold properly.
1647 	 * The task can be aborted and the connection shutdown between this
1648 	 * call and the subsequent expected call to idm_so_rx_datain/
1649 	 * idm_so_rx_dataout (in which case those functions are not called).
1650 	 * Releasing the hold in the PDU callback doesn't work well either
1651 	 * because the whole task may be completed by then at which point
1652 	 * it is too late to release the hold -- for better or worse this
1653 	 * code doesn't wait on the refcnts during normal operation.
1654 	 * idm_task_find() is very fast and it is not a huge burden if we
1655 	 * have to do it twice.
1656 	 */
1657 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
1658 	if (task == NULL) {
1659 		IDM_CONN_LOG(CE_WARN,
1660 		    "idm_sorecv_scsidata: could not find task");
1661 		return (IDM_STATUS_FAIL);
1662 	}
1663 
1664 	mutex_enter(&task->idt_mutex);
1665 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
1666 	    &task->idt_inbufv : &task->idt_outbufv;
1667 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
1668 	mutex_exit(&task->idt_mutex);
1669 
1670 	if (pdu->isp_sorx_buf == NULL) {
1671 		idm_task_rele(task);
1672 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
1673 		    "buffer for offset %x opcode=%x",
1674 		    offset, opcode);
1675 		return (IDM_STATUS_FAIL);
1676 	}
1677 
1678 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
1679 	ASSERT(xfer_bytes != 0);
1680 	if (xfer_bytes != dlength) {
1681 		idm_task_rele(task);
1682 		/*
1683 		 * Buffer overflow, connection error.  The PDU data is still
1684 		 * sitting in the socket so we can't use the connection
1685 		 * again until that data is drained.
1686 		 */
1687 		return (IDM_STATUS_FAIL);
1688 	}
1689 
1690 	status = idm_sorecvdata(ic, pdu);
1691 
1692 	idm_task_rele(task);
1693 
1694 	return (status);
1695 }
1696 
1697 static uint32_t
1698 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
1699 {
1700 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
1701 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
1702 
1703 	ASSERT(ro >= idb->idb_bufoffset);
1704 
1705 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
1706 	    (caddr_t)idb->idb_buf + buf_ro;
1707 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
1708 	pdu->isp_iovlen++;
1709 
1710 	return (xfer_len);
1711 }
1712 
1713 int
1714 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1715 {
1716 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
1717 	ASSERT(pdu->isp_data != NULL);
1718 
1719 	pdu->isp_databuflen = pdu->isp_datalen;
1720 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
1721 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
1722 	pdu->isp_iovlen = 1;
1723 	/*
1724 	 * Since we are associating a new data buffer with this received
1725 	 * PDU we need to set a specific callback to free the data
1726 	 * after the PDU is processed.
1727 	 */
1728 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
1729 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
1730 
1731 	return (idm_sorecvdata(ic, pdu));
1732 }
1733 
1734 void
1735 idm_sorx_thread(void *arg)
1736 {
1737 	boolean_t	conn_failure = B_FALSE;
1738 	idm_conn_t	*ic = (idm_conn_t *)arg;
1739 	idm_so_conn_t	*so_conn;
1740 	idm_pdu_t	*pdu;
1741 	idm_status_t	rc;
1742 
1743 	idm_conn_hold(ic);
1744 
1745 	mutex_enter(&ic->ic_mutex);
1746 
1747 	so_conn = ic->ic_transport_private;
1748 	so_conn->ic_rx_thread_running = B_TRUE;
1749 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
1750 	cv_signal(&ic->ic_cv);
1751 
1752 	while (so_conn->ic_rx_thread_running) {
1753 		mutex_exit(&ic->ic_mutex);
1754 
1755 		/*
1756 		 * Get PDU with default header size (large enough for
1757 		 * BHS plus any anticipated AHS).  PDU from
1758 		 * the cache will have all values set correctly
1759 		 * for sockets RX including callback.
1760 		 */
1761 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
1762 		pdu->isp_ic = ic;
1763 		pdu->isp_flags = 0;
1764 		pdu->isp_transport_hdrlen = 0;
1765 
1766 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
1767 			/*
1768 			 * Call idm_pdu_complete so that we call the callback
1769 			 * and ensure any memory allocated in idm_sorecvhdr
1770 			 * gets freed up.
1771 			 */
1772 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1773 
1774 			/*
1775 			 * If ic_rx_thread_running is still set then
1776 			 * this is some kind of connection problem
1777 			 * on the socket.  In this case we want to
1778 			 * generate an event.  Otherwise some other
1779 			 * thread closed the socket due to another
1780 			 * issue in which case we don't need to
1781 			 * generate an event.
1782 			 */
1783 			mutex_enter(&ic->ic_mutex);
1784 			if (so_conn->ic_rx_thread_running) {
1785 				conn_failure = B_TRUE;
1786 				so_conn->ic_rx_thread_running = B_FALSE;
1787 			}
1788 
1789 			continue;
1790 		}
1791 
1792 		/*
1793 		 * Header has been read and validated.  Now we need
1794 		 * to read the PDU data payload (if present).  SCSI data
1795 		 * need to be transferred from the socket directly into
1796 		 * the associated transfer buffer for the SCSI task.
1797 		 */
1798 		if (pdu->isp_datalen != 0) {
1799 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
1800 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
1801 				rc = idm_sorecv_scsidata(ic, pdu);
1802 				/*
1803 				 * All SCSI errors are fatal to the
1804 				 * connection right now since we have no
1805 				 * place to put the data.  What we need
1806 				 * is some kind of sink to dispose of unwanted
1807 				 * SCSI data.  For example an invalid task tag
1808 				 * should not kill the connection (although
1809 				 * we may want to drop the connection).
1810 				 */
1811 			} else {
1812 				/*
1813 				 * Not data PDUs so allocate a buffer for the
1814 				 * data segment and read the remaining data.
1815 				 */
1816 				rc = idm_sorecv_nonscsidata(ic, pdu);
1817 			}
1818 			if (rc != 0) {
1819 				/*
1820 				 * Call idm_pdu_complete so that we call the
1821 				 * callback and ensure any memory allocated
1822 				 * in idm_sorecvhdr gets freed up.
1823 				 */
1824 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1825 
1826 				/*
1827 				 * If ic_rx_thread_running is still set then
1828 				 * this is some kind of connection problem
1829 				 * on the socket.  In this case we want to
1830 				 * generate an event.  Otherwise some other
1831 				 * thread closed the socket due to another
1832 				 * issue in which case we don't need to
1833 				 * generate an event.
1834 				 */
1835 				mutex_enter(&ic->ic_mutex);
1836 				if (so_conn->ic_rx_thread_running) {
1837 					conn_failure = B_TRUE;
1838 					so_conn->ic_rx_thread_running = B_FALSE;
1839 				}
1840 				continue;
1841 			}
1842 		}
1843 
1844 		/*
1845 		 * Process RX PDU
1846 		 */
1847 		idm_pdu_rx(ic, pdu);
1848 
1849 		mutex_enter(&ic->ic_mutex);
1850 	}
1851 
1852 	mutex_exit(&ic->ic_mutex);
1853 
1854 	/*
1855 	 * If we dropped out of the RX processing loop because of
1856 	 * a socket problem or other connection failure (including
1857 	 * digest errors) then we need to generate a state machine
1858 	 * event to shut the connection down.
1859 	 * If the state machine is already in, for example, INIT_ERROR, this
1860 	 * event will get dropped, and the TX thread will never be notified
1861 	 * to shut down.  To be safe, we'll just notify it here.
1862 	 */
1863 	if (conn_failure) {
1864 		if (so_conn->ic_tx_thread_running) {
1865 			so_conn->ic_tx_thread_running = B_FALSE;
1866 			mutex_enter(&so_conn->ic_tx_mutex);
1867 			cv_signal(&so_conn->ic_tx_cv);
1868 			mutex_exit(&so_conn->ic_tx_mutex);
1869 		}
1870 
1871 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
1872 	}
1873 
1874 	idm_conn_rele(ic);
1875 
1876 	thread_exit();
1877 }
1878 
1879 /*
1880  * idm_so_tx
1881  *
1882  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
1883  * point.  By definition, it is supposed to be fast.  So, simply queue
1884  * the entry and return.  The real work is done by idm_i_so_tx() via
1885  * idm_sotx_thread().
1886  */
1887 
1888 static void
1889 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
1890 {
1891 	idm_so_conn_t *so_conn = ic->ic_transport_private;
1892 
1893 	ASSERT(pdu->isp_ic == ic);
1894 	mutex_enter(&so_conn->ic_tx_mutex);
1895 
1896 	if (!so_conn->ic_tx_thread_running) {
1897 		mutex_exit(&so_conn->ic_tx_mutex);
1898 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
1899 		return;
1900 	}
1901 
1902 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
1903 	cv_signal(&so_conn->ic_tx_cv);
1904 	mutex_exit(&so_conn->ic_tx_mutex);
1905 }
1906 
1907 static idm_status_t
1908 idm_i_so_tx(idm_pdu_t *pdu)
1909 {
1910 	idm_conn_t	*ic = pdu->isp_ic;
1911 	idm_status_t	status = IDM_STATUS_SUCCESS;
1912 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1913 	int		pad_len;
1914 	uint32_t	hdr_digest_crc;
1915 	uint32_t	data_digest_crc = 0;
1916 	int		total_len = 0;
1917 	int		iovlen = 0;
1918 	struct iovec	iov[6];
1919 	idm_so_conn_t	*so_conn;
1920 
1921 	so_conn = ic->ic_transport_private;
1922 
1923 	/* Setup BHS */
1924 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
1925 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
1926 	total_len		+= iov[iovlen].iov_len;
1927 	iovlen++;
1928 
1929 	/* Setup header digest */
1930 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
1931 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
1932 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
1933 
1934 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
1935 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
1936 		total_len		+= iov[iovlen].iov_len;
1937 		iovlen++;
1938 	}
1939 
1940 	/* Setup the data */
1941 	if (pdu->isp_datalen) {
1942 		idm_task_t		*idt;
1943 		idm_buf_t		*idb;
1944 		iscsi_data_hdr_t	*ihp;
1945 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
1946 		/* Write of immediate data */
1947 		if (ic->ic_ffp &&
1948 		    (ihp->opcode == ISCSI_OP_SCSI_CMD ||
1949 		    ihp->opcode == ISCSI_OP_SCSI_DATA)) {
1950 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
1951 			if (idt) {
1952 				mutex_enter(&idt->idt_mutex);
1953 				idb = idm_buf_find(&idt->idt_outbufv, 0);
1954 				mutex_exit(&idt->idt_mutex);
1955 				/*
1956 				 * If the initiator call to idm_buf_alloc
1957 				 * failed then we can get to this point
1958 				 * without a bound buffer.  The associated
1959 				 * connection failure will clean things up
1960 				 * later.  It would be nice to come up with
1961 				 * a cleaner way to handle this.  In
1962 				 * particular it seems absurd to look up
1963 				 * the task and the buffer just to update
1964 				 * this counter.
1965 				 */
1966 				if (idb)
1967 					idb->idb_xfer_len += pdu->isp_datalen;
1968 				idm_task_rele(idt);
1969 			}
1970 		}
1971 
1972 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
1973 		iov[iovlen].iov_len  = pdu->isp_datalen;
1974 		total_len += iov[iovlen].iov_len;
1975 		iovlen++;
1976 	}
1977 
1978 	/* Setup the data pad if necessary */
1979 	pad_len = ((ISCSI_PAD_WORD_LEN -
1980 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1981 	    (ISCSI_PAD_WORD_LEN - 1));
1982 
1983 	if (pad_len) {
1984 		bzero(pad, sizeof (pad));
1985 		iov[iovlen].iov_base = (void *)&pad;
1986 		iov[iovlen].iov_len  = pad_len;
1987 		total_len		+= iov[iovlen].iov_len;
1988 		iovlen++;
1989 	}
1990 
1991 	/*
1992 	 * Setup the data digest if enabled.  Data-digest is not sent
1993 	 * for login-phase PDUs.
1994 	 */
1995 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
1996 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
1997 	    (pdu->isp_datalen || pad_len)) {
1998 		/*
1999 		 * RFC3720/10.2.3: A zero-length Data Segment also
2000 		 * implies a zero-length data digest.
2001 		 */
2002 		if (pdu->isp_datalen) {
2003 			data_digest_crc = idm_crc32c(pdu->isp_data,
2004 			    pdu->isp_datalen);
2005 		}
2006 		if (pad_len) {
2007 			data_digest_crc = idm_crc32c_continued(&pad,
2008 			    pad_len, data_digest_crc);
2009 		}
2010 
2011 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2012 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2013 		total_len		+= iov[iovlen].iov_len;
2014 		iovlen++;
2015 	}
2016 
2017 	/* Transmit the PDU */
2018 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2019 	    total_len) != 0) {
2020 		/* Set error status */
2021 		IDM_CONN_LOG(CE_WARN,
2022 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2023 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2024 		    (void *) pdu->isp_data);
2025 		status = IDM_STATUS_IO;
2026 	}
2027 
2028 	/*
2029 	 * Success does not mean that the PDU actually reached the
2030 	 * remote node since it could get dropped along the way.
2031 	 */
2032 	idm_pdu_complete(pdu, status);
2033 
2034 	return (status);
2035 }
2036 
2037 /*
2038  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2039  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2040  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2041  * A target can invoke this function multiple times for a single read command
2042  * (identified by the same ITT) to split the input into several sequences.
2043  *
2044  * DataSN starts with 0 for the first data PDU of an input command and advances
2045  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2046  * which is set to 1 for the last data PDU of a sequence.
2047  *
2048  * Scope for Prototype build:
2049  * The data PDUs within a sequence will be sent in order with the buffer offset
2050  * in increasing order. i.e. initiator and target must have negotiated the
2051  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2052  *
2053  * Caller holds idt->idt_mutex
2054  */
2055 static idm_status_t
2056 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2057 {
2058 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2059 	idm_pdu_t	tmppdu;
2060 
2061 	ASSERT(mutex_owned(&idt->idt_mutex));
2062 
2063 	/*
2064 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2065 	 * idm_sotx_thread.
2066 	 */
2067 	mutex_enter(&so_conn->ic_tx_mutex);
2068 
2069 	if (!so_conn->ic_tx_thread_running) {
2070 		mutex_exit(&so_conn->ic_tx_mutex);
2071 		/*
2072 		 * Don't release idt->idt_mutex since we're supposed to hold
2073 		 * in when calling idm_buf_tx_to_ini_done
2074 		 */
2075 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2076 		return (IDM_STATUS_FAIL);
2077 	}
2078 
2079 	/*
2080 	 * Build a template for the data PDU headers we will use so that
2081 	 * the SN values will stay consistent with other PDU's we are
2082 	 * transmitting like R2T and SCSI status.
2083 	 */
2084 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2085 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2086 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2087 	    ISCSI_OP_SCSI_DATA_RSP);
2088 	idb->idb_tx_thread = B_TRUE;
2089 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2090 	cv_signal(&so_conn->ic_tx_cv);
2091 	mutex_exit(&so_conn->ic_tx_mutex);
2092 	mutex_exit(&idt->idt_mutex);
2093 
2094 	/*
2095 	 * Returning success here indicates the transfer was successfully
2096 	 * dispatched -- it does not mean that the transfer completed
2097 	 * successfully.
2098 	 */
2099 	return (IDM_STATUS_SUCCESS);
2100 }
2101 
2102 /*
2103  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2104  * data blocks it is ready to receive from the initiator in response to a WRITE
2105  * SCSI command. The target iSCSI layer passes the information about the desired
2106  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2107  * offset and datalen are passed via the 'idb' argument.
2108  *
2109  * Scope for Prototype build:
2110  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2111  * negotiated the "InitialR2T" to "Yes".
2112  *
2113  * Caller holds idt->idt_mutex
2114  */
2115 static idm_status_t
2116 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2117 {
2118 	idm_pdu_t		*pdu;
2119 	iscsi_rtt_hdr_t		*rtt;
2120 
2121 	ASSERT(mutex_owned(&idt->idt_mutex));
2122 
2123 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2124 	pdu->isp_ic = idt->idt_ic;
2125 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2126 
2127 	/* iSCSI layer fills the TTT, ITT, StatSN, ExpCmdSN, MaxCmdSN */
2128 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2129 
2130 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2131 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2132 
2133 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2134 	rtt->flags		= ISCSI_FLAG_FINAL;
2135 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2136 	rtt->data_length	= htonl(idb->idb_xfer_len);
2137 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2138 
2139 	/* Keep track of buffer offsets */
2140 	idb->idb_exp_offset	= idb->idb_bufoffset;
2141 	mutex_exit(&idt->idt_mutex);
2142 
2143 	/*
2144 	 * Transmit the PDU.
2145 	 */
2146 	idm_pdu_tx(pdu);
2147 
2148 	return (IDM_STATUS_SUCCESS);
2149 }
2150 
2151 static idm_status_t
2152 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2153 {
2154 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2155 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2156 		    KM_NOSLEEP);
2157 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2158 	} else {
2159 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2160 		idb->idb_buf_private = NULL;
2161 	}
2162 
2163 	if (idb->idb_buf == NULL) {
2164 		IDM_CONN_LOG(CE_NOTE,
2165 		    "idm_so_buf_alloc: failed buffer allocation");
2166 		return (IDM_STATUS_FAIL);
2167 	}
2168 
2169 	return (IDM_STATUS_SUCCESS);
2170 }
2171 
2172 /* ARGSUSED */
2173 static idm_status_t
2174 idm_so_buf_setup(idm_buf_t *idb)
2175 {
2176 	/* Ensure bufalloc'd flag is unset */
2177 	idb->idb_bufalloc = B_FALSE;
2178 
2179 	return (IDM_STATUS_SUCCESS);
2180 }
2181 
2182 /* ARGSUSED */
2183 static void
2184 idm_so_buf_teardown(idm_buf_t *idb)
2185 {
2186 	/* nothing to do here */
2187 }
2188 
2189 static void
2190 idm_so_buf_free(idm_buf_t *idb)
2191 {
2192 	if (idb->idb_buf_private == NULL) {
2193 		kmem_free(idb->idb_buf, idb->idb_buflen);
2194 	} else {
2195 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2196 	}
2197 }
2198 
2199 static void
2200 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2201     uint32_t offset, uint32_t length)
2202 {
2203 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2204 	idm_pdu_t	tmppdu;
2205 	idm_buf_t	*rtt_buf;
2206 
2207 	ASSERT(mutex_owned(&idt->idt_mutex));
2208 
2209 	/*
2210 	 * Allocate a buffer to represent the RTT transfer.  We could further
2211 	 * optimize this by allocating the buffers internally from an rtt
2212 	 * specific buffer cache since this is socket-specific code but for
2213 	 * now we will keep it simple.
2214 	 */
2215 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2216 	if (rtt_buf == NULL) {
2217 		/*
2218 		 * If we're in FFP then the failure was likely a resource
2219 		 * allocation issue and we should close the connection by
2220 		 * sending a CE_TRANSPORT_FAIL event.
2221 		 *
2222 		 * If we're not in FFP then idm_buf_alloc will always
2223 		 * fail and the state is transitioning to "complete" anyway
2224 		 * so we won't bother to send an event.
2225 		 */
2226 		mutex_enter(&ic->ic_state_mutex);
2227 		if (ic->ic_ffp)
2228 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2229 			    NULL, CT_NONE);
2230 		mutex_exit(&ic->ic_state_mutex);
2231 		return;
2232 	}
2233 
2234 	rtt_buf->idb_buf_cb = NULL;
2235 	rtt_buf->idb_cb_arg = NULL;
2236 	rtt_buf->idb_bufoffset = offset;
2237 	rtt_buf->idb_xfer_len = length;
2238 	rtt_buf->idb_ic = idt->idt_ic;
2239 	rtt_buf->idb_task_binding = idt;
2240 
2241 	/*
2242 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2243 	 * idm_sotx_thread.
2244 	 */
2245 	mutex_enter(&so_conn->ic_tx_mutex);
2246 
2247 	if (!so_conn->ic_tx_thread_running) {
2248 		idm_buf_free(rtt_buf);
2249 		mutex_exit(&so_conn->ic_tx_mutex);
2250 		return;
2251 	}
2252 
2253 	/*
2254 	 * This new buffer represents an additional reference on the task
2255 	 */
2256 	idm_task_hold(idt);
2257 
2258 	/*
2259 	 * Build a template for the data PDU headers we will use so that
2260 	 * the SN values will stay consistent with other PDU's we are
2261 	 * transmitting like R2T and SCSI status.
2262 	 */
2263 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2264 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2265 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2266 	    ISCSI_OP_SCSI_DATA);
2267 	rtt_buf->idb_tx_thread = B_TRUE;
2268 	rtt_buf->idb_in_transport = B_TRUE;
2269 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2270 	cv_signal(&so_conn->ic_tx_cv);
2271 	mutex_exit(&so_conn->ic_tx_mutex);
2272 }
2273 
2274 static void
2275 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2276 {
2277 	/*
2278 	 * Don't worry about status -- we assume any error handling
2279 	 * is performed by the caller (idm_sotx_thread).
2280 	 */
2281 	idb->idb_in_transport = B_FALSE;
2282 	idm_task_rele(idt);
2283 	idm_buf_free(idb);
2284 }
2285 
2286 static idm_status_t
2287 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2288     uint32_t buf_region_offset, uint32_t buf_region_length)
2289 {
2290 	idm_conn_t		*ic;
2291 	uint32_t		max_dataseglen;
2292 	size_t			remainder, chunk;
2293 	uint32_t		data_offset = buf_region_offset;
2294 	iscsi_data_hdr_t	*bhs;
2295 	idm_pdu_t		*pdu;
2296 	idm_status_t		tx_status;
2297 
2298 	ASSERT(mutex_owned(&idt->idt_mutex));
2299 
2300 	ic = idt->idt_ic;
2301 
2302 	max_dataseglen = 8192; /* Need value from login negotiation */
2303 	remainder = buf_region_length;
2304 
2305 	while (remainder) {
2306 		if (idt->idt_state != TASK_ACTIVE) {
2307 			ASSERT((idt->idt_state != TASK_IDLE) &&
2308 			    (idt->idt_state != TASK_COMPLETE));
2309 			return (IDM_STATUS_ABORTED);
2310 		}
2311 
2312 		/* check to see if we need to chunk the data */
2313 		if (remainder > max_dataseglen) {
2314 			chunk = max_dataseglen;
2315 		} else {
2316 			chunk = remainder;
2317 		}
2318 
2319 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2320 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2321 		pdu->isp_ic = ic;
2322 
2323 		/*
2324 		 * We've already built a build a header template
2325 		 * to use during the transfer.  Use this template so that
2326 		 * the SN values stay consistent with any unrelated PDU's
2327 		 * being transmitted.
2328 		 */
2329 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2330 		    sizeof (iscsi_hdr_t));
2331 
2332 		/*
2333 		 * Set DataSN, data offset, and flags in BHS
2334 		 * For the prototype build, A = 0, S = 0, U = 0
2335 		 */
2336 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2337 
2338 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2339 
2340 		hton24(bhs->dlength, chunk);
2341 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2342 
2343 		if (chunk == remainder) {
2344 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2345 		}
2346 
2347 		/* setup data */
2348 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2349 		pdu->isp_datalen = (uint_t)chunk;
2350 		remainder	-= chunk;
2351 		data_offset	+= chunk;
2352 
2353 		/*
2354 		 * Now that we're done working with idt_exp_datasn,
2355 		 * idt->idt_state and idb->idb_bufoffset we can release
2356 		 * the task lock -- don't want to hold it across the
2357 		 * call to idm_i_so_tx since we could block.
2358 		 */
2359 		mutex_exit(&idt->idt_mutex);
2360 
2361 		/*
2362 		 * Transmit the PDU.  Call the internal routine directly
2363 		 * as there is already implicit ordering.
2364 		 */
2365 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2366 			mutex_enter(&idt->idt_mutex);
2367 			return (tx_status);
2368 		}
2369 
2370 		mutex_enter(&idt->idt_mutex);
2371 		idt->idt_tx_bytes += chunk;
2372 	}
2373 
2374 	return (IDM_STATUS_SUCCESS);
2375 }
2376 
2377 /*
2378  * TX PDU cache
2379  */
2380 /* ARGSUSED */
2381 int
2382 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2383 {
2384 	idm_pdu_t	*pdu = hdl;
2385 
2386 	bzero(pdu, sizeof (idm_pdu_t));
2387 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2388 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2389 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2390 	pdu->isp_magic = IDM_PDU_MAGIC;
2391 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2392 
2393 	return (0);
2394 }
2395 
2396 /* ARGSUSED */
2397 void
2398 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2399 {
2400 	/* reset values between use */
2401 	pdu->isp_datalen = 0;
2402 
2403 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2404 }
2405 
2406 /*
2407  * RX PDU cache
2408  */
2409 /* ARGSUSED */
2410 int
2411 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2412 {
2413 	idm_pdu_t	*pdu = hdl;
2414 
2415 	bzero(pdu, sizeof (idm_pdu_t));
2416 	pdu->isp_magic = IDM_PDU_MAGIC;
2417 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2418 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2419 
2420 	return (0);
2421 }
2422 
2423 /* ARGSUSED */
2424 static void
2425 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2426 {
2427 	pdu->isp_iovlen = 0;
2428 	pdu->isp_sorx_buf = 0;
2429 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2430 }
2431 
2432 static void
2433 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2434 {
2435 	/*
2436 	 * We had to modify our cached RX PDU with a longer header buffer
2437 	 * and/or a longer data buffer.  Release the new buffers and fix
2438 	 * the fields back to what we would expect for a cached RX PDU.
2439 	 */
2440 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2441 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2442 	}
2443 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2444 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2445 	}
2446 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2447 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2448 	pdu->isp_data = NULL;
2449 	pdu->isp_datalen = 0;
2450 	pdu->isp_sorx_buf = 0;
2451 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2452 	idm_sorx_cache_pdu_cb(pdu, status);
2453 }
2454 
2455 /*
2456  * This thread is only active when I/O is queued for transmit
2457  * because the socket is busy.
2458  */
2459 void
2460 idm_sotx_thread(void *arg)
2461 {
2462 	idm_conn_t	*ic = arg;
2463 	idm_tx_obj_t	*object, *next;
2464 	idm_so_conn_t	*so_conn;
2465 	idm_status_t	status = IDM_STATUS_SUCCESS;
2466 
2467 	idm_conn_hold(ic);
2468 
2469 	mutex_enter(&ic->ic_mutex);
2470 	so_conn = ic->ic_transport_private;
2471 	so_conn->ic_tx_thread_running = B_TRUE;
2472 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2473 	cv_signal(&ic->ic_cv);
2474 	mutex_exit(&ic->ic_mutex);
2475 
2476 	mutex_enter(&so_conn->ic_tx_mutex);
2477 
2478 	while (so_conn->ic_tx_thread_running) {
2479 		while (list_is_empty(&so_conn->ic_tx_list)) {
2480 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2481 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2482 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2483 
2484 			if (!so_conn->ic_tx_thread_running) {
2485 				goto tx_bail;
2486 			}
2487 		}
2488 
2489 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2490 		list_remove(&so_conn->ic_tx_list, object);
2491 		mutex_exit(&so_conn->ic_tx_mutex);
2492 
2493 		switch (object->idm_tx_obj_magic) {
2494 		case IDM_PDU_MAGIC:
2495 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2496 			    idm_pdu_t *, (idm_pdu_t *)object);
2497 
2498 			status = idm_i_so_tx((idm_pdu_t *)object);
2499 			break;
2500 
2501 		case IDM_BUF_MAGIC: {
2502 			idm_buf_t *idb = (idm_buf_t *)object;
2503 			idm_task_t *idt = idb->idb_task_binding;
2504 
2505 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2506 			    idm_buf_t *, idb);
2507 
2508 			mutex_enter(&idt->idt_mutex);
2509 			status = idm_so_send_buf_region(idt,
2510 			    idb, 0, idb->idb_xfer_len);
2511 
2512 			/*
2513 			 * TX thread owns the buffer so we expect it to
2514 			 * be "in transport"
2515 			 */
2516 			ASSERT(idb->idb_in_transport);
2517 			if (IDM_CONN_ISTGT(ic)) {
2518 				/*
2519 				 * idm_buf_tx_to_ini_done releases
2520 				 * idt->idt_mutex
2521 				 */
2522 				idm_buf_tx_to_ini_done(idt, idb, status);
2523 			} else {
2524 				idm_so_send_rtt_data_done(idt, idb);
2525 				mutex_exit(&idt->idt_mutex);
2526 			}
2527 			break;
2528 		}
2529 
2530 		default:
2531 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2532 			    "(0x%08x)", object->idm_tx_obj_magic);
2533 			status = IDM_STATUS_FAIL;
2534 		}
2535 
2536 		mutex_enter(&so_conn->ic_tx_mutex);
2537 
2538 		if (status != IDM_STATUS_SUCCESS) {
2539 			so_conn->ic_tx_thread_running = B_FALSE;
2540 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2541 		}
2542 	}
2543 
2544 	/*
2545 	 * Before we leave, we need to abort every item remaining in the
2546 	 * TX list.
2547 	 */
2548 
2549 tx_bail:
2550 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2551 
2552 	while (object != NULL) {
2553 		next = list_next(&so_conn->ic_tx_list, object);
2554 
2555 		list_remove(&so_conn->ic_tx_list, object);
2556 		switch (object->idm_tx_obj_magic) {
2557 		case IDM_PDU_MAGIC:
2558 			idm_pdu_complete((idm_pdu_t *)object,
2559 			    IDM_STATUS_ABORTED);
2560 			break;
2561 
2562 		case IDM_BUF_MAGIC: {
2563 			idm_buf_t *idb = (idm_buf_t *)object;
2564 			idm_task_t *idt = idb->idb_task_binding;
2565 			mutex_exit(&so_conn->ic_tx_mutex);
2566 			mutex_enter(&idt->idt_mutex);
2567 			/*
2568 			 * TX thread owns the buffer so we expect it to
2569 			 * be "in transport"
2570 			 */
2571 			ASSERT(idb->idb_in_transport);
2572 			if (IDM_CONN_ISTGT(ic)) {
2573 				/*
2574 				 * idm_buf_tx_to_ini_done releases
2575 				 * idt->idt_mutex
2576 				 */
2577 				idm_buf_tx_to_ini_done(idt, idb,
2578 				    IDM_STATUS_ABORTED);
2579 			} else {
2580 				idm_so_send_rtt_data_done(idt, idb);
2581 				mutex_exit(&idt->idt_mutex);
2582 			}
2583 			mutex_enter(&so_conn->ic_tx_mutex);
2584 			break;
2585 		}
2586 		default:
2587 			IDM_CONN_LOG(CE_WARN,
2588 			    "idm_sotx_thread: Unexpected magic "
2589 			    "(0x%08x)", object->idm_tx_obj_magic);
2590 		}
2591 
2592 		object = next;
2593 	}
2594 
2595 	mutex_exit(&so_conn->ic_tx_mutex);
2596 	idm_conn_rele(ic);
2597 	thread_exit();
2598 	/*NOTREACHED*/
2599 }
2600