1d54cfbdroberto/*
2047f369cy * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
3d54cfbdroberto * Copyright (C) 2000-2003  Internet Software Consortium.
4d54cfbdroberto *
5d54cfbdroberto * Permission to use, copy, modify, and/or distribute this software for any
6d54cfbdroberto * purpose with or without fee is hereby granted, provided that the above
7d54cfbdroberto * copyright notice and this permission notice appear in all copies.
8d54cfbdroberto *
9d54cfbdroberto * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10d54cfbdroberto * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11d54cfbdroberto * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12d54cfbdroberto * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13d54cfbdroberto * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14d54cfbdroberto * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15d54cfbdroberto * PERFORMANCE OF THIS SOFTWARE.
16d54cfbdroberto */
17d54cfbdroberto
18047f369cy/* $Id$ */
19d54cfbdroberto
20d54cfbdroberto/* This code uses functions which are only available on Server 2003 and
21d54cfbdroberto * higher, and Windows XP and higher.
22d54cfbdroberto *
23d54cfbdroberto * This code is by nature multithreaded and takes advantage of various
24d54cfbdroberto * features to pass on information through the completion port for
25d54cfbdroberto * when I/O is completed.  All sends, receives, accepts, and connects are
26d54cfbdroberto * completed through the completion port.
27d54cfbdroberto *
28d54cfbdroberto * The number of Completion Port Worker threads used is the total number
29d54cfbdroberto * of CPU's + 1. This increases the likelihood that a Worker Thread is
30d54cfbdroberto * available for processing a completed request.
31d54cfbdroberto *
32d54cfbdroberto * XXXPDM 5 August, 2002
33d54cfbdroberto */
34d54cfbdroberto
35d54cfbdroberto#define MAKE_EXTERNAL 1
36d54cfbdroberto#include <config.h>
37d54cfbdroberto
38d54cfbdroberto#include <sys/types.h>
39d54cfbdroberto
40d54cfbdroberto#ifndef _WINSOCKAPI_
41d54cfbdroberto#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
42d54cfbdroberto#endif
43d54cfbdroberto
44d54cfbdroberto#include <errno.h>
45d54cfbdroberto#include <stddef.h>
46d54cfbdroberto#include <stdlib.h>
47d54cfbdroberto#include <string.h>
48d54cfbdroberto#include <unistd.h>
49d54cfbdroberto#include <io.h>
50d54cfbdroberto#include <fcntl.h>
51d54cfbdroberto#include <process.h>
52d54cfbdroberto
53d54cfbdroberto#include <isc/buffer.h>
54d54cfbdroberto#include <isc/bufferlist.h>
55d54cfbdroberto#include <isc/condition.h>
56d54cfbdroberto#include <isc/list.h>
57d54cfbdroberto#include <isc/log.h>
58d54cfbdroberto#include <isc/mem.h>
59d54cfbdroberto#include <isc/msgs.h>
60d54cfbdroberto#include <isc/mutex.h>
61d54cfbdroberto#include <isc/net.h>
62d54cfbdroberto#include <isc/once.h>
63d54cfbdroberto#include <isc/os.h>
64d54cfbdroberto#include <isc/platform.h>
65d54cfbdroberto#include <isc/print.h>
66d54cfbdroberto#include <isc/region.h>
67d54cfbdroberto#include <isc/socket.h>
68d54cfbdroberto#include <isc/stats.h>
69d54cfbdroberto#include <isc/strerror.h>
70d54cfbdroberto#include <isc/syslog.h>
71d54cfbdroberto#include <isc/task.h>
72d54cfbdroberto#include <isc/thread.h>
73d54cfbdroberto#include <isc/util.h>
74d54cfbdroberto#include <isc/win32os.h>
75d54cfbdroberto
76d54cfbdroberto#include <mswsock.h>
77d54cfbdroberto
78d54cfbdroberto#include "errno2result.h"
79d54cfbdroberto
80d54cfbdroberto/*
81d54cfbdroberto * How in the world can Microsoft exist with APIs like this?
82d54cfbdroberto * We can't actually call this directly, because it turns out
83d54cfbdroberto * no library exports this function.  Instead, we need to
84d54cfbdroberto * issue a runtime call to get the address.
85d54cfbdroberto */
86d54cfbdrobertoLPFN_CONNECTEX ISCConnectEx;
87d54cfbdrobertoLPFN_ACCEPTEX ISCAcceptEx;
88d54cfbdrobertoLPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;
89d54cfbdroberto
90d54cfbdroberto/*
91d54cfbdroberto * Run expensive internal consistency checks.
92d54cfbdroberto */
93d54cfbdroberto#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
94d54cfbdroberto#define CONSISTENT(sock) consistent(sock)
95d54cfbdroberto#else
96d54cfbdroberto#define CONSISTENT(sock) do {} while (0)
97d54cfbdroberto#endif
98d54cfbdrobertostatic void consistent(isc_socket_t *sock);
99d54cfbdroberto
/*
 * Control code used to control the behavior of connection resets on
 * UDP sockets; see Microsoft KnowledgeBase Article Q263823 for details.
 * It is only defined here when the system headers have not done so.
 * NOTE: Windows 2000 systems must have Service Pack 2 or later
 * installed for this to work.
 */
#if !defined(SIO_UDP_CONNRESET)
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
#endif

/*
 * The socket length argument is an int on some systems, a size_t on
 * others and a socklen_t on yet others.  It is kept behind a macro so
 * that it can easily be changed if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
118d54cfbdroberto
/*
 * "Soft" errors are the non-fatal results that network functions such
 * as recv() may return.  A value of 0 is classified as soft too.
 */
#define SOFT_ERROR(e) \
	((e) == WSAEINTR || (e) == WSAEWOULDBLOCK || \
	 (e) == EWOULDBLOCK || (e) == EINTR || \
	 (e) == EAGAIN || (e) == 0)

/*
 * A pending overlapped operation is not really an error, so it is
 * classified separately from the soft errors above.
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)
135d54cfbdroberto
/* DOIO_* status codes. */
#define DOIO_SUCCESS	0	/* i/o ok, event sent */
#define DOIO_SOFT	1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD	2	/* i/o error, event sent */
#define DOIO_EOF	3	/* EOF, no event sent */
#define DOIO_PENDING	4	/* status when i/o is in process */
#define DOIO_NEEDMORE	5	/* i/o was processed, but more is needed due to minimum */

/* Expands to the category, module and debug-level log arguments. */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * Debug levels used in this file:
 *
 *	90  Function entry/exit and other tracing.
 *	70  Socket "correctness" -- including returning of events, etc.
 *	60  Socket data send/receive.
 *	50  Event tracing, including receiving/sending completion events.
 *	20  Socket creation/destruction.
 */
#define CREATION_LEVEL		20
#define EVENT_LEVEL		50
#define IOEVENT_LEVEL		60
#define CORRECTNESS_LEVEL	70
#define TRACE_LEVEL		90

#define CREATION	DLVL(CREATION_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define TRACE		DLVL(TRACE_LEVEL)
163d54cfbdroberto
164d54cfbdrobertotypedef isc_event_t intev_t;
165d54cfbdroberto
166d54cfbdroberto/*
167d54cfbdroberto * Socket State
168d54cfbdroberto */
169d54cfbdrobertoenum {
170d54cfbdroberto  SOCK_INITIALIZED,	/* Socket Initialized */
171d54cfbdroberto  SOCK_OPEN,		/* Socket opened but nothing yet to do */
172d54cfbdroberto  SOCK_DATA,		/* Socket sending or receiving data */
173d54cfbdroberto  SOCK_LISTEN,		/* TCP Socket listening for connects */
174d54cfbdroberto  SOCK_ACCEPT,		/* TCP socket is waiting to accept */
175d54cfbdroberto  SOCK_CONNECT,		/* TCP Socket connecting */
176d54cfbdroberto  SOCK_CLOSED,		/* Socket has been closed */
177d54cfbdroberto};
178d54cfbdroberto
179d54cfbdroberto#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
180d54cfbdroberto#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
181d54cfbdroberto
/*
 * IPv6 control information.  On an IPv6 socket we would like to collect
 * the destination address and interface so that the client can set
 * them on outgoing packets.
 */
#if defined(ISC_PLATFORM_HAVEIPV6) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*
 * ...but we really don't want to try and use these control messages:
 * Win32 doesn't have this mechanism before XP.  USE_CMSG is therefore
 * always left undefined from here on.
 */
#undef USE_CMSG
198d54cfbdroberto
199d54cfbdroberto/*
200d54cfbdroberto * Message header for recvmsg and sendmsg calls.
201d54cfbdroberto * Used value-result for recvmsg, value only for sendmsg.
202d54cfbdroberto */
203d54cfbdrobertostruct msghdr {
204d54cfbdroberto	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
205d54cfbdroberto	int      to_addr_len;		/* length of the address */
206d54cfbdroberto	WSABUF  *msg_iov;		/* scatter/gather array */
207d54cfbdroberto	u_int   msg_iovlen;             /* # elements in msg_iov */
208d54cfbdroberto	void	*msg_control;           /* ancillary data, see below */
209d54cfbdroberto	u_int   msg_controllen;         /* ancillary data buffer len */
210d54cfbdroberto	int	msg_totallen;		/* total length of this message */
211d54cfbdroberto} msghdr;
212d54cfbdroberto
/*
 * Size the receive buffer is raised to.
 */
#define RCVBUFSIZE (32*1024)

/*
 * How many times a send operation is repeated when the result is
 * WSAEINTR.
 */
#define NRETRIES 10
223d54cfbdroberto
224d54cfbdrobertostruct isc_socket {
225d54cfbdroberto	/* Not locked. */
226d54cfbdroberto	unsigned int		magic;
227d54cfbdroberto	isc_socketmgr_t	       *manager;
228d54cfbdroberto	isc_mutex_t		lock;
229d54cfbdroberto	isc_sockettype_t	type;
230d54cfbdroberto
231d54cfbdroberto	/* Pointers to scatter/gather buffers */
232d54cfbdroberto	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];
233d54cfbdroberto
234d54cfbdroberto	/* Locked by socket lock. */
235d54cfbdroberto	ISC_LINK(isc_socket_t)	link;
236d54cfbdroberto	unsigned int		references; /* EXTERNAL references */
237d54cfbdroberto	SOCKET			fd;	/* file handle */
238d54cfbdroberto	int			pf;	/* protocol family */
239d54cfbdroberto	char			name[16];
240d54cfbdroberto	void *			tag;
241d54cfbdroberto
242d54cfbdroberto	/*
243d54cfbdroberto	 * Each recv() call uses this buffer.  It is a per-socket receive
244d54cfbdroberto	 * buffer that allows us to decouple the system recv() from the
245d54cfbdroberto	 * recv_list done events.  This means the items on the recv_list
246d54cfbdroberto	 * can be removed without having to cancel pending system recv()
247d54cfbdroberto	 * calls.  It also allows us to read-ahead in some cases.
248d54cfbdroberto	 */
249d54cfbdroberto	struct {
250d54cfbdroberto		SOCKADDR_STORAGE	from_addr;	   // UDP send/recv address
251d54cfbdroberto		int		from_addr_len;	   // length of the address
252d54cfbdroberto		char		*base;		   // the base of the buffer
253d54cfbdroberto		char		*consume_position; // where to start copying data from next
254d54cfbdroberto		unsigned int	len;		   // the actual size of this buffer
255d54cfbdroberto		unsigned int	remaining;	   // the number of bytes remaining
256d54cfbdroberto	} recvbuf;
257d54cfbdroberto
258d54cfbdroberto	ISC_LIST(isc_socketevent_t)		send_list;
259d54cfbdroberto	ISC_LIST(isc_socketevent_t)		recv_list;
260d54cfbdroberto	ISC_LIST(isc_socket_newconnev_t)	accept_list;
261d54cfbdroberto	isc_socket_connev_t		       *connect_ev;
262d54cfbdroberto
263d54cfbdroberto	isc_sockaddr_t		address;  /* remote address */
264d54cfbdroberto
265d54cfbdroberto	unsigned int		listener : 1,	/* listener socket */
266d54cfbdroberto				connected : 1,
267d54cfbdroberto				pending_connect : 1, /* connect pending */
268047f369cy				bound : 1,	/* bound to local addr */
269047f369cy				dupped : 1;     /* created by isc_socket_dup() */
270d54cfbdroberto	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
271d54cfbdroberto	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
272d54cfbdroberto	unsigned int		pending_send;  /* Number of outstanding send() calls. */
273d54cfbdroberto	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
274d54cfbdroberto	unsigned int		state; /* Socket state. Debugging and consistency checking. */
275d54cfbdroberto	int			state_lineno;  /* line which last touched state */
276d54cfbdroberto};
277d54cfbdroberto
278d54cfbdroberto#define _set_state(sock, _state) do { (sock)->state = (_state); (sock)->state_lineno = __LINE__; } while (0)
279d54cfbdroberto
280d54cfbdroberto/*
281d54cfbdroberto * Buffer structure
282d54cfbdroberto */
283d54cfbdrobertotypedef struct buflist buflist_t;
284d54cfbdroberto
285d54cfbdrobertostruct buflist {
286d54cfbdroberto	void			*buf;
287d54cfbdroberto	unsigned int		buflen;
288d54cfbdroberto	ISC_LINK(buflist_t)	link;
289d54cfbdroberto};
290d54cfbdroberto
291d54cfbdroberto/*
292d54cfbdroberto * I/O Completion ports Info structures
293d54cfbdroberto */
294d54cfbdroberto
295d54cfbdrobertostatic HANDLE hHeapHandle = NULL;
296d54cfbdrobertotypedef struct IoCompletionInfo {
297d54cfbdroberto	OVERLAPPED		overlapped;
298d54cfbdroberto	isc_socketevent_t	*dev;  /* send()/recv() done event */
299d54cfbdroberto	isc_socket_connev_t	*cdev; /* connect() done event */
300d54cfbdroberto	isc_socket_newconnev_t	*adev; /* accept() done event */
301d54cfbdroberto	void			*acceptbuffer;
302d54cfbdroberto	DWORD			received_bytes;
303d54cfbdroberto	int			request_type;
304d54cfbdroberto	struct msghdr		messagehdr;
305d54cfbdroberto	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
306d54cfbdroberto} IoCompletionInfo;
307d54cfbdroberto
/*
 * Upper limit on the number of I/O Completion Port worker threads that
 * handle the load on the Completion Port.  The number actually used is
 * the number of CPUs + 1.
 */
#define MAX_IOCPTHREADS 20

/* Magic number identifying a valid isc_socketmgr_t, and its check. */
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
317d54cfbdroberto
318d54cfbdrobertostruct isc_socketmgr {
319d54cfbdroberto	/* Not locked. */
320d54cfbdroberto	unsigned int			magic;
321d54cfbdroberto	isc_mem_t		       *mctx;
322d54cfbdroberto	isc_mutex_t			lock;
323d54cfbdroberto	isc_stats_t		       *stats;
324d54cfbdroberto
325d54cfbdroberto	/* Locked by manager lock. */
326d54cfbdroberto	ISC_LIST(isc_socket_t)		socklist;
327d54cfbdroberto	isc_boolean_t			bShutdown;
328d54cfbdroberto	isc_condition_t			shutdown_ok;
329d54cfbdroberto	HANDLE				hIoCompletionPort;
330d54cfbdroberto	int				maxIOCPThreads;
331d54cfbdroberto	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
332d54cfbdroberto	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];
333d54cfbdroberto
334d54cfbdroberto	/*
335d54cfbdroberto	 * Debugging.
336d54cfbdroberto	 * Modified by InterlockedIncrement() and InterlockedDecrement()
337d54cfbdroberto	 */
338d54cfbdroberto	LONG				totalSockets;
339d54cfbdroberto	LONG				iocp_total;
340d54cfbdroberto};
341d54cfbdroberto
/* Request types. */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};

/*
 * iovec counts for send() and recv()
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
354d54cfbdroberto
355047f369cystatic isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
356047f369cy				  isc_sockettype_t type,
357047f369cy				  isc_socket_t **socketp,
358047f369cy				  isc_socket_t *dup_socket);
359d54cfbdrobertostatic isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
360d54cfbdrobertostatic void maybe_free_socket(isc_socket_t **, int);
361d54cfbdrobertostatic void free_socket(isc_socket_t **, int);
362d54cfbdrobertostatic isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
363d54cfbdrobertostatic isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
364d54cfbdrobertostatic isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
365d54cfbdrobertostatic void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
366d54cfbdrobertostatic void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
367d54cfbdrobertostatic void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
368d54cfbdrobertostatic void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
369d54cfbdrobertostatic void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
370d54cfbdrobertostatic void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
371d54cfbdrobertostatic void queue_receive_request(isc_socket_t *sock);
372d54cfbdroberto
373d54cfbdroberto/*
374d54cfbdroberto * This is used to dump the contents of the sock structure
375d54cfbdroberto * You should make sure that the sock is locked before
376d54cfbdroberto * dumping it. Since the code uses simple printf() statements
377d54cfbdroberto * it should only be used interactively.
378d54cfbdroberto */
379d54cfbdrobertovoid
380d54cfbdrobertosock_dump(isc_socket_t *sock) {
381d54cfbdroberto	isc_socketevent_t *ldev;
382d54cfbdroberto	isc_socket_newconnev_t *ndev;
383d54cfbdroberto
384d54cfbdroberto#if 0
385d54cfbdroberto	isc_sockaddr_t addr;
386d54cfbdroberto	char socktext[256];
387d54cfbdroberto
388d54cfbdroberto	isc_socket_getpeername(sock, &addr);
389d54cfbdroberto	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
390d54cfbdroberto	printf("Remote Socket: %s\n", socktext);
391d54cfbdroberto	isc_socket_getsockname(sock, &addr);
392d54cfbdroberto	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
393d54cfbdroberto	printf("This Socket: %s\n", socktext);
394d54cfbdroberto#endif
395d54cfbdroberto
396d54cfbdroberto	printf("\n\t\tSock Dump\n");
397d54cfbdroberto	printf("\t\tfd: %u\n", sock->fd);
398d54cfbdroberto	printf("\t\treferences: %d\n", sock->references);
399d54cfbdroberto	printf("\t\tpending_accept: %d\n", sock->pending_accept);
400d54cfbdroberto	printf("\t\tconnecting: %d\n", sock->pending_connect);
401d54cfbdroberto	printf("\t\tconnected: %d\n", sock->connected);
402d54cfbdroberto	printf("\t\tbound: %d\n", sock->bound);
403d54cfbdroberto	printf("\t\tpending_iocp: %d\n", sock->pending_iocp);
404d54cfbdroberto	printf("\t\tsocket type: %d\n", sock->type);
405d54cfbdroberto
406d54cfbdroberto	printf("\n\t\tSock Recv List\n");
407d54cfbdroberto	ldev = ISC_LIST_HEAD(sock->recv_list);
408d54cfbdroberto	while (ldev != NULL) {
409d54cfbdroberto		printf("\t\tdev: %p\n", ldev);
410d54cfbdroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
411d54cfbdroberto	}
412d54cfbdroberto
413d54cfbdroberto	printf("\n\t\tSock Send List\n");
414d54cfbdroberto	ldev = ISC_LIST_HEAD(sock->send_list);
415d54cfbdroberto	while (ldev != NULL) {
416d54cfbdroberto		printf("\t\tdev: %p\n", ldev);
417d54cfbdroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
418d54cfbdroberto	}
419d54cfbdroberto
420d54cfbdroberto	printf("\n\t\tSock Accept List\n");
421d54cfbdroberto	ndev = ISC_LIST_HEAD(sock->accept_list);
422d54cfbdroberto	while (ndev != NULL) {
423d54cfbdroberto		printf("\t\tdev: %p\n", ldev);
424d54cfbdroberto		ndev = ISC_LIST_NEXT(ndev, ev_link);
425d54cfbdroberto	}
426d54cfbdroberto}
427d54cfbdroberto
428d54cfbdrobertostatic void
429d54cfbdrobertosocket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
430d54cfbdroberto	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
431d54cfbdroberto	   isc_msgcat_t *msgcat, int msgset, int message,
432d54cfbdroberto	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
433d54cfbdroberto
434d54cfbdroberto/*  This function will add an entry to the I/O completion port
435d54cfbdroberto *  that will signal the I/O thread to exit (gracefully)
436d54cfbdroberto */
437d54cfbdrobertostatic void
438d54cfbdrobertosignal_iocompletionport_exit(isc_socketmgr_t *manager) {
439d54cfbdroberto	int i;
440d54cfbdroberto	int errval;
441d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
442d54cfbdroberto
443d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
444d54cfbdroberto	for (i = 0; i < manager->maxIOCPThreads; i++) {
445d54cfbdroberto		if (!PostQueuedCompletionStatus(manager->hIoCompletionPort,
446d54cfbdroberto						0, 0, 0)) {
447d54cfbdroberto			errval = GetLastError();
448d54cfbdroberto			isc__strerror(errval, strbuf, sizeof(strbuf));
449d54cfbdroberto			FATAL_ERROR(__FILE__, __LINE__,
450d54cfbdroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
451d54cfbdroberto				ISC_MSG_FAILED,
452d54cfbdroberto				"Can't request service thread to exit: %s"),
453d54cfbdroberto				strbuf);
454d54cfbdroberto		}
455d54cfbdroberto	}
456d54cfbdroberto}
457d54cfbdroberto
458d54cfbdroberto/*
459d54cfbdroberto * Create the worker threads for the I/O Completion Port
460d54cfbdroberto */
461d54cfbdrobertovoid
462d54cfbdrobertoiocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
463d54cfbdroberto	int errval;
464d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
465d54cfbdroberto	int i;
466d54cfbdroberto
467d54cfbdroberto	INSIST(total_threads > 0);
468d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
469d54cfbdroberto	/*
470d54cfbdroberto	 * We need at least one
471d54cfbdroberto	 */
472d54cfbdroberto	for (i = 0; i < total_threads; i++) {
473d54cfbdroberto		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
474d54cfbdroberto						manager, 0,
475d54cfbdroberto						&manager->dwIOCPThreadIds[i]);
476d54cfbdroberto		if (manager->hIOCPThreads[i] == NULL) {
477d54cfbdroberto			errval = GetLastError();
478d54cfbdroberto			isc__strerror(errval, strbuf, sizeof(strbuf));
479d54cfbdroberto			FATAL_ERROR(__FILE__, __LINE__,
480d54cfbdroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
481d54cfbdroberto				ISC_MSG_FAILED,
482d54cfbdroberto				"Can't create IOCP thread: %s"),
483d54cfbdroberto				strbuf);
484d54cfbdroberto			exit(1);
485d54cfbdroberto		}
486d54cfbdroberto	}
487d54cfbdroberto}
488d54cfbdroberto
489d54cfbdroberto/*
490d54cfbdroberto *  Create/initialise the I/O completion port
491d54cfbdroberto */
492d54cfbdrobertovoid
493d54cfbdrobertoiocompletionport_init(isc_socketmgr_t *manager) {
494d54cfbdroberto	int errval;
495d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
496d54cfbdroberto
497d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
498d54cfbdroberto	/*
499d54cfbdroberto	 * Create a private heap to handle the socket overlapped structure
500d54cfbdroberto	 * The minimum number of structures is 10, there is no maximum
501d54cfbdroberto	 */
502d54cfbdroberto	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
503d54cfbdroberto	if (hHeapHandle == NULL) {
504d54cfbdroberto		errval = GetLastError();
505d54cfbdroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
506d54cfbdroberto		FATAL_ERROR(__FILE__, __LINE__,
507d54cfbdroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
508d54cfbdroberto					   ISC_MSG_FAILED,
509d54cfbdroberto					   "HeapCreate() failed during "
510d54cfbdroberto					   "initialization: %s"),
511d54cfbdroberto			    strbuf);
512d54cfbdroberto		exit(1);
513d54cfbdroberto	}
514d54cfbdroberto
515d54cfbdroberto	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);
516d54cfbdroberto
517d54cfbdroberto	/* Now Create the Completion Port */
518d54cfbdroberto	manager->hIoCompletionPort = CreateIoCompletionPort(
519d54cfbdroberto			INVALID_HANDLE_VALUE, NULL,
520d54cfbdroberto			0, manager->maxIOCPThreads);
521d54cfbdroberto	if (manager->hIoCompletionPort == NULL) {
522d54cfbdroberto		errval = GetLastError();
523d54cfbdroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
524d54cfbdroberto		FATAL_ERROR(__FILE__, __LINE__,
525d54cfbdroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
526d54cfbdroberto				ISC_MSG_FAILED,
527d54cfbdroberto				"CreateIoCompletionPort() failed "
528d54cfbdroberto				"during initialization: %s"),
529d54cfbdroberto				strbuf);
530d54cfbdroberto		exit(1);
531d54cfbdroberto	}
532d54cfbdroberto
533d54cfbdroberto	/*
534d54cfbdroberto	 * Worker threads for servicing the I/O
535d54cfbdroberto	 */
536d54cfbdroberto	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
537d54cfbdroberto}
538d54cfbdroberto
539d54cfbdroberto/*
540d54cfbdroberto * Associate a socket with an IO Completion Port.  This allows us to queue events for it
541d54cfbdroberto * and have our worker pool of threads process them.
542d54cfbdroberto */
543d54cfbdrobertovoid
544d54cfbdrobertoiocompletionport_update(isc_socket_t *sock) {
545d54cfbdroberto	HANDLE hiocp;
546d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
547d54cfbdroberto
548d54cfbdroberto	REQUIRE(VALID_SOCKET(sock));
549d54cfbdroberto
550d54cfbdroberto	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
551d54cfbdroberto		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);
552d54cfbdroberto
553d54cfbdroberto	if (hiocp == NULL) {
554d54cfbdroberto		DWORD errval = GetLastError();
555d54cfbdroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
556d54cfbdroberto		isc_log_iwrite(isc_lctx,
557d54cfbdroberto				ISC_LOGCATEGORY_GENERAL,
558d54cfbdroberto				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
559d54cfbdroberto				isc_msgcat, ISC_MSGSET_SOCKET,
560d54cfbdroberto				ISC_MSG_TOOMANYHANDLES,
561d54cfbdroberto				"iocompletionport_update: failed to open"
562d54cfbdroberto				" io completion port: %s",
563d54cfbdroberto				strbuf);
564d54cfbdroberto
565d54cfbdroberto		/* XXXMLG temporary hack to make failures detected.
566d54cfbdroberto		 * This function should return errors to the caller, not
567d54cfbdroberto		 * exit here.
568d54cfbdroberto		 */
569d54cfbdroberto		FATAL_ERROR(__FILE__, __LINE__,
570d54cfbdroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
571d54cfbdroberto				ISC_MSG_FAILED,
572d54cfbdroberto				"CreateIoCompletionPort() failed "
573d54cfbdroberto				"during initialization: %s"),
574d54cfbdroberto				strbuf);
575d54cfbdroberto		exit(1);
576d54cfbdroberto	}
577d54cfbdroberto
578d54cfbdroberto	InterlockedIncrement(&sock->manager->iocp_total);
579d54cfbdroberto}
580d54cfbdroberto
581d54cfbdroberto/*
582d54cfbdroberto * Routine to cleanup and then close the socket.
583d54cfbdroberto * Only close the socket here if it is NOT associated
584d54cfbdroberto * with an event, otherwise the WSAWaitForMultipleEvents
585d54cfbdroberto * may fail due to the fact that the Wait should not
586d54cfbdroberto * be running while closing an event or a socket.
587d54cfbdroberto * The socket is locked before calling this function
588d54cfbdroberto */
589d54cfbdrobertovoid
590d54cfbdrobertosocket_close(isc_socket_t *sock) {
591d54cfbdroberto
592d54cfbdroberto	REQUIRE(sock != NULL);
593d54cfbdroberto
594d54cfbdroberto	if (sock->fd != INVALID_SOCKET) {
595d54cfbdroberto		closesocket(sock->fd);
596d54cfbdroberto		sock->fd = INVALID_SOCKET;
597d54cfbdroberto		_set_state(sock, SOCK_CLOSED);
598d54cfbdroberto		InterlockedDecrement(&sock->manager->totalSockets);
599d54cfbdroberto	}
600d54cfbdroberto}
601d54cfbdroberto
602d54cfbdrobertostatic isc_once_t initialise_once = ISC_ONCE_INIT;
603d54cfbdrobertostatic isc_boolean_t initialised = ISC_FALSE;
604d54cfbdroberto
605d54cfbdrobertostatic void
606d54cfbdrobertoinitialise(void) {
607d54cfbdroberto	WORD wVersionRequested;
608d54cfbdroberto	WSADATA wsaData;
609d54cfbdroberto	int err;
610d54cfbdroberto	SOCKET sock;
611d54cfbdroberto	GUID GUIDConnectEx = WSAID_CONNECTEX;
612d54cfbdroberto	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
613d54cfbdroberto	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
614d54cfbdroberto	DWORD dwBytes;
615d54cfbdroberto
616d54cfbdroberto	/* Need Winsock 2.2 or better */
617d54cfbdroberto	wVersionRequested = MAKEWORD(2, 2);
618d54cfbdroberto
619d54cfbdroberto	err = WSAStartup(wVersionRequested, &wsaData);
620d54cfbdroberto	if (err != 0) {
621d54cfbdroberto		char strbuf[ISC_STRERRORSIZE];
622d54cfbdroberto		isc__strerror(err, strbuf, sizeof(strbuf));
623d54cfbdroberto		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
624d54cfbdroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
625d54cfbdroberto					   ISC_MSG_FAILED, "failed"),
626d54cfbdroberto			    strbuf);
627d54cfbdroberto		exit(1);
628d54cfbdroberto	}
629d54cfbdroberto	/*
630d54cfbdroberto	 * The following APIs do not exist as functions in a library, but we must
631d54cfbdroberto	 * ask winsock for them.  They are "extensions" -- but why they cannot be
632d54cfbdroberto	 * actual functions is beyond me.  So, ask winsock for the pointers to the
633d54cfbdroberto	 * functions we need.
634d54cfbdroberto	 */
635d54cfbdroberto	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
636d54cfbdroberto	INSIST(sock != INVALID_SOCKET);
637d54cfbdroberto	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
638d54cfbdroberto		 &GUIDConnectEx, sizeof(GUIDConnectEx),
639d54cfbdroberto		 &ISCConnectEx, sizeof(ISCConnectEx),
640d54cfbdroberto		 &dwBytes, NULL, NULL);
641d54cfbdroberto	INSIST(err == 0);
642d54cfbdroberto
643d54cfbdroberto	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
644d54cfbdroberto		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
645d54cfbdroberto		 &ISCAcceptEx, sizeof(ISCAcceptEx),
646d54cfbdroberto		 &dwBytes, NULL, NULL);
647d54cfbdroberto	INSIST(err == 0);
648d54cfbdroberto
649d54cfbdroberto	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
650d54cfbdroberto		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
651d54cfbdroberto		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
652d54cfbdroberto		 &dwBytes, NULL, NULL);
653d54cfbdroberto	INSIST(err == 0);
654d54cfbdroberto
655d54cfbdroberto	closesocket(sock);
656d54cfbdroberto
657d54cfbdroberto	initialised = ISC_TRUE;
658d54cfbdroberto}
659d54cfbdroberto
660d54cfbdroberto/*
661d54cfbdroberto * Initialize socket services
662d54cfbdroberto */
663d54cfbdrobertovoid
664d54cfbdrobertoInitSockets(void) {
665d54cfbdroberto	RUNTIME_CHECK(isc_once_do(&initialise_once,
666d54cfbdroberto				  initialise) == ISC_R_SUCCESS);
667d54cfbdroberto	if (!initialised)
668d54cfbdroberto		exit(1);
669d54cfbdroberto}
670d54cfbdroberto
671d54cfbdrobertoint
672d54cfbdrobertointernal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
673d54cfbdroberto		 struct msghdr *messagehdr, int flags, int *Error)
674d54cfbdroberto{
675d54cfbdroberto	int Result;
676d54cfbdroberto	DWORD BytesSent;
677d54cfbdroberto	DWORD Flags = flags;
678d54cfbdroberto	int total_sent;
679d54cfbdroberto
680d54cfbdroberto	*Error = 0;
681d54cfbdroberto	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
682d54cfbdroberto			   messagehdr->msg_iovlen, &BytesSent,
683d54cfbdroberto			   Flags, (SOCKADDR *)&messagehdr->to_addr,
684d54cfbdroberto			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
685d54cfbdroberto			   NULL);
686d54cfbdroberto
687d54cfbdroberto	total_sent = (int)BytesSent;
688d54cfbdroberto
689d54cfbdroberto	/* Check for errors.*/
690d54cfbdroberto	if (Result == SOCKET_ERROR) {
691d54cfbdroberto		*Error = WSAGetLastError();
692d54cfbdroberto
693d54cfbdroberto		switch (*Error) {
694d54cfbdroberto		case WSA_IO_INCOMPLETE:
695d54cfbdroberto		case WSA_WAIT_IO_COMPLETION:
696d54cfbdroberto		case WSA_IO_PENDING:
697d54cfbdroberto		case NO_ERROR:		/* Strange, but okay */
698d54cfbdroberto			sock->pending_iocp++;
699d54cfbdroberto			sock->pending_send++;
700d54cfbdroberto			break;
701d54cfbdroberto
702d54cfbdroberto		default:
703d54cfbdroberto			return (-1);
704d54cfbdroberto			break;
705d54cfbdroberto		}
706d54cfbdroberto	} else {
707d54cfbdroberto		sock->pending_iocp++;
708d54cfbdroberto		sock->pending_send++;
709d54cfbdroberto	}
710d54cfbdroberto
711d54cfbdroberto	if (lpo != NULL)
712d54cfbdroberto		return (0);
713d54cfbdroberto	else
714d54cfbdroberto		return (total_sent);
715d54cfbdroberto}
716d54cfbdroberto
717d54cfbdrobertostatic void
718d54cfbdrobertoqueue_receive_request(isc_socket_t *sock) {
719d54cfbdroberto	DWORD Flags = 0;
720d54cfbdroberto	DWORD NumBytes = 0;
721d54cfbdroberto	int total_bytes = 0;
722d54cfbdroberto	int Result;
723d54cfbdroberto	int Error;
724047f369cy	int need_retry;
725d54cfbdroberto	WSABUF iov[1];
726047f369cy	IoCompletionInfo *lpo = NULL;
727d54cfbdroberto	isc_result_t isc_result;
728d54cfbdroberto
729047f369cy retry:
730047f369cy	need_retry = ISC_FALSE;
731047f369cy
732d54cfbdroberto	/*
733d54cfbdroberto	 * If we already have a receive pending, do nothing.
734d54cfbdroberto	 */
735047f369cy	if (sock->pending_recv > 0) {
736047f369cy		if (lpo != NULL)
737047f369cy			HeapFree(hHeapHandle, 0, lpo);
738d54cfbdroberto		return;
739047f369cy	}
740d54cfbdroberto
741d54cfbdroberto	/*
742d54cfbdroberto	 * If no one is waiting, do nothing.
743d54cfbdroberto	 */
744047f369cy	if (ISC_LIST_EMPTY(sock->recv_list)) {
745047f369cy		if (lpo != NULL)
746047f369cy			HeapFree(hHeapHandle, 0, lpo);
747d54cfbdroberto		return;
748047f369cy	}
749d54cfbdroberto
750d54cfbdroberto	INSIST(sock->recvbuf.remaining == 0);
751d54cfbdroberto	INSIST(sock->fd != INVALID_SOCKET);
752d54cfbdroberto
753d54cfbdroberto	iov[0].len = sock->recvbuf.len;
754d54cfbdroberto	iov[0].buf = sock->recvbuf.base;
755d54cfbdroberto
756047f369cy	if (lpo == NULL) {
757047f369cy		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
758047f369cy						    HEAP_ZERO_MEMORY,
759047f369cy						    sizeof(IoCompletionInfo));
760047f369cy		RUNTIME_CHECK(lpo != NULL);
761047f369cy	} else
762047f369cy		ZeroMemory(lpo, sizeof(IoCompletionInfo));
763d54cfbdroberto	lpo->request_type = SOCKET_RECV;
764d54cfbdroberto
765d54cfbdroberto	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
766d54cfbdroberto
767d54cfbdroberto	Error = 0;
768d54cfbdroberto	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
769d54cfbdroberto			     &NumBytes, &Flags,
770d54cfbdroberto			     (SOCKADDR *)&sock->recvbuf.from_addr,
771d54cfbdroberto			     &sock->recvbuf.from_addr_len,
772d54cfbdroberto			     (LPWSAOVERLAPPED)lpo, NULL);
773d54cfbdroberto
774d54cfbdroberto	/* Check for errors. */
775d54cfbdroberto	if (Result == SOCKET_ERROR) {
776d54cfbdroberto		Error = WSAGetLastError();
777d54cfbdroberto
778d54cfbdroberto		switch (Error) {
779d54cfbdroberto		case WSA_IO_PENDING:
780d54cfbdroberto			sock->pending_iocp++;
781d54cfbdroberto			sock->pending_recv++;
782d54cfbdroberto			break;
783d54cfbdroberto
784047f369cy		/* direct error: no completion event */
785047f369cy		case ERROR_HOST_UNREACHABLE:
786047f369cy		case WSAENETRESET:
787047f369cy		case WSAECONNRESET:
788047f369cy			if (!sock->connected) {
789047f369cy				/* soft error */
790047f369cy				need_retry = ISC_TRUE;
791047f369cy				break;
792047f369cy			}
793047f369cy			/* FALLTHROUGH */
794047f369cy
795d54cfbdroberto		default:
796d54cfbdroberto			isc_result = isc__errno2result(Error);
797d54cfbdroberto			if (isc_result == ISC_R_UNEXPECTED)
798d54cfbdroberto				UNEXPECTED_ERROR(__FILE__, __LINE__,
799d54cfbdroberto					"WSARecvFrom: Windows error code: %d, isc result %d",
800d54cfbdroberto					Error, isc_result);
801d54cfbdroberto			send_recvdone_abort(sock, isc_result);
802047f369cy			HeapFree(hHeapHandle, 0, lpo);
803047f369cy			lpo = NULL;
804d54cfbdroberto			break;
805d54cfbdroberto		}
806d54cfbdroberto	} else {
807d54cfbdroberto		/*
808d54cfbdroberto		 * The recv() finished immediately, but we will still get
809d54cfbdroberto		 * a completion event.  Rather than duplicate code, let
810d54cfbdroberto		 * that thread handle sending the data along its way.
811d54cfbdroberto		 */
812d54cfbdroberto		sock->pending_iocp++;
813d54cfbdroberto		sock->pending_recv++;
814d54cfbdroberto	}
815d54cfbdroberto
816d54cfbdroberto	socket_log(__LINE__, sock, NULL, IOEVENT,
817d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET,
818d54cfbdroberto		   ISC_MSG_DOIORECV,
819d54cfbdroberto		   "queue_io_request: fd %d result %d error %d",
820d54cfbdroberto		   sock->fd, Result, Error);
821d54cfbdroberto
822d54cfbdroberto	CONSISTENT(sock);
823047f369cy
824047f369cy	if (need_retry)
825047f369cy		goto retry;
826d54cfbdroberto}
827d54cfbdroberto
828d54cfbdrobertostatic void
829d54cfbdrobertomanager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
830d54cfbdroberto	    isc_logmodule_t *module, int level, const char *fmt, ...)
831d54cfbdroberto{
832d54cfbdroberto	char msgbuf[2048];
833d54cfbdroberto	va_list ap;
834d54cfbdroberto
835d54cfbdroberto	if (!isc_log_wouldlog(isc_lctx, level))
836d54cfbdroberto		return;
837d54cfbdroberto
838d54cfbdroberto	va_start(ap, fmt);
839d54cfbdroberto	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
840d54cfbdroberto	va_end(ap);
841d54cfbdroberto
842d54cfbdroberto	isc_log_write(isc_lctx, category, module, level,
843d54cfbdroberto		      "sockmgr %p: %s", sockmgr, msgbuf);
844d54cfbdroberto}
845d54cfbdroberto
846d54cfbdrobertostatic void
847d54cfbdrobertosocket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
848d54cfbdroberto	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
849d54cfbdroberto	   isc_msgcat_t *msgcat, int msgset, int message,
850d54cfbdroberto	   const char *fmt, ...)
851d54cfbdroberto{
852d54cfbdroberto	char msgbuf[2048];
853d54cfbdroberto	char peerbuf[256];
854d54cfbdroberto	va_list ap;
855d54cfbdroberto
856d54cfbdroberto
857d54cfbdroberto	if (!isc_log_wouldlog(isc_lctx, level))
858d54cfbdroberto		return;
859d54cfbdroberto
860d54cfbdroberto	va_start(ap, fmt);
861d54cfbdroberto	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
862d54cfbdroberto	va_end(ap);
863d54cfbdroberto
864d54cfbdroberto	if (address == NULL) {
865d54cfbdroberto		isc_log_iwrite(isc_lctx, category, module, level,
866d54cfbdroberto			       msgcat, msgset, message,
867d54cfbdroberto			       "socket %p line %d: %s", sock, lineno, msgbuf);
868d54cfbdroberto	} else {
869d54cfbdroberto		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
870d54cfbdroberto		isc_log_iwrite(isc_lctx, category, module, level,
871d54cfbdroberto			       msgcat, msgset, message,
872d54cfbdroberto				   "socket %p line %d peer %s: %s", sock, lineno,
873d54cfbdroberto				   peerbuf, msgbuf);
874d54cfbdroberto	}
875d54cfbdroberto
876d54cfbdroberto}
877d54cfbdroberto
878d54cfbdroberto/*
879d54cfbdroberto * Make an fd SOCKET non-blocking.
880d54cfbdroberto */
881d54cfbdrobertostatic isc_result_t
882d54cfbdrobertomake_nonblock(SOCKET fd) {
883d54cfbdroberto	int ret;
884d54cfbdroberto	unsigned long flags = 1;
885d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
886d54cfbdroberto
887d54cfbdroberto	/* Set the socket to non-blocking */
888d54cfbdroberto	ret = ioctlsocket(fd, FIONBIO, &flags);
889d54cfbdroberto
890d54cfbdroberto	if (ret == -1) {
891d54cfbdroberto		isc__strerror(errno, strbuf, sizeof(strbuf));
892d54cfbdroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
893d54cfbdroberto				 "ioctlsocket(%d, FIOBIO, %d): %s",
894d54cfbdroberto				 fd, flags, strbuf);
895d54cfbdroberto
896d54cfbdroberto		return (ISC_R_UNEXPECTED);
897d54cfbdroberto	}
898d54cfbdroberto
899d54cfbdroberto	return (ISC_R_SUCCESS);
900d54cfbdroberto}
901d54cfbdroberto
902d54cfbdroberto/*
903047f369cy * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
904d54cfbdroberto * to not work correctly, returning a WSACONNRESET error when a WSASendTo
905d54cfbdroberto * fails with an "ICMP port unreachable" response and preventing the
906d54cfbdroberto * socket from using the WSARecvFrom in subsequent operations.
907d54cfbdroberto * The function below fixes this, but requires that Windows 2000
908d54cfbdroberto * Service Pack 2 or later be installed on the system.  NT 4.0
909d54cfbdroberto * systems are not affected by this and work correctly.
910d54cfbdroberto * See Microsoft Knowledge Base Article Q263823 for details of this.
911d54cfbdroberto */
912d54cfbdrobertoisc_result_t
913d54cfbdrobertoconnection_reset_fix(SOCKET fd) {
914d54cfbdroberto	DWORD dwBytesReturned = 0;
915d54cfbdroberto	BOOL  bNewBehavior = FALSE;
916d54cfbdroberto	DWORD status;
917d54cfbdroberto
918d54cfbdroberto	if (isc_win32os_majorversion() < 5)
919d54cfbdroberto		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */
920d54cfbdroberto
921d54cfbdroberto	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
922d54cfbdroberto	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
923d54cfbdroberto			  sizeof(bNewBehavior), NULL, 0,
924d54cfbdroberto			  &dwBytesReturned, NULL, NULL);
925d54cfbdroberto	if (status != SOCKET_ERROR)
926d54cfbdroberto		return (ISC_R_SUCCESS);
927d54cfbdroberto	else {
928d54cfbdroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
929d54cfbdroberto				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
930d54cfbdroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
931d54cfbdroberto						ISC_MSG_FAILED, "failed"));
932d54cfbdroberto		return (ISC_R_UNEXPECTED);
933d54cfbdroberto	}
934d54cfbdroberto}
935d54cfbdroberto
936d54cfbdroberto/*
937d54cfbdroberto * Construct an iov array and attach it to the msghdr passed in.  This is
938d54cfbdroberto * the SEND constructor, which will use the used region of the buffer
939d54cfbdroberto * (if using a buffer list) or will use the internal region (if a single
940d54cfbdroberto * buffer I/O is requested).
941d54cfbdroberto *
942d54cfbdroberto * Nothing can be NULL, and the done event must list at least one buffer
943d54cfbdroberto * on the buffer linked list for this function to be meaningful.
944d54cfbdroberto */
945d54cfbdrobertostatic void
946d54cfbdrobertobuild_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
947d54cfbdroberto		  struct msghdr *msg, char *cmsg, WSABUF *iov,
948d54cfbdroberto		  IoCompletionInfo  *lpo)
949d54cfbdroberto{
950d54cfbdroberto	unsigned int iovcount;
951d54cfbdroberto	isc_buffer_t *buffer;
952d54cfbdroberto	buflist_t  *cpbuffer;
953d54cfbdroberto	isc_region_t used;
954d54cfbdroberto	size_t write_count;
955d54cfbdroberto	size_t skip_count;
956d54cfbdroberto
957d54cfbdroberto	memset(msg, 0, sizeof(*msg));
958d54cfbdroberto
959d54cfbdroberto	memcpy(&msg->to_addr, &dev->address.type, dev->address.length);
960d54cfbdroberto	msg->to_addr_len = dev->address.length;
961d54cfbdroberto
962d54cfbdroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
963d54cfbdroberto	write_count = 0;
964d54cfbdroberto	iovcount = 0;
965d54cfbdroberto
966d54cfbdroberto	/*
967d54cfbdroberto	 * Single buffer I/O?  Skip what we've done so far in this region.
968d54cfbdroberto	 */
969d54cfbdroberto	if (buffer == NULL) {
970d54cfbdroberto		write_count = dev->region.length - dev->n;
971d54cfbdroberto		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
972d54cfbdroberto		RUNTIME_CHECK(cpbuffer != NULL);
973d54cfbdroberto		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
974d54cfbdroberto		RUNTIME_CHECK(cpbuffer->buf != NULL);
975d54cfbdroberto
976d54cfbdroberto		socket_log(__LINE__, sock, NULL, TRACE,
977d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
978d54cfbdroberto		   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
979d54cfbdroberto		   cpbuffer->buf, write_count);
980d54cfbdroberto
981d54cfbdroberto		memcpy(cpbuffer->buf,(dev->region.base + dev->n), write_count);
982d54cfbdroberto		cpbuffer->buflen = write_count;
983d54cfbdroberto		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
984d54cfbdroberto		iov[0].buf = cpbuffer->buf;
985d54cfbdroberto		iov[0].len = write_count;
986d54cfbdroberto		iovcount = 1;
987d54cfbdroberto
988d54cfbdroberto		goto config;
989d54cfbdroberto	}
990d54cfbdroberto
991d54cfbdroberto	/*
992d54cfbdroberto	 * Multibuffer I/O.
993d54cfbdroberto	 * Skip the data in the buffer list that we have already written.
994d54cfbdroberto	 */
995d54cfbdroberto	skip_count = dev->n;
996d54cfbdroberto	while (buffer != NULL) {
997d54cfbdroberto		REQUIRE(ISC_BUFFER_VALID(buffer));
998d54cfbdroberto		if (skip_count < isc_buffer_usedlength(buffer))
999d54cfbdroberto			break;
1000d54cfbdroberto		skip_count -= isc_buffer_usedlength(buffer);
1001d54cfbdroberto		buffer = ISC_LIST_NEXT(buffer, link);
1002d54cfbdroberto	}
1003d54cfbdroberto
1004d54cfbdroberto	while (buffer != NULL) {
1005d54cfbdroberto		INSIST(iovcount < MAXSCATTERGATHER_SEND);
1006d54cfbdroberto
1007d54cfbdroberto		isc_buffer_usedregion(buffer, &used);
1008d54cfbdroberto
1009d54cfbdroberto		if (used.length > 0) {
1010d54cfbdroberto			int uselen = used.length - skip_count;
1011d54cfbdroberto			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
1012d54cfbdroberto			RUNTIME_CHECK(cpbuffer != NULL);
1013d54cfbdroberto			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
1014d54cfbdroberto			RUNTIME_CHECK(cpbuffer->buf != NULL);
1015d54cfbdroberto
1016d54cfbdroberto			socket_log(__LINE__, sock, NULL, TRACE,
1017d54cfbdroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
1018d54cfbdroberto			   "alloc_buffer %p %d %p %d", cpbuffer, sizeof(buflist_t),
1019d54cfbdroberto			   cpbuffer->buf, write_count);
1020d54cfbdroberto
1021d54cfbdroberto			memcpy(cpbuffer->buf,(used.base + skip_count), uselen);
1022d54cfbdroberto			cpbuffer->buflen = uselen;
1023d54cfbdroberto			iov[iovcount].buf = cpbuffer->buf;
1024d54cfbdroberto			iov[iovcount].len = used.length - skip_count;
1025d54cfbdroberto			write_count += uselen;
1026d54cfbdroberto			skip_count = 0;
1027d54cfbdroberto			iovcount++;
1028d54cfbdroberto		}
1029d54cfbdroberto		buffer = ISC_LIST_NEXT(buffer, link);
1030d54cfbdroberto	}
1031d54cfbdroberto
1032d54cfbdroberto	INSIST(skip_count == 0);
1033d54cfbdroberto
1034d54cfbdroberto config:
1035d54cfbdroberto	msg->msg_iov = iov;
1036d54cfbdroberto	msg->msg_iovlen = iovcount;
1037d54cfbdroberto	msg->msg_totallen = write_count;
1038d54cfbdroberto}
1039d54cfbdroberto
1040d54cfbdrobertostatic void
1041d54cfbdrobertoset_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1042d54cfbdroberto		isc_socketevent_t *dev)
1043d54cfbdroberto{
1044d54cfbdroberto	if (sock->type == isc_sockettype_udp) {
1045d54cfbdroberto		if (address != NULL)
1046d54cfbdroberto			dev->address = *address;
1047d54cfbdroberto		else
1048d54cfbdroberto			dev->address = sock->address;
1049d54cfbdroberto	} else if (sock->type == isc_sockettype_tcp) {
1050d54cfbdroberto		INSIST(address == NULL);
1051d54cfbdroberto		dev->address = sock->address;
1052d54cfbdroberto	}
1053d54cfbdroberto}
1054d54cfbdroberto
1055d54cfbdrobertostatic void
1056d54cfbdrobertodestroy_socketevent(isc_event_t *event) {
1057d54cfbdroberto	isc_socketevent_t *ev = (isc_socketevent_t *)event;
1058d54cfbdroberto
1059d54cfbdroberto	INSIST(ISC_LIST_EMPTY(ev->bufferlist));
1060d54cfbdroberto
1061d54cfbdroberto	(ev->destroy)(event);
1062d54cfbdroberto}
1063d54cfbdroberto
1064d54cfbdrobertostatic isc_socketevent_t *
1065d54cfbdrobertoallocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1066d54cfbdroberto		     isc_taskaction_t action, const void *arg)
1067d54cfbdroberto{
1068d54cfbdroberto	isc_socketevent_t *ev;
1069d54cfbdroberto
1070d54cfbdroberto	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
1071d54cfbdroberto						     sock, eventtype,
1072d54cfbdroberto						     action, arg,
1073d54cfbdroberto						     sizeof(*ev));
1074d54cfbdroberto	if (ev == NULL)
1075d54cfbdroberto		return (NULL);
1076d54cfbdroberto
1077d54cfbdroberto	ev->result = ISC_R_IOERROR; // XXXMLG temporary change to detect failure to set
1078d54cfbdroberto	ISC_LINK_INIT(ev, ev_link);
1079d54cfbdroberto	ISC_LIST_INIT(ev->bufferlist);
1080d54cfbdroberto	ev->region.base = NULL;
1081d54cfbdroberto	ev->n = 0;
1082d54cfbdroberto	ev->offset = 0;
1083d54cfbdroberto	ev->attributes = 0;
1084d54cfbdroberto	ev->destroy = ev->ev_destroy;
1085d54cfbdroberto	ev->ev_destroy = destroy_socketevent;
1086d54cfbdroberto
1087d54cfbdroberto	return (ev);
1088d54cfbdroberto}
1089d54cfbdroberto
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debug helper: print the msghdr fields and every iov entry to stdout.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int i = 0;

	printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd);
	printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);

	while (i < (unsigned int)msg->msg_iovlen) {
		printf("\t\t%d\tbase %p, len %d\n", i,
		       msg->msg_iov[i].buf,
		       msg->msg_iov[i].len);
		i++;
	}
}
#endif
1104d54cfbdroberto
1105d54cfbdroberto/*
1106d54cfbdroberto * map the error code
1107d54cfbdroberto */
1108d54cfbdrobertoint
1109d54cfbdrobertomap_socket_error(isc_socket_t *sock, int windows_errno, int *isc_errno,
1110d54cfbdroberto		 char *errorstring, size_t bufsize) {
1111d54cfbdroberto
1112d54cfbdroberto	int doreturn;
1113d54cfbdroberto	switch (windows_errno) {
1114d54cfbdroberto	case WSAECONNREFUSED:
1115d54cfbdroberto		*isc_errno = ISC_R_CONNREFUSED;
1116d54cfbdroberto		if (sock->connected)
1117d54cfbdroberto			doreturn = DOIO_HARD;
1118d54cfbdroberto		else
1119d54cfbdroberto			doreturn = DOIO_SOFT;
1120d54cfbdroberto		break;
1121d54cfbdroberto	case WSAENETUNREACH:
1122d54cfbdroberto	case ERROR_NETWORK_UNREACHABLE:
1123d54cfbdroberto		*isc_errno = ISC_R_NETUNREACH;
1124d54cfbdroberto		if (sock->connected)
1125d54cfbdroberto			doreturn = DOIO_HARD;
1126d54cfbdroberto		else
1127d54cfbdroberto			doreturn = DOIO_SOFT;
1128d54cfbdroberto		break;
1129d54cfbdroberto	case ERROR_PORT_UNREACHABLE:
1130d54cfbdroberto	case ERROR_HOST_UNREACHABLE:
1131d54cfbdroberto	case WSAEHOSTUNREACH:
1132d54cfbdroberto		*isc_errno = ISC_R_HOSTUNREACH;
1133d54cfbdroberto		if (sock->connected)
1134d54cfbdroberto			doreturn = DOIO_HARD;
1135d54cfbdroberto		else
1136d54cfbdroberto			doreturn = DOIO_SOFT;
1137d54cfbdroberto		break;
1138d54cfbdroberto	case WSAENETDOWN:
1139d54cfbdroberto		*isc_errno = ISC_R_NETDOWN;
1140d54cfbdroberto		if (sock->connected)
1141d54cfbdroberto			doreturn = DOIO_HARD;
1142d54cfbdroberto		else
1143d54cfbdroberto			doreturn = DOIO_SOFT;
1144d54cfbdroberto		break;
1145d54cfbdroberto	case WSAEHOSTDOWN:
1146d54cfbdroberto		*isc_errno = ISC_R_HOSTDOWN;
1147d54cfbdroberto		if (sock->connected)
1148d54cfbdroberto			doreturn = DOIO_HARD;
1149d54cfbdroberto		else
1150d54cfbdroberto			doreturn = DOIO_SOFT;
1151d54cfbdroberto		break;
1152d54cfbdroberto	case WSAEACCES:
1153d54cfbdroberto		*isc_errno = ISC_R_NOPERM;
1154d54cfbdroberto		if (sock->connected)
1155d54cfbdroberto			doreturn = DOIO_HARD;
1156d54cfbdroberto		else
1157d54cfbdroberto			doreturn = DOIO_SOFT;
1158d54cfbdroberto		break;
1159d54cfbdroberto	case WSAECONNRESET:
1160d54cfbdroberto	case WSAENETRESET:
1161d54cfbdroberto	case WSAECONNABORTED:
1162d54cfbdroberto	case WSAEDISCON:
1163d54cfbdroberto		*isc_errno = ISC_R_CONNECTIONRESET;
1164d54cfbdroberto		if (sock->connected)
1165d54cfbdroberto			doreturn = DOIO_HARD;
1166d54cfbdroberto		else
1167d54cfbdroberto			doreturn = DOIO_SOFT;
1168d54cfbdroberto		break;
1169d54cfbdroberto	case WSAENOTCONN:
1170d54cfbdroberto		*isc_errno = ISC_R_NOTCONNECTED;
1171d54cfbdroberto		if (sock->connected)
1172d54cfbdroberto			doreturn = DOIO_HARD;
1173d54cfbdroberto		else
1174d54cfbdroberto			doreturn = DOIO_SOFT;
1175d54cfbdroberto		break;
1176d54cfbdroberto	case ERROR_OPERATION_ABORTED:
1177d54cfbdroberto	case ERROR_CONNECTION_ABORTED:
1178d54cfbdroberto	case ERROR_REQUEST_ABORTED:
1179d54cfbdroberto		*isc_errno = ISC_R_CONNECTIONRESET;
1180d54cfbdroberto		doreturn = DOIO_HARD;
1181d54cfbdroberto		break;
1182d54cfbdroberto	case WSAENOBUFS:
1183d54cfbdroberto		*isc_errno = ISC_R_NORESOURCES;
1184d54cfbdroberto		doreturn = DOIO_HARD;
1185d54cfbdroberto		break;
1186d54cfbdroberto	case WSAEAFNOSUPPORT:
1187d54cfbdroberto		*isc_errno = ISC_R_FAMILYNOSUPPORT;
1188d54cfbdroberto		doreturn = DOIO_HARD;
1189d54cfbdroberto		break;
1190d54cfbdroberto	case WSAEADDRNOTAVAIL:
1191d54cfbdroberto		*isc_errno = ISC_R_ADDRNOTAVAIL;
1192d54cfbdroberto		doreturn = DOIO_HARD;
1193d54cfbdroberto		break;
1194d54cfbdroberto	case WSAEDESTADDRREQ:
1195d54cfbdroberto		*isc_errno = ISC_R_BADADDRESSFORM;
1196d54cfbdroberto		doreturn = DOIO_HARD;
1197d54cfbdroberto		break;
1198d54cfbdroberto	case ERROR_NETNAME_DELETED:
1199d54cfbdroberto		*isc_errno = ISC_R_NETDOWN;
1200d54cfbdroberto		doreturn = DOIO_HARD;
1201d54cfbdroberto		break;
1202d54cfbdroberto	default:
1203d54cfbdroberto		*isc_errno = ISC_R_IOERROR;
1204d54cfbdroberto		doreturn = DOIO_HARD;
1205d54cfbdroberto		break;
1206d54cfbdroberto	}
1207d54cfbdroberto	if (doreturn == DOIO_HARD) {
1208d54cfbdroberto		isc__strerror(windows_errno, errorstring, bufsize);
1209d54cfbdroberto	}
1210d54cfbdroberto	return (doreturn);
1211d54cfbdroberto}
1212d54cfbdroberto
1213d54cfbdrobertostatic void
1214d54cfbdrobertofill_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
1215d54cfbdroberto	isc_region_t r;
1216d54cfbdroberto	int copylen;
1217d54cfbdroberto	isc_buffer_t *buffer;
1218d54cfbdroberto
1219d54cfbdroberto	INSIST(dev->n < dev->minimum);
1220d54cfbdroberto	INSIST(sock->recvbuf.remaining > 0);
1221d54cfbdroberto	INSIST(sock->pending_recv == 0);
1222d54cfbdroberto
1223d54cfbdroberto	if (sock->type == isc_sockettype_udp) {
1224d54cfbdroberto		dev->address.length = sock->recvbuf.from_addr_len;
1225d54cfbdroberto		memcpy(&dev->address.type, &sock->recvbuf.from_addr,
1226d54cfbdroberto		    sock->recvbuf.from_addr_len);
1227d54cfbdroberto		if (isc_sockaddr_getport(&dev->address) == 0) {
1228d54cfbdroberto			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1229d54cfbdroberto				socket_log(__LINE__, sock, &dev->address, IOEVENT,
1230d54cfbdroberto					   isc_msgcat, ISC_MSGSET_SOCKET,
1231d54cfbdroberto					   ISC_MSG_ZEROPORT,
1232d54cfbdroberto					   "dropping source port zero packet");
1233d54cfbdroberto			}
1234d54cfbdroberto			sock->recvbuf.remaining = 0;
1235d54cfbdroberto			return;
1236d54cfbdroberto		}
1237d54cfbdroberto	} else if (sock->type == isc_sockettype_tcp) {
1238d54cfbdroberto		dev->address = sock->address;
1239d54cfbdroberto	}
1240d54cfbdroberto
1241d54cfbdroberto	/*
1242d54cfbdroberto	 * Run through the list of buffers we were given, and find the
1243d54cfbdroberto	 * first one with space.  Once it is found, loop through, filling
1244d54cfbdroberto	 * the buffers as much as possible.
1245d54cfbdroberto	 */
1246d54cfbdroberto	buffer = ISC_LIST_HEAD(dev->bufferlist);
1247d54cfbdroberto	if (buffer != NULL) { // Multi-buffer receive
1248d54cfbdroberto		while (buffer != NULL && sock->recvbuf.remaining > 0) {
1249d54cfbdroberto			REQUIRE(ISC_BUFFER_VALID(buffer));
1250d54cfbdroberto			if (isc_buffer_availablelength(buffer) > 0) {
1251d54cfbdroberto				isc_buffer_availableregion(buffer, &r);
1252d54cfbdroberto				copylen = min(r.length, sock->recvbuf.remaining);
1253d54cfbdroberto				memcpy(r.base, sock->recvbuf.consume_position, copylen);
1254d54cfbdroberto				sock->recvbuf.consume_position += copylen;
1255d54cfbdroberto				sock->recvbuf.remaining -= copylen;
1256d54cfbdroberto				isc_buffer_add(buffer, copylen);
1257d54cfbdroberto				dev->n += copylen;
1258d54cfbdroberto			}
1259d54cfbdroberto			buffer = ISC_LIST_NEXT(buffer, link);
1260d54cfbdroberto		}
1261d54cfbdroberto	} else { // Single-buffer receive
1262d54cfbdroberto		copylen = min(dev->region.length - dev->n, sock->recvbuf.remaining);
1263d54cfbdroberto		memcpy(dev->region.base + dev->n, sock->recvbuf.consume_position, copylen);
1264d54cfbdroberto		sock->recvbuf.consume_position += copylen;
1265d54cfbdroberto		sock->recvbuf.remaining -= copylen;
1266d54cfbdroberto		dev->n += copylen;
1267d54cfbdroberto	}
1268d54cfbdroberto
1269d54cfbdroberto	/*
1270d54cfbdroberto	 * UDP receives are all-consuming.  That is, if we have 4k worth of
1271d54cfbdroberto	 * data in our receive buffer, and the caller only gave us
1272d54cfbdroberto	 * 1k of space, we will toss the remaining 3k of data.  TCP
1273d54cfbdroberto	 * will keep the extra data around and use it for later requests.
1274d54cfbdroberto	 */
1275d54cfbdroberto	if (sock->type == isc_sockettype_udp)
1276d54cfbdroberto		sock->recvbuf.remaining = 0;
1277d54cfbdroberto}
1278d54cfbdroberto
1279d54cfbdroberto/*
1280d54cfbdroberto * Copy out as much data from the internal buffer to done events.
1281d54cfbdroberto * As each done event is filled, send it along its way.
1282d54cfbdroberto */
1283d54cfbdrobertostatic void
1284d54cfbdrobertocompleteio_recv(isc_socket_t *sock)
1285d54cfbdroberto{
1286d54cfbdroberto	isc_socketevent_t *dev;
1287d54cfbdroberto
1288d54cfbdroberto	/*
1289d54cfbdroberto	 * If we are in the process of filling our buffer, we cannot
1290d54cfbdroberto	 * touch it yet, so don't.
1291d54cfbdroberto	 */
1292d54cfbdroberto	if (sock->pending_recv > 0)
1293d54cfbdroberto		return;
1294d54cfbdroberto
1295d54cfbdroberto	while (sock->recvbuf.remaining > 0 && !ISC_LIST_EMPTY(sock->recv_list)) {
1296d54cfbdroberto		dev = ISC_LIST_HEAD(sock->recv_list);
1297d54cfbdroberto
1298d54cfbdroberto		/*
1299d54cfbdroberto		 * See if we have sufficient data in our receive buffer
1300d54cfbdroberto		 * to handle this.  If we do, copy out the data.
1301d54cfbdroberto		 */
1302d54cfbdroberto		fill_recv(sock, dev);
1303d54cfbdroberto
1304d54cfbdroberto		/*
1305d54cfbdroberto		 * Did we satisfy it?
1306d54cfbdroberto		 */
1307d54cfbdroberto		if (dev->n >= dev->minimum) {
1308d54cfbdroberto			dev->result = ISC_R_SUCCESS;
1309d54cfbdroberto			send_recvdone_event(sock, &dev);
1310d54cfbdroberto		}
1311d54cfbdroberto	}
1312d54cfbdroberto}
1313d54cfbdroberto
1314d54cfbdroberto/*
1315d54cfbdroberto * Returns:
1316d54cfbdroberto *	DOIO_SUCCESS	The operation succeeded.  dev->result contains
1317d54cfbdroberto *			ISC_R_SUCCESS.
1318d54cfbdroberto *
1319d54cfbdroberto *	DOIO_HARD	A hard or unexpected I/O error was encountered.
1320d54cfbdroberto *			dev->result contains the appropriate error.
1321d54cfbdroberto *
1322d54cfbdroberto *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
1323d54cfbdroberto *			event was sent.  The operation should be retried.
1324d54cfbdroberto *
1325d54cfbdroberto *	No other return values are possible.
1326d54cfbdroberto */
1327d54cfbdrobertostatic int
1328d54cfbdrobertocompleteio_send(isc_socket_t *sock, isc_socketevent_t *dev,
1329d54cfbdroberto		struct msghdr *messagehdr, int cc, int send_errno)
1330d54cfbdroberto{
1331d54cfbdroberto	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
1332d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
1333d54cfbdroberto
1334d54cfbdroberto	if (send_errno != 0) {
1335d54cfbdroberto		if (SOFT_ERROR(send_errno))
1336d54cfbdroberto			return (DOIO_SOFT);
1337d54cfbdroberto
1338d54cfbdroberto		return (map_socket_error(sock, send_errno, &dev->result,
1339d54cfbdroberto			strbuf, sizeof(strbuf)));
1340d54cfbdroberto
1341d54cfbdroberto		/*
1342d54cfbdroberto		 * The other error types depend on whether or not the
1343d54cfbdroberto		 * socket is UDP or TCP.  If it is UDP, some errors
1344d54cfbdroberto		 * that we expect to be fatal under TCP are merely
1345d54cfbdroberto		 * annoying, and are really soft errors.
1346d54cfbdroberto		 *
1347d54cfbdroberto		 * However, these soft errors are still returned as
1348d54cfbdroberto		 * a status.
1349d54cfbdroberto		 */
1350d54cfbdroberto		isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
1351d54cfbdroberto		isc__strerror(send_errno, strbuf, sizeof(strbuf));
1352d54cfbdroberto		UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s",
1353d54cfbdroberto				 addrbuf, strbuf);
1354d54cfbdroberto		dev->result = isc__errno2result(send_errno);
1355047f369cy		return (DOIO_HARD);
1356d54cfbdroberto	}
1357d54cfbdroberto
1358d54cfbdroberto	/*
1359d54cfbdroberto	 * If we write less than we expected, update counters, poke.
1360d54cfbdroberto	 */
1361d54cfbdroberto	dev->n += cc;
1362d54cfbdroberto	if (cc != messagehdr->msg_totallen)
1363d54cfbdroberto		return (DOIO_SOFT);
1364d54cfbdroberto
1365d54cfbdroberto	/*
1366d54cfbdroberto	 * Exactly what we wanted to write.  We're done with this
1367d54cfbdroberto	 * entry.  Post its completion event.
1368d54cfbdroberto	 */
1369d54cfbdroberto	dev->result = ISC_R_SUCCESS;
1370d54cfbdroberto	return (DOIO_SUCCESS);
1371d54cfbdroberto}
1372d54cfbdroberto
1373d54cfbdrobertostatic int
1374d54cfbdrobertostartio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes,
1375d54cfbdroberto	     int *send_errno)
1376d54cfbdroberto{
1377d54cfbdroberto	char *cmsg = NULL;
1378d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
1379d54cfbdroberto	IoCompletionInfo *lpo;
1380d54cfbdroberto	int status;
1381d54cfbdroberto	struct msghdr *msghdr;
1382d54cfbdroberto
1383d54cfbdroberto	lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
1384d54cfbdroberto					    HEAP_ZERO_MEMORY,
1385d54cfbdroberto					    sizeof(IoCompletionInfo));
1386d54cfbdroberto	RUNTIME_CHECK(lpo != NULL);
1387d54cfbdroberto	lpo->request_type = SOCKET_SEND;
1388d54cfbdroberto	lpo->dev = dev;
1389d54cfbdroberto	msghdr = &lpo->messagehdr;
1390d54cfbdroberto	memset(msghdr, 0, sizeof(struct msghdr));
1391d54cfbdroberto	ISC_LIST_INIT(lpo->bufferlist);
1392d54cfbdroberto
1393d54cfbdroberto	build_msghdr_send(sock, dev, msghdr, cmsg, sock->iov, lpo);
1394d54cfbdroberto
1395d54cfbdroberto	*nbytes = internal_sendmsg(sock, lpo, msghdr, 0, send_errno);
1396d54cfbdroberto
1397d54cfbdroberto	if (*nbytes < 0) {
1398d54cfbdroberto		/*
1399d54cfbdroberto		 * I/O has been initiated
1400d54cfbdroberto		 * completion will be through the completion port
1401d54cfbdroberto		 */
1402d54cfbdroberto		if (PENDING_ERROR(*send_errno)) {
1403d54cfbdroberto			status = DOIO_PENDING;
1404d54cfbdroberto			goto done;
1405d54cfbdroberto		}
1406d54cfbdroberto
1407d54cfbdroberto		if (SOFT_ERROR(*send_errno)) {
1408d54cfbdroberto			status = DOIO_SOFT;
1409d54cfbdroberto			goto done;
1410d54cfbdroberto		}
1411d54cfbdroberto
1412d54cfbdroberto		/*
1413d54cfbdroberto		 * If we got this far then something is wrong
1414d54cfbdroberto		 */
1415d54cfbdroberto		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1416d54cfbdroberto			isc__strerror(*send_errno, strbuf, sizeof(strbuf));
1417d54cfbdroberto			socket_log(__LINE__, sock, NULL, IOEVENT,
1418d54cfbdroberto				   isc_msgcat, ISC_MSGSET_SOCKET,
1419d54cfbdroberto				   ISC_MSG_INTERNALSEND,
1420d54cfbdroberto				   "startio_send: internal_sendmsg(%d) %d "
1421d54cfbdroberto				   "bytes, err %d/%s",
1422d54cfbdroberto				   sock->fd, *nbytes, *send_errno, strbuf);
1423d54cfbdroberto		}
1424047f369cy		status = DOIO_HARD;
1425d54cfbdroberto		goto done;
1426d54cfbdroberto	}
1427d54cfbdroberto	dev->result = ISC_R_SUCCESS;
1428d54cfbdroberto	status = DOIO_SOFT;
1429d54cfbdroberto done:
1430d54cfbdroberto	_set_state(sock, SOCK_DATA);
1431d54cfbdroberto	return (status);
1432d54cfbdroberto}
1433d54cfbdroberto
1434d54cfbdrobertostatic isc_result_t
1435d54cfbdrobertoallocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
1436d54cfbdroberto		isc_socket_t **socketp) {
1437d54cfbdroberto	isc_socket_t *sock;
1438d54cfbdroberto	isc_result_t result;
1439d54cfbdroberto
1440d54cfbdroberto	sock = isc_mem_get(manager->mctx, sizeof(*sock));
1441d54cfbdroberto
1442d54cfbdroberto	if (sock == NULL)
1443d54cfbdroberto		return (ISC_R_NOMEMORY);
1444d54cfbdroberto
1445d54cfbdroberto	sock->magic = 0;
1446d54cfbdroberto	sock->references = 0;
1447d54cfbdroberto
1448d54cfbdroberto	sock->manager = manager;
1449d54cfbdroberto	sock->type = type;
1450d54cfbdroberto	sock->fd = INVALID_SOCKET;
1451d54cfbdroberto
1452d54cfbdroberto	ISC_LINK_INIT(sock, link);
1453d54cfbdroberto
1454d54cfbdroberto	/*
1455d54cfbdroberto	 * set up list of readers and writers to be initially empty
1456d54cfbdroberto	 */
1457d54cfbdroberto	ISC_LIST_INIT(sock->recv_list);
1458d54cfbdroberto	ISC_LIST_INIT(sock->send_list);
1459d54cfbdroberto	ISC_LIST_INIT(sock->accept_list);
1460d54cfbdroberto	sock->connect_ev = NULL;
1461d54cfbdroberto	sock->pending_accept = 0;
1462d54cfbdroberto	sock->pending_recv = 0;
1463d54cfbdroberto	sock->pending_send = 0;
1464d54cfbdroberto	sock->pending_iocp = 0;
1465d54cfbdroberto	sock->listener = 0;
1466d54cfbdroberto	sock->connected = 0;
1467d54cfbdroberto	sock->pending_connect = 0;
1468d54cfbdroberto	sock->bound = 0;
1469047f369cy	sock->dupped = 0;
1470d54cfbdroberto	memset(sock->name, 0, sizeof(sock->name));	// zero the name field
1471d54cfbdroberto	_set_state(sock, SOCK_INITIALIZED);
1472d54cfbdroberto
1473d54cfbdroberto	sock->recvbuf.len = 65536;
1474d54cfbdroberto	sock->recvbuf.consume_position = sock->recvbuf.base;
1475d54cfbdroberto	sock->recvbuf.remaining = 0;
1476d54cfbdroberto	sock->recvbuf.base = isc_mem_get(manager->mctx, sock->recvbuf.len); // max buffer size
1477d54cfbdroberto	if (sock->recvbuf.base == NULL) {
1478d54cfbdroberto		sock->magic = 0;
1479d54cfbdroberto		goto error;
1480d54cfbdroberto	}
1481d54cfbdroberto
1482d54cfbdroberto	/*
1483d54cfbdroberto	 * initialize the lock
1484d54cfbdroberto	 */
1485d54cfbdroberto	result = isc_mutex_init(&sock->lock);
1486d54cfbdroberto	if (result != ISC_R_SUCCESS) {
1487d54cfbdroberto		sock->magic = 0;
1488d54cfbdroberto		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1489d54cfbdroberto		sock->recvbuf.base = NULL;
1490d54cfbdroberto		goto error;
1491d54cfbdroberto	}
1492d54cfbdroberto
1493d54cfbdroberto	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1494d54cfbdroberto		   "allocated");
1495d54cfbdroberto
1496d54cfbdroberto	sock->magic = SOCKET_MAGIC;
1497d54cfbdroberto	*socketp = sock;
1498d54cfbdroberto
1499d54cfbdroberto	return (ISC_R_SUCCESS);
1500d54cfbdroberto
1501d54cfbdroberto error:
1502d54cfbdroberto	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1503d54cfbdroberto
1504d54cfbdroberto	return (result);
1505d54cfbdroberto}
1506d54cfbdroberto
1507d54cfbdroberto/*
1508d54cfbdroberto * Verify that the socket state is consistent.
1509d54cfbdroberto */
1510d54cfbdrobertostatic void
1511d54cfbdrobertoconsistent(isc_socket_t *sock) {
1512d54cfbdroberto
1513d54cfbdroberto	isc_socketevent_t *dev;
1514d54cfbdroberto	isc_socket_newconnev_t *nev;
1515d54cfbdroberto	unsigned int count;
1516d54cfbdroberto	char *crash_reason;
1517d54cfbdroberto	isc_boolean_t crash = ISC_FALSE;
1518d54cfbdroberto
1519d54cfbdroberto	REQUIRE(sock->pending_iocp == sock->pending_recv + sock->pending_send
1520d54cfbdroberto		+ sock->pending_accept + sock->pending_connect);
1521d54cfbdroberto
1522d54cfbdroberto	dev = ISC_LIST_HEAD(sock->send_list);
1523d54cfbdroberto	count = 0;
1524d54cfbdroberto	while (dev != NULL) {
1525d54cfbdroberto		count++;
1526d54cfbdroberto		dev = ISC_LIST_NEXT(dev, ev_link);
1527d54cfbdroberto	}
1528d54cfbdroberto	if (count > sock->pending_send) {
1529d54cfbdroberto		crash = ISC_TRUE;
1530d54cfbdroberto		crash_reason = "send_list > sock->pending_send";
1531d54cfbdroberto	}
1532d54cfbdroberto
1533d54cfbdroberto	nev = ISC_LIST_HEAD(sock->accept_list);
1534d54cfbdroberto	count = 0;
1535d54cfbdroberto	while (nev != NULL) {
1536d54cfbdroberto		count++;
1537d54cfbdroberto		nev = ISC_LIST_NEXT(nev, ev_link);
1538d54cfbdroberto	}
1539d54cfbdroberto	if (count > sock->pending_accept) {
1540d54cfbdroberto		crash = ISC_TRUE;
1541d54cfbdroberto		crash_reason = "send_list > sock->pending_send";
1542d54cfbdroberto	}
1543d54cfbdroberto
1544d54cfbdroberto	if (crash) {
1545d54cfbdroberto		socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1546d54cfbdroberto			   ISC_MSG_DESTROYING, "SOCKET INCONSISTENT: %s",
1547d54cfbdroberto			   crash_reason);
1548d54cfbdroberto		sock_dump(sock);
1549d54cfbdroberto		INSIST(crash == ISC_FALSE);
1550d54cfbdroberto	}
1551d54cfbdroberto}
1552d54cfbdroberto
1553d54cfbdroberto/*
1554d54cfbdroberto * Maybe free the socket.
1555d54cfbdroberto *
1556d54cfbdroberto * This function will verify tht the socket is no longer in use in any way,
1557d54cfbdroberto * either internally or externally.  This is the only place where this
1558d54cfbdroberto * check is to be made; if some bit of code believes that IT is done with
1559d54cfbdroberto * the socket (e.g., some reference counter reaches zero), it should call
1560d54cfbdroberto * this function.
1561d54cfbdroberto *
1562d54cfbdroberto * When calling this function, the socket must be locked, and the manager
1563d54cfbdroberto * must be unlocked.
1564d54cfbdroberto *
1565d54cfbdroberto * When this function returns, *socketp will be NULL.  No tricks to try
1566d54cfbdroberto * to hold on to this pointer are allowed.
1567d54cfbdroberto */
1568d54cfbdrobertostatic void
1569d54cfbdrobertomaybe_free_socket(isc_socket_t **socketp, int lineno) {
1570d54cfbdroberto	isc_socket_t *sock = *socketp;
1571d54cfbdroberto	*socketp = NULL;
1572d54cfbdroberto
1573d54cfbdroberto	INSIST(VALID_SOCKET(sock));
1574d54cfbdroberto	CONSISTENT(sock);
1575d54cfbdroberto
1576d54cfbdroberto	if (sock->pending_iocp > 0
1577d54cfbdroberto	    || sock->pending_recv > 0
1578d54cfbdroberto	    || sock->pending_send > 0
1579d54cfbdroberto	    || sock->pending_accept > 0
1580d54cfbdroberto	    || sock->references > 0
1581d54cfbdroberto	    || sock->pending_connect == 1
1582d54cfbdroberto	    || !ISC_LIST_EMPTY(sock->recv_list)
1583d54cfbdroberto	    || !ISC_LIST_EMPTY(sock->send_list)
1584d54cfbdroberto	    || !ISC_LIST_EMPTY(sock->accept_list)
1585d54cfbdroberto	    || sock->fd != INVALID_SOCKET) {
1586d54cfbdroberto		UNLOCK(&sock->lock);
1587d54cfbdroberto		return;
1588d54cfbdroberto	}
1589d54cfbdroberto	UNLOCK(&sock->lock);
1590d54cfbdroberto
1591d54cfbdroberto	free_socket(&sock, lineno);
1592d54cfbdroberto}
1593d54cfbdroberto
1594d54cfbdrobertovoid
1595d54cfbdrobertofree_socket(isc_socket_t **sockp, int lineno) {
1596d54cfbdroberto	isc_socketmgr_t *manager;
1597d54cfbdroberto	isc_socket_t *sock = *sockp;
1598d54cfbdroberto	*sockp = NULL;
1599d54cfbdroberto
1600d54cfbdroberto	manager = sock->manager;
1601d54cfbdroberto
1602d54cfbdroberto	/*
1603d54cfbdroberto	 * Seems we can free the socket after all.
1604d54cfbdroberto	 */
1605d54cfbdroberto	manager = sock->manager;
1606d54cfbdroberto	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
1607d54cfbdroberto		   ISC_MSG_DESTROYING, "freeing socket line %d fd %d lock %p semaphore %p",
1608d54cfbdroberto		   lineno, sock->fd, &sock->lock, sock->lock.LockSemaphore);
1609d54cfbdroberto
1610d54cfbdroberto	sock->magic = 0;
1611d54cfbdroberto	DESTROYLOCK(&sock->lock);
1612d54cfbdroberto
1613d54cfbdroberto	if (sock->recvbuf.base != NULL)
1614d54cfbdroberto		isc_mem_put(manager->mctx, sock->recvbuf.base, sock->recvbuf.len);
1615d54cfbdroberto
1616d54cfbdroberto	LOCK(&manager->lock);
1617d54cfbdroberto	if (ISC_LINK_LINKED(sock, link))
1618d54cfbdroberto		ISC_LIST_UNLINK(manager->socklist, sock, link);
1619d54cfbdroberto	isc_mem_put(manager->mctx, sock, sizeof(*sock));
1620d54cfbdroberto
1621d54cfbdroberto	if (ISC_LIST_EMPTY(manager->socklist))
1622d54cfbdroberto		SIGNAL(&manager->shutdown_ok);
1623d54cfbdroberto	UNLOCK(&manager->lock);
1624d54cfbdroberto}
1625d54cfbdroberto
1626d54cfbdroberto/*
1627d54cfbdroberto * Create a new 'type' socket managed by 'manager'.  Events
1628d54cfbdroberto * will be posted to 'task' and when dispatched 'action' will be
1629d54cfbdroberto * called with 'arg' as the arg value.  The new socket is returned
1630d54cfbdroberto * in 'socketp'.
1631d54cfbdroberto */
1632047f369cystatic isc_result_t
1633047f369cysocket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1634047f369cy	      isc_socket_t **socketp, isc_socket_t *dup_socket)
1635047f369cy{
1636d54cfbdroberto	isc_socket_t *sock = NULL;
1637d54cfbdroberto	isc_result_t result;
1638d54cfbdroberto#if defined(USE_CMSG)
1639d54cfbdroberto	int on = 1;
1640d54cfbdroberto#endif
1641d54cfbdroberto#if defined(SO_RCVBUF)
1642d54cfbdroberto	ISC_SOCKADDR_LEN_T optlen;
1643d54cfbdroberto	int size;
1644d54cfbdroberto#endif
1645d54cfbdroberto	int socket_errno;
1646d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
1647d54cfbdroberto
1648d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
1649d54cfbdroberto	REQUIRE(socketp != NULL && *socketp == NULL);
1650d54cfbdroberto	REQUIRE(type != isc_sockettype_fdwatch);
1651d54cfbdroberto
1652047f369cy	if (dup_socket != NULL)
1653047f369cy		return (ISC_R_NOTIMPLEMENTED);
1654047f369cy
1655d54cfbdroberto	result = allocate_socket(manager, type, &sock);
1656d54cfbdroberto	if (result != ISC_R_SUCCESS)
1657d54cfbdroberto		return (result);
1658d54cfbdroberto
1659d54cfbdroberto	sock->pf = pf;
1660047f369cy#if 0
1661047f369cy	if (dup_socket == NULL) {
1662047f369cy#endif
1663047f369cy		switch (type) {
1664047f369cy		case isc_sockettype_udp:
1665047f369cy			sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
1666047f369cy			if (sock->fd != INVALID_SOCKET) {
1667047f369cy				result = connection_reset_fix(sock->fd);
1668047f369cy				if (result != ISC_R_SUCCESS) {
1669047f369cy					socket_log(__LINE__, sock,
1670047f369cy						NULL, EVENT, NULL, 0, 0,
1671047f369cy						"closed %d %d %d "
1672047f369cy						"con_reset_fix_failed",
1673047f369cy						sock->pending_recv,
1674047f369cy						sock->pending_send,
1675047f369cy						sock->references);
1676047f369cy					closesocket(sock->fd);
1677047f369cy					_set_state(sock, SOCK_CLOSED);
1678047f369cy					sock->fd = INVALID_SOCKET;
1679047f369cy					free_socket(&sock, __LINE__);
1680047f369cy					return (result);
1681047f369cy				}
1682d54cfbdroberto			}
1683047f369cy			break;
1684047f369cy		case isc_sockettype_tcp:
1685047f369cy			sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
1686047f369cy			break;
1687d54cfbdroberto		}
1688047f369cy#if 0
1689047f369cy	} else {
1690047f369cy		/*
1691047f369cy		 * XXX: dup() is deprecated in windows, use _dup()
1692047f369cy		 * instead.  In future we may want to investigate
1693047f369cy		 * WSADuplicateSocket().
1694047f369cy		 */
1695047f369cy		sock->fd = _dup(dup_socket->fd);
1696047f369cy		sock->dupped = 1;
1697047f369cy		sock->bound = dup_socket->bound;
1698d54cfbdroberto	}
1699047f369cy#endif
1700d54cfbdroberto
1701d54cfbdroberto	if (sock->fd == INVALID_SOCKET) {
1702d54cfbdroberto		socket_errno = WSAGetLastError();
1703d54cfbdroberto		free_socket(&sock, __LINE__);
1704d54cfbdroberto
1705d54cfbdroberto		switch (socket_errno) {
1706d54cfbdroberto		case WSAEMFILE:
1707d54cfbdroberto		case WSAENOBUFS:
1708d54cfbdroberto			return (ISC_R_NORESOURCES);
1709d54cfbdroberto
1710d54cfbdroberto		case WSAEPROTONOSUPPORT:
1711d54cfbdroberto		case WSAEPFNOSUPPORT:
1712d54cfbdroberto		case WSAEAFNOSUPPORT:
1713d54cfbdroberto			return (ISC_R_FAMILYNOSUPPORT);
1714d54cfbdroberto
1715d54cfbdroberto		default:
1716d54cfbdroberto			isc__strerror(socket_errno, strbuf, sizeof(strbuf));
1717d54cfbdroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
1718d54cfbdroberto					 "socket() %s: %s",
1719d54cfbdroberto					 isc_msgcat_get(isc_msgcat,
1720d54cfbdroberto							ISC_MSGSET_GENERAL,
1721d54cfbdroberto							ISC_MSG_FAILED,
1722d54cfbdroberto							"failed"),
1723d54cfbdroberto					 strbuf);
1724d54cfbdroberto			return (ISC_R_UNEXPECTED);
1725d54cfbdroberto		}
1726d54cfbdroberto	}
1727d54cfbdroberto
1728d54cfbdroberto	result = make_nonblock(sock->fd);
1729d54cfbdroberto	if (result != ISC_R_SUCCESS) {
1730d54cfbdroberto		socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1731d54cfbdroberto			"closed %d %d %d make_nonblock_failed",
1732d54cfbdroberto			sock->pending_recv, sock->pending_send,
1733d54cfbdroberto			sock->references);
1734d54cfbdroberto		closesocket(sock->fd);
1735d54cfbdroberto		sock->fd = INVALID_SOCKET;
1736d54cfbdroberto		free_socket(&sock, __LINE__);
1737d54cfbdroberto		return (result);
1738d54cfbdroberto	}
1739d54cfbdroberto
1740d54cfbdroberto
1741d54cfbdroberto#if defined(USE_CMSG) || defined(SO_RCVBUF)
1742d54cfbdroberto	if (type == isc_sockettype_udp) {
1743d54cfbdroberto
1744d54cfbdroberto#if defined(USE_CMSG)
1745d54cfbdroberto#if defined(ISC_PLATFORM_HAVEIPV6)
1746d54cfbdroberto#ifdef IPV6_RECVPKTINFO
1747d54cfbdroberto		/* 2292bis */
1748d54cfbdroberto		if ((pf == AF_INET6)
1749d54cfbdroberto		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
1750047f369cy				   (char *)&on, sizeof(on)) < 0)) {
1751d54cfbdroberto			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1752d54cfbdroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
1753d54cfbdroberto					 "setsockopt(%d, IPV6_RECVPKTINFO) "
1754d54cfbdroberto					 "%s: %s", sock->fd,
1755d54cfbdroberto					 isc_msgcat_get(isc_msgcat,
1756d54cfbdroberto							ISC_MSGSET_GENERAL,
1757d54cfbdroberto							ISC_MSG_FAILED,
1758d54cfbdroberto							"failed"),
1759d54cfbdroberto					 strbuf);
1760d54cfbdroberto		}
1761d54cfbdroberto#else
1762d54cfbdroberto		/* 2292 */
1763d54cfbdroberto		if ((pf == AF_INET6)
1764d54cfbdroberto		    && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
1765047f369cy				   (char *)&on, sizeof(on)) < 0)) {
1766d54cfbdroberto			isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf));
1767d54cfbdroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
1768d54cfbdroberto					 "setsockopt(%d, IPV6_PKTINFO) %s: %s",
1769d54cfbdroberto					 sock->fd,
1770d54cfbdroberto					 isc_msgcat_get(isc_msgcat,
1771d54cfbdroberto							ISC_MSGSET_GENERAL,
1772d54cfbdroberto							ISC_MSG_FAILED,
1773d54cfbdroberto							"failed"),
1774d54cfbdroberto					 strbuf);
1775d54cfbdroberto		}
1776d54cfbdroberto#endif /* IPV6_RECVPKTINFO */
1777d54cfbdroberto#ifdef IPV6_USE_MIN_MTU	/*2292bis, not too common yet*/
1778d54cfbdroberto		/* use minimum MTU */
1779d54cfbdroberto		if (pf == AF_INET6) {
1780d54cfbdroberto			(void)setsockopt(sock->fd, IPPROTO_IPV6,
1781d54cfbdroberto					 IPV6_USE_MIN_MTU,
1782047f369cy					 (char *)&on, sizeof(on));
1783d54cfbdroberto		}
1784d54cfbdroberto#endif
1785d54cfbdroberto#endif /* ISC_PLATFORM_HAVEIPV6 */
1786d54cfbdroberto#endif /* defined(USE_CMSG) */
1787d54cfbdroberto
1788d54cfbdroberto#if defined(SO_RCVBUF)
1789d54cfbdroberto	       optlen = sizeof(size);
1790d54cfbdroberto	       if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1791047f369cy			      (char *)&size, &optlen) >= 0 &&
1792d54cfbdroberto		    size < RCVBUFSIZE) {
1793d54cfbdroberto		       size = RCVBUFSIZE;
1794d54cfbdroberto		       (void)setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
1795047f369cy					(char *)&size, sizeof(size));
1796d54cfbdroberto	       }
1797d54cfbdroberto#endif
1798d54cfbdroberto
1799d54cfbdroberto	}
1800d54cfbdroberto#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
1801d54cfbdroberto
1802d54cfbdroberto	_set_state(sock, SOCK_OPEN);
1803d54cfbdroberto	sock->references = 1;
1804d54cfbdroberto	*socketp = sock;
1805d54cfbdroberto
1806d54cfbdroberto	iocompletionport_update(sock);
1807d54cfbdroberto
1808d54cfbdroberto	/*
1809d54cfbdroberto	 * Note we don't have to lock the socket like we normally would because
1810d54cfbdroberto	 * there are no external references to it yet.
1811d54cfbdroberto	 */
1812d54cfbdroberto	LOCK(&manager->lock);
1813d54cfbdroberto	ISC_LIST_APPEND(manager->socklist, sock, link);
1814d54cfbdroberto	InterlockedIncrement(&manager->totalSockets);
1815d54cfbdroberto	UNLOCK(&manager->lock);
1816d54cfbdroberto
1817047f369cy	socket_log(__LINE__, sock, NULL, CREATION, isc_msgcat,
1818047f369cy		   ISC_MSGSET_SOCKET, ISC_MSG_CREATED,
1819047f369cy		   "created %u type %u", sock->fd, type);
1820d54cfbdroberto
1821d54cfbdroberto	return (ISC_R_SUCCESS);
1822d54cfbdroberto}
1823d54cfbdroberto
1824d54cfbdrobertoisc_result_t
1825047f369cyisc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
1826047f369cy		   isc_socket_t **socketp)
1827047f369cy{
1828047f369cy	return (socket_create(manager, pf, type, socketp, NULL));
1829047f369cy}
1830047f369cy
1831047f369cyisc_result_t
1832047f369cyisc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp) {
1833047f369cy	REQUIRE(VALID_SOCKET(sock));
1834047f369cy	REQUIRE(socketp != NULL && *socketp == NULL);
1835047f369cy
1836047f369cy#if 1
1837047f369cy	return (ISC_R_NOTIMPLEMENTED);
1838047f369cy#else
1839047f369cy	return (socket_create(sock->manager, sock->pf, sock->type,
1840047f369cy			      socketp, sock));
1841047f369cy#endif
1842047f369cy}
1843047f369cy
1844047f369cyisc_result_t
1845d54cfbdrobertoisc_socket_open(isc_socket_t *sock) {
1846d54cfbdroberto	REQUIRE(VALID_SOCKET(sock));
1847d54cfbdroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
1848d54cfbdroberto
1849d54cfbdroberto	return (ISC_R_NOTIMPLEMENTED);
1850d54cfbdroberto}
1851d54cfbdroberto
1852d54cfbdroberto/*
1853d54cfbdroberto * Attach to a socket.  Caller must explicitly detach when it is done.
1854d54cfbdroberto */
1855d54cfbdrobertovoid
1856047f369cyisc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp) {
1857d54cfbdroberto	REQUIRE(VALID_SOCKET(sock));
1858d54cfbdroberto	REQUIRE(socketp != NULL && *socketp == NULL);
1859d54cfbdroberto
1860d54cfbdroberto	LOCK(&sock->lock);
1861d54cfbdroberto	CONSISTENT(sock);
1862d54cfbdroberto	sock->references++;
1863d54cfbdroberto	UNLOCK(&sock->lock);
1864d54cfbdroberto
1865d54cfbdroberto	*socketp = sock;
1866d54cfbdroberto}
1867d54cfbdroberto
1868d54cfbdroberto/*
1869d54cfbdroberto * Dereference a socket.  If this is the last reference to it, clean things
1870d54cfbdroberto * up by destroying the socket.
1871d54cfbdroberto */
1872d54cfbdrobertovoid
1873047f369cyisc__socket_detach(isc_socket_t **socketp) {
1874d54cfbdroberto	isc_socket_t *sock;
1875d54cfbdroberto	isc_boolean_t kill_socket = ISC_FALSE;
1876d54cfbdroberto
1877d54cfbdroberto	REQUIRE(socketp != NULL);
1878d54cfbdroberto	sock = *socketp;
1879d54cfbdroberto	REQUIRE(VALID_SOCKET(sock));
1880d54cfbdroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
1881d54cfbdroberto
1882d54cfbdroberto	LOCK(&sock->lock);
1883d54cfbdroberto	CONSISTENT(sock);
1884d54cfbdroberto	REQUIRE(sock->references > 0);
1885d54cfbdroberto	sock->references--;
1886d54cfbdroberto
1887d54cfbdroberto	socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
1888d54cfbdroberto		"detach_socket %d %d %d",
1889d54cfbdroberto		sock->pending_recv, sock->pending_send,
1890d54cfbdroberto		sock->references);
1891d54cfbdroberto
1892d54cfbdroberto	if (sock->references == 0 && sock->fd != INVALID_SOCKET) {
1893d54cfbdroberto		closesocket(sock->fd);
1894d54cfbdroberto		sock->fd = INVALID_SOCKET;
1895d54cfbdroberto		_set_state(sock, SOCK_CLOSED);
1896d54cfbdroberto	}
1897d54cfbdroberto
1898d54cfbdroberto	maybe_free_socket(&sock, __LINE__);
1899d54cfbdroberto
1900d54cfbdroberto	*socketp = NULL;
1901d54cfbdroberto}
1902d54cfbdroberto
1903d54cfbdrobertoisc_result_t
1904d54cfbdrobertoisc_socket_close(isc_socket_t *sock) {
1905d54cfbdroberto	REQUIRE(VALID_SOCKET(sock));
1906d54cfbdroberto	REQUIRE(sock->type != isc_sockettype_fdwatch);
1907d54cfbdroberto
1908d54cfbdroberto	return (ISC_R_NOTIMPLEMENTED);
1909d54cfbdroberto}
1910d54cfbdroberto
1911d54cfbdroberto/*
1912d54cfbdroberto * Dequeue an item off the given socket's read queue, set the result code
1913d54cfbdroberto * in the done event to the one provided, and send it to the task it was
1914d54cfbdroberto * destined for.
1915d54cfbdroberto *
1916d54cfbdroberto * If the event to be sent is on a list, remove it before sending.  If
1917d54cfbdroberto * asked to, send and detach from the task as well.
1918d54cfbdroberto *
1919d54cfbdroberto * Caller must have the socket locked if the event is attached to the socket.
1920d54cfbdroberto */
1921d54cfbdrobertostatic void
1922d54cfbdrobertosend_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1923d54cfbdroberto	isc_task_t *task;
1924d54cfbdroberto
1925d54cfbdroberto	task = (*dev)->ev_sender;
1926d54cfbdroberto	(*dev)->ev_sender = sock;
1927d54cfbdroberto
1928d54cfbdroberto	if (ISC_LINK_LINKED(*dev, ev_link))
1929d54cfbdroberto		ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
1930d54cfbdroberto
1931d54cfbdroberto	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1932d54cfbdroberto	    == ISC_SOCKEVENTATTR_ATTACHED)
1933d54cfbdroberto		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1934d54cfbdroberto	else
1935d54cfbdroberto		isc_task_send(task, (isc_event_t **)dev);
1936d54cfbdroberto
1937d54cfbdroberto	CONSISTENT(sock);
1938d54cfbdroberto}
1939d54cfbdroberto
1940d54cfbdroberto/*
1941d54cfbdroberto * See comments for send_recvdone_event() above.
1942d54cfbdroberto */
1943d54cfbdrobertostatic void
1944d54cfbdrobertosend_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) {
1945d54cfbdroberto	isc_task_t *task;
1946d54cfbdroberto
1947d54cfbdroberto	INSIST(dev != NULL && *dev != NULL);
1948d54cfbdroberto
1949d54cfbdroberto	task = (*dev)->ev_sender;
1950d54cfbdroberto	(*dev)->ev_sender = sock;
1951d54cfbdroberto
1952d54cfbdroberto	if (ISC_LINK_LINKED(*dev, ev_link))
1953d54cfbdroberto		ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
1954d54cfbdroberto
1955d54cfbdroberto	if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
1956d54cfbdroberto	    == ISC_SOCKEVENTATTR_ATTACHED)
1957d54cfbdroberto		isc_task_sendanddetach(&task, (isc_event_t **)dev);
1958d54cfbdroberto	else
1959d54cfbdroberto		isc_task_send(task, (isc_event_t **)dev);
1960d54cfbdroberto
1961d54cfbdroberto	CONSISTENT(sock);
1962d54cfbdroberto}
1963d54cfbdroberto
1964d54cfbdroberto/*
1965d54cfbdroberto * See comments for send_recvdone_event() above.
1966d54cfbdroberto */
1967d54cfbdrobertostatic void
1968d54cfbdrobertosend_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev) {
1969d54cfbdroberto	isc_task_t *task;
1970d54cfbdroberto
1971d54cfbdroberto	INSIST(adev != NULL && *adev != NULL);
1972d54cfbdroberto
1973d54cfbdroberto	task = (*adev)->ev_sender;
1974d54cfbdroberto	(*adev)->ev_sender = sock;
1975d54cfbdroberto
1976d54cfbdroberto	if (ISC_LINK_LINKED(*adev, ev_link))
1977d54cfbdroberto		ISC_LIST_DEQUEUE(sock->accept_list, *adev, ev_link);
1978d54cfbdroberto
1979d54cfbdroberto	isc_task_sendanddetach(&task, (isc_event_t **)adev);
1980d54cfbdroberto
1981d54cfbdroberto	CONSISTENT(sock);
1982d54cfbdroberto}
1983d54cfbdroberto
1984d54cfbdroberto/*
1985d54cfbdroberto * See comments for send_recvdone_event() above.
1986d54cfbdroberto */
1987d54cfbdrobertostatic void
1988d54cfbdrobertosend_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev) {
1989d54cfbdroberto	isc_task_t *task;
1990d54cfbdroberto
1991d54cfbdroberto	INSIST(cdev != NULL && *cdev != NULL);
1992d54cfbdroberto
1993d54cfbdroberto	task = (*cdev)->ev_sender;
1994d54cfbdroberto	(*cdev)->ev_sender = sock;
1995d54cfbdroberto
1996d54cfbdroberto	sock->connect_ev = NULL;
1997d54cfbdroberto
1998d54cfbdroberto	isc_task_sendanddetach(&task, (isc_event_t **)cdev);
1999d54cfbdroberto
2000d54cfbdroberto	CONSISTENT(sock);
2001d54cfbdroberto}
2002d54cfbdroberto
2003d54cfbdroberto/*
2004d54cfbdroberto * On entry to this function, the event delivered is the internal
2005d54cfbdroberto * readable event, and the first item on the accept_list should be
2006d54cfbdroberto * the done event we want to send.  If the list is empty, this is a no-op,
2007d54cfbdroberto * so just close the new connection, unlock, and return.
2008d54cfbdroberto *
2009d54cfbdroberto * Note the socket is locked before entering here
2010d54cfbdroberto */
2011d54cfbdrobertostatic void
2012d54cfbdrobertointernal_accept(isc_socket_t *sock, IoCompletionInfo *lpo, int accept_errno) {
2013d54cfbdroberto	isc_socket_newconnev_t *adev;
2014d54cfbdroberto	isc_result_t result = ISC_R_SUCCESS;
2015d54cfbdroberto	isc_socket_t *nsock;
2016d54cfbdroberto	struct sockaddr *localaddr;
2017d54cfbdroberto	int localaddr_len = sizeof(*localaddr);
2018d54cfbdroberto	struct sockaddr *remoteaddr;
2019d54cfbdroberto	int remoteaddr_len = sizeof(*remoteaddr);
2020d54cfbdroberto
2021d54cfbdroberto	INSIST(VALID_SOCKET(sock));
2022d54cfbdroberto	LOCK(&sock->lock);
2023d54cfbdroberto	CONSISTENT(sock);
2024d54cfbdroberto
2025d54cfbdroberto	socket_log(__LINE__, sock, NULL, TRACE,
2026d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2027d54cfbdroberto		   "internal_accept called");
2028d54cfbdroberto
2029d54cfbdroberto	INSIST(sock->listener);
2030d54cfbdroberto
2031d54cfbdroberto	INSIST(sock->pending_iocp > 0);
2032d54cfbdroberto	sock->pending_iocp--;
2033d54cfbdroberto	INSIST(sock->pending_accept > 0);
2034d54cfbdroberto	sock->pending_accept--;
2035d54cfbdroberto
2036d54cfbdroberto	adev = lpo->adev;
2037d54cfbdroberto
2038d54cfbdroberto	/*
2039d54cfbdroberto	 * If the event is no longer in the list we can just return.
2040d54cfbdroberto	 */
2041d54cfbdroberto	if (!acceptdone_is_active(sock, adev))
2042d54cfbdroberto		goto done;
2043d54cfbdroberto
2044d54cfbdroberto	nsock = adev->newsocket;
2045d54cfbdroberto
2046d54cfbdroberto	/*
2047d54cfbdroberto	 * Pull off the done event.
2048d54cfbdroberto	 */
2049d54cfbdroberto	ISC_LIST_UNLINK(sock->accept_list, adev, ev_link);
2050d54cfbdroberto
2051d54cfbdroberto	/*
2052d54cfbdroberto	 * Extract the addresses from the socket, copy them into the structure,
2053d54cfbdroberto	 * and return the new socket.
2054d54cfbdroberto	 */
2055d54cfbdroberto	ISCGetAcceptExSockaddrs(lpo->acceptbuffer, 0,
2056d54cfbdroberto		sizeof(SOCKADDR_STORAGE) + 16, sizeof(SOCKADDR_STORAGE) + 16,
2057d54cfbdroberto		(LPSOCKADDR *)&localaddr, &localaddr_len,
2058d54cfbdroberto		(LPSOCKADDR *)&remoteaddr, &remoteaddr_len);
2059d54cfbdroberto	memcpy(&adev->address.type, remoteaddr, remoteaddr_len);
2060d54cfbdroberto	adev->address.length = remoteaddr_len;
2061d54cfbdroberto	nsock->address = adev->address;
2062d54cfbdroberto	nsock->pf = adev->address.type.sa.sa_family;
2063d54cfbdroberto
2064d54cfbdroberto	socket_log(__LINE__, nsock, &nsock->address, TRACE,
2065d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2066d54cfbdroberto		   "internal_accept parent %p", sock);
2067d54cfbdroberto
2068d54cfbdroberto	result = make_nonblock(adev->newsocket->fd);
2069d54cfbdroberto	INSIST(result == ISC_R_SUCCESS);
2070d54cfbdroberto
2071d54cfbdroberto	INSIST(setsockopt(nsock->fd, SOL_SOCKET, SO_UPDATE_ACCEPT_CONTEXT,
2072047f369cy			  (char *)&sock->fd, sizeof(sock->fd)) == 0);
2073d54cfbdroberto
2074d54cfbdroberto	/*
2075d54cfbdroberto	 * Hook it up into the manager.
2076d54cfbdroberto	 */
2077d54cfbdroberto	nsock->bound = 1;
2078d54cfbdroberto	nsock->connected = 1;
2079d54cfbdroberto	_set_state(nsock, SOCK_OPEN);
2080d54cfbdroberto
2081d54cfbdroberto	LOCK(&nsock->manager->lock);
2082d54cfbdroberto	ISC_LIST_APPEND(nsock->manager->socklist, nsock, link);
2083d54cfbdroberto	InterlockedIncrement(&nsock->manager->totalSockets);
2084d54cfbdroberto	UNLOCK(&nsock->manager->lock);
2085d54cfbdroberto
2086d54cfbdroberto	socket_log(__LINE__, sock, &nsock->address, CREATION,
2087d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2088d54cfbdroberto		   "accepted_connection new_socket %p fd %d",
2089d54cfbdroberto		   nsock, nsock->fd);
2090d54cfbdroberto
2091d54cfbdroberto	adev->result = result;
2092d54cfbdroberto	send_acceptdone_event(sock, &adev);
2093d54cfbdroberto
2094d54cfbdrobertodone:
2095d54cfbdroberto	CONSISTENT(sock);
2096d54cfbdroberto	UNLOCK(&sock->lock);
2097d54cfbdroberto
2098d54cfbdroberto	HeapFree(hHeapHandle, 0, lpo->acceptbuffer);
2099d54cfbdroberto	lpo->acceptbuffer = NULL;
2100d54cfbdroberto}
2101d54cfbdroberto
2102d54cfbdroberto/*
2103d54cfbdroberto * Called when a socket with a pending connect() finishes.
2104d54cfbdroberto * Note that the socket is locked before entering.
2105d54cfbdroberto */
2106d54cfbdrobertostatic void
2107d54cfbdrobertointernal_connect(isc_socket_t *sock, IoCompletionInfo *lpo, int connect_errno) {
2108d54cfbdroberto	isc_socket_connev_t *cdev;
2109d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
2110d54cfbdroberto
2111d54cfbdroberto	INSIST(VALID_SOCKET(sock));
2112d54cfbdroberto
2113d54cfbdroberto	LOCK(&sock->lock);
2114d54cfbdroberto
2115d54cfbdroberto	INSIST(sock->pending_iocp > 0);
2116d54cfbdroberto	sock->pending_iocp--;
2117d54cfbdroberto	INSIST(sock->pending_connect == 1);
2118d54cfbdroberto	sock->pending_connect = 0;
2119d54cfbdroberto
2120d54cfbdroberto	/*
2121d54cfbdroberto	 * Has this event been canceled?
2122d54cfbdroberto	 */
2123d54cfbdroberto	cdev = lpo->cdev;
2124d54cfbdroberto	if (!connectdone_is_active(sock, cdev)) {
2125d54cfbdroberto		sock->pending_connect = 0;
2126d54cfbdroberto		if (sock->fd != INVALID_SOCKET) {
2127d54cfbdroberto			closesocket(sock->fd);
2128d54cfbdroberto			sock->fd = INVALID_SOCKET;
2129d54cfbdroberto			_set_state(sock, SOCK_CLOSED);
2130d54cfbdroberto		}
2131d54cfbdroberto		CONSISTENT(sock);
2132d54cfbdroberto		UNLOCK(&sock->lock);
2133d54cfbdroberto		return;
2134d54cfbdroberto	}
2135d54cfbdroberto
2136d54cfbdroberto	/*
2137d54cfbdroberto	 * Check possible Windows network event error status here.
2138d54cfbdroberto	 */
2139d54cfbdroberto	if (connect_errno != 0) {
2140d54cfbdroberto		/*
2141d54cfbdroberto		 * If the error is SOFT, just try again on this
2142d54cfbdroberto		 * fd and pretend nothing strange happened.
2143d54cfbdroberto		 */
2144d54cfbdroberto		if (SOFT_ERROR(connect_errno) ||
2145d54cfbdroberto		    connect_errno == WSAEINPROGRESS) {
2146d54cfbdroberto			sock->pending_connect = 1;
2147d54cfbdroberto			CONSISTENT(sock);
2148d54cfbdroberto			UNLOCK(&sock->lock);
2149d54cfbdroberto			return;
2150d54cfbdroberto		}
2151d54cfbdroberto
2152d54cfbdroberto		/*
2153d54cfbdroberto		 * Translate other errors into ISC_R_* flavors.
2154d54cfbdroberto		 */
2155d54cfbdroberto		switch (connect_errno) {
2156d54cfbdroberto#define ERROR_MATCH(a, b) case a: cdev->result = b; break;
2157d54cfbdroberto			ERROR_MATCH(WSAEACCES, ISC_R_NOPERM);
2158d54cfbdroberto			ERROR_MATCH(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
2159d54cfbdroberto			ERROR_MATCH(WSAEAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
2160d54cfbdroberto			ERROR_MATCH(WSAECONNREFUSED, ISC_R_CONNREFUSED);
2161d54cfbdroberto			ERROR_MATCH(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
2162d54cfbdroberto			ERROR_MATCH(WSAEHOSTDOWN, ISC_R_HOSTDOWN);
2163d54cfbdroberto			ERROR_MATCH(WSAENETUNREACH, ISC_R_NETUNREACH);
2164d54cfbdroberto			ERROR_MATCH(WSAENETDOWN, ISC_R_NETDOWN);
2165d54cfbdroberto			ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES);
2166d54cfbdroberto			ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET);
2167d54cfbdroberto			ERROR_MATCH(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
2168d54cfbdroberto			ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT);
2169d54cfbdroberto#undef ERROR_MATCH
2170d54cfbdroberto		default:
2171d54cfbdroberto			cdev->result = ISC_R_UNEXPECTED;
2172d54cfbdroberto			isc__strerror(connect_errno, strbuf, sizeof(strbuf));
2173d54cfbdroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2174d54cfbdroberto					 "internal_connect: connect() %s",
2175d54cfbdroberto					 strbuf);
2176d54cfbdroberto		}
2177d54cfbdroberto	} else {
2178047f369cy		INSIST(setsockopt(sock->fd, SOL_SOCKET,
2179047f369cy				  SO_UPDATE_CONNECT_CONTEXT, NULL, 0) == 0);
2180d54cfbdroberto		cdev->result = ISC_R_SUCCESS;
2181d54cfbdroberto		sock->connected = 1;
2182d54cfbdroberto		socket_log(__LINE__, sock, &sock->address, IOEVENT,
2183d54cfbdroberto			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
2184d54cfbdroberto			   "internal_connect: success");
2185d54cfbdroberto	}
2186d54cfbdroberto
2187d54cfbdroberto	send_connectdone_event(sock, &cdev);
2188d54cfbdroberto
2189d54cfbdroberto	UNLOCK(&sock->lock);
2190d54cfbdroberto}
2191d54cfbdroberto
2192d54cfbdroberto/*
2193d54cfbdroberto * Loop through the socket, returning ISC_R_EOF for each done event pending.
2194d54cfbdroberto */
2195d54cfbdrobertostatic void
2196d54cfbdrobertosend_recvdone_abort(isc_socket_t *sock, isc_result_t result) {
2197d54cfbdroberto	isc_socketevent_t *dev;
2198d54cfbdroberto
2199d54cfbdroberto	while (!ISC_LIST_EMPTY(sock->recv_list)) {
2200d54cfbdroberto		dev = ISC_LIST_HEAD(sock->recv_list);
2201d54cfbdroberto		dev->result = result;
2202d54cfbdroberto		send_recvdone_event(sock, &dev);
2203d54cfbdroberto	}
2204d54cfbdroberto}
2205d54cfbdroberto
2206d54cfbdroberto/*
2207d54cfbdroberto * Take the data we received in our private buffer, and if any recv() calls on
2208d54cfbdroberto * our list are satisfied, send the corresponding done event.
2209d54cfbdroberto *
2210d54cfbdroberto * If we need more data (there are still items on the recv_list after we consume all
2211d54cfbdroberto * our data) then arrange for another system recv() call to fill our buffers.
2212d54cfbdroberto */
2213d54cfbdrobertostatic void
2214d54cfbdrobertointernal_recv(isc_socket_t *sock, int nbytes)
2215d54cfbdroberto{
2216d54cfbdroberto	INSIST(VALID_SOCKET(sock));
2217d54cfbdroberto
2218d54cfbdroberto	LOCK(&sock->lock);
2219d54cfbdroberto	CONSISTENT(sock);
2220d54cfbdroberto
2221d54cfbdroberto	socket_log(__LINE__, sock, NULL, IOEVENT,
2222d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
2223d54cfbdroberto		   "internal_recv: %d bytes received", nbytes);
2224d54cfbdroberto
2225d54cfbdroberto	/*
2226d54cfbdroberto	 * If we got here, the I/O operation succeeded.  However, we might still have removed this
2227d54cfbdroberto	 * event from our notification list (or never placed it on it due to immediate completion.)
2228d54cfbdroberto	 * Handle the reference counting here, and handle the cancellation event just after.
2229d54cfbdroberto	 */
2230d54cfbdroberto	INSIST(sock->pending_iocp > 0);
2231d54cfbdroberto	sock->pending_iocp--;
2232d54cfbdroberto	INSIST(sock->pending_recv > 0);
2233d54cfbdroberto	sock->pending_recv--;
2234d54cfbdroberto
2235d54cfbdroberto	/*
2236d54cfbdroberto	 * The only way we could have gotten here is that our I/O has successfully completed.
2237d54cfbdroberto	 * Update our pointers, and move on.  The only odd case here is that we might not
2238d54cfbdroberto	 * have received enough data on a TCP stream to satisfy the minimum requirements.  If
2239d54cfbdroberto	 * this is the case, we will re-issue the recv() call for what we need.
2240d54cfbdroberto	 *
2241d54cfbdroberto	 * We do check for a recv() of 0 bytes on a TCP stream.  This means the remote end
2242d54cfbdroberto	 * has closed.
2243d54cfbdroberto	 */
2244d54cfbdroberto	if (nbytes == 0 && sock->type == isc_sockettype_tcp) {
2245d54cfbdroberto		send_recvdone_abort(sock, ISC_R_EOF);
2246d54cfbdroberto		maybe_free_socket(&sock, __LINE__);
2247d54cfbdroberto		return;
2248d54cfbdroberto	}
2249d54cfbdroberto	sock->recvbuf.remaining = nbytes;
2250d54cfbdroberto	sock->recvbuf.consume_position = sock->recvbuf.base;
2251d54cfbdroberto	completeio_recv(sock);
2252d54cfbdroberto
2253d54cfbdroberto	/*
2254d54cfbdroberto	 * If there are more receivers waiting for data, queue another receive
2255d54cfbdroberto	 * here.
2256d54cfbdroberto	 */
2257d54cfbdroberto	queue_receive_request(sock);
2258d54cfbdroberto
2259d54cfbdroberto	/*
2260d54cfbdroberto	 * Unlock and/or destroy if we are the last thing this socket has left to do.
2261d54cfbdroberto	 */
2262d54cfbdroberto	maybe_free_socket(&sock, __LINE__);
2263d54cfbdroberto}
2264d54cfbdroberto
2265d54cfbdrobertostatic void
2266d54cfbdrobertointernal_send(isc_socket_t *sock, isc_socketevent_t *dev,
2267d54cfbdroberto	      struct msghdr *messagehdr, int nbytes, int send_errno, IoCompletionInfo *lpo)
2268d54cfbdroberto{
2269d54cfbdroberto	buflist_t *buffer;
2270d54cfbdroberto
2271d54cfbdroberto	/*
2272d54cfbdroberto	 * Find out what socket this is and lock it.
2273d54cfbdroberto	 */
2274d54cfbdroberto	INSIST(VALID_SOCKET(sock));
2275d54cfbdroberto
2276d54cfbdroberto	LOCK(&sock->lock);
2277d54cfbdroberto	CONSISTENT(sock);
2278d54cfbdroberto
2279d54cfbdroberto	socket_log(__LINE__, sock, NULL, IOEVENT,
2280d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
2281d54cfbdroberto		   "internal_send: task got socket event %p", dev);
2282d54cfbdroberto
2283d54cfbdroberto	buffer = ISC_LIST_HEAD(lpo->bufferlist);
2284d54cfbdroberto	while (buffer != NULL) {
2285d54cfbdroberto		ISC_LIST_DEQUEUE(lpo->bufferlist, buffer, link);
2286d54cfbdroberto
2287d54cfbdroberto		socket_log(__LINE__, sock, NULL, TRACE,
2288d54cfbdroberto		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
2289d54cfbdroberto		   "free_buffer %p %p", buffer, buffer->buf);
2290d54cfbdroberto
2291d54cfbdroberto		HeapFree(hHeapHandle, 0, buffer->buf);
2292d54cfbdroberto		HeapFree(hHeapHandle, 0, buffer);
2293d54cfbdroberto		buffer = ISC_LIST_HEAD(lpo->bufferlist);
2294d54cfbdroberto	}
2295d54cfbdroberto
2296d54cfbdroberto	INSIST(sock->pending_iocp > 0);
2297d54cfbdroberto	sock->pending_iocp--;
2298d54cfbdroberto	INSIST(sock->pending_send > 0);
2299d54cfbdroberto	sock->pending_send--;
2300d54cfbdroberto
2301d54cfbdroberto	/* If the event is no longer in the list we can just return */
2302d54cfbdroberto	if (!senddone_is_active(sock, dev))
2303d54cfbdroberto		goto done;
2304d54cfbdroberto
2305d54cfbdroberto	/*
2306d54cfbdroberto	 * Set the error code and send things on its way.
2307d54cfbdroberto	 */
2308d54cfbdroberto	switch (completeio_send(sock, dev, messagehdr, nbytes, send_errno)) {
2309d54cfbdroberto	case DOIO_SOFT:
2310d54cfbdroberto		break;
2311d54cfbdroberto	case DOIO_HARD:
2312d54cfbdroberto	case DOIO_SUCCESS:
2313d54cfbdroberto		send_senddone_event(sock, &dev);
2314d54cfbdroberto		break;
2315d54cfbdroberto	}
2316d54cfbdroberto
2317d54cfbdroberto done:
2318d54cfbdroberto	maybe_free_socket(&sock, __LINE__);
2319d54cfbdroberto}
2320d54cfbdroberto
/*
 * These return whether the done event passed in is still on the socket's
 * list (or, for connect, is the one we're waiting for).  Using these
 * ensures we will not double-send an event.
 */
2326d54cfbdrobertostatic isc_boolean_t
2327d54cfbdrobertosenddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev)
2328d54cfbdroberto{
2329d54cfbdroberto	isc_socketevent_t *ldev;
2330d54cfbdroberto
2331d54cfbdroberto	ldev = ISC_LIST_HEAD(sock->send_list);
2332d54cfbdroberto	while (ldev != NULL && ldev != dev)
2333d54cfbdroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
2334d54cfbdroberto
2335d54cfbdroberto	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2336d54cfbdroberto}
2337d54cfbdroberto
2338d54cfbdrobertostatic isc_boolean_t
2339d54cfbdrobertoacceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev)
2340d54cfbdroberto{
2341d54cfbdroberto	isc_socket_newconnev_t *ldev;
2342d54cfbdroberto
2343d54cfbdroberto	ldev = ISC_LIST_HEAD(sock->accept_list);
2344d54cfbdroberto	while (ldev != NULL && ldev != dev)
2345d54cfbdroberto		ldev = ISC_LIST_NEXT(ldev, ev_link);
2346d54cfbdroberto
2347d54cfbdroberto	return (ldev == NULL ? ISC_FALSE : ISC_TRUE);
2348d54cfbdroberto}
2349d54cfbdroberto
2350d54cfbdrobertostatic isc_boolean_t
2351d54cfbdrobertoconnectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev)
2352d54cfbdroberto{
2353d54cfbdroberto	return (sock->connect_ev == dev ? ISC_TRUE : ISC_FALSE);
2354d54cfbdroberto}
2355d54cfbdroberto
2356047f369cy//
2357047f369cy// The Windows network stack seems to have two very distinct paths depending
2358047f369cy// on what is installed.  Specifically, if something is looking at network
2359047f369cy// connections (like an anti-virus or anti-malware application, such as
2360047f369cy// McAfee products) Windows may return additional error conditions which
2361047f369cy// were not previously returned.
2362047f369cy//
2363047f369cy// One specific one is when a TCP SYN scan is used.  In this situation,
2364047f369cy// Windows responds with the SYN-ACK, but the scanner never responds with
2365047f369cy// the 3rd packet, the ACK.  Windows consiers this a partially open connection.
2366047f369cy// Most Unix networking stacks, and Windows without McAfee installed, will
2367047f369cy// not return this to the caller.  However, with this product installed,
2368047f369cy// Windows returns this as a failed status on the Accept() call.  Here, we
2369047f369cy// will just re-issue the ISCAcceptEx() call as if nothing had happened.
2370047f369cy//
2371047f369cy// This code should only be called when the listening socket has received
2372047f369cy// such an error.  Additionally, the "parent" socket must be locked.
2373047f369cy// Additionally, the lpo argument is re-used here, and must not be freed
2374047f369cy// by the caller.
2375047f369cy//
2376047f369cystatic isc_result_t
2377047f369cyrestart_accept(isc_socket_t *parent, IoCompletionInfo *lpo)
2378047f369cy{
2379047f369cy	isc_socket_t *nsock = lpo->adev->newsocket;
2380047f369cy	SOCKET new_fd;
2381047f369cy
2382047f369cy	/*
2383047f369cy	 * AcceptEx() requires we pass in a socket.  Note that we carefully
2384047f369cy	 * do not close the previous socket in case of an error message returned by
2385047f369cy	 * our new socket() call.  If we return an error here, our caller will
2386047f369cy	 * clean up.
2387047f369cy	 */
2388047f369cy	new_fd = socket(parent->pf, SOCK_STREAM, IPPROTO_TCP);
2389047f369cy	if (nsock->fd == INVALID_SOCKET) {
2390047f369cy		return (ISC_R_FAILURE); // parent will ask windows for error message
2391047f369cy	}
2392047f369cy	closesocket(nsock->fd);
2393047f369cy	nsock->fd = new_fd;
2394047f369cy
2395047f369cy	memset(&lpo->overlapped, 0, sizeof(lpo->overlapped));
2396047f369cy
2397047f369cy	ISCAcceptEx(parent->fd,
2398047f369cy		    nsock->fd,				/* Accepted Socket */
2399047f369cy		    lpo->acceptbuffer,			/* Buffer for initial Recv */
2400047f369cy		    0,					/* Length of Buffer */
2401047f369cy		    sizeof(SOCKADDR_STORAGE) + 16,	/* Local address length + 16 */
2402047f369cy		    sizeof(SOCKADDR_STORAGE) + 16,	/* Remote address lengh + 16 */
2403047f369cy		    (LPDWORD)&lpo->received_bytes,	/* Bytes Recved */
2404047f369cy		    (LPOVERLAPPED)lpo			/* Overlapped structure */
2405047f369cy		    );
2406047f369cy
2407047f369cy	InterlockedDecrement(&nsock->manager->iocp_total);
2408047f369cy	iocompletionport_update(nsock);
2409047f369cy
2410047f369cy	return (ISC_R_SUCCESS);
2411047f369cy}
2412047f369cy
2413d54cfbdroberto/*
2414d54cfbdroberto * This is the I/O Completion Port Worker Function. It loops forever
2415d54cfbdroberto * waiting for I/O to complete and then forwards them for further
2416d54cfbdroberto * processing. There are a number of these in separate threads.
2417d54cfbdroberto */
2418d54cfbdrobertostatic isc_threadresult_t WINAPI
2419d54cfbdrobertoSocketIoThread(LPVOID ThreadContext) {
2420d54cfbdroberto	isc_socketmgr_t *manager = ThreadContext;
2421d54cfbdroberto	BOOL bSuccess = FALSE;
2422d54cfbdroberto	DWORD nbytes;
2423d54cfbdroberto	IoCompletionInfo *lpo = NULL;
2424d54cfbdroberto	isc_socket_t *sock = NULL;
2425d54cfbdroberto	int request;
2426d54cfbdroberto	struct msghdr *messagehdr = NULL;
2427d54cfbdroberto	int errval;
2428d54cfbdroberto	char strbuf[ISC_STRERRORSIZE];
2429d54cfbdroberto	int errstatus;
2430d54cfbdroberto
2431d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
2432d54cfbdroberto
2433d54cfbdroberto	/*
2434d54cfbdroberto	 * Set the thread priority high enough so I/O will
2435d54cfbdroberto	 * preempt normal recv packet processing, but not
2436d54cfbdroberto	 * higher than the timer sync thread.
2437d54cfbdroberto	 */
2438d54cfbdroberto	if (!SetThreadPriority(GetCurrentThread(),
2439d54cfbdroberto			       THREAD_PRIORITY_ABOVE_NORMAL)) {
2440d54cfbdroberto		errval = GetLastError();
2441d54cfbdroberto		isc__strerror(errval, strbuf, sizeof(strbuf));
2442d54cfbdroberto		FATAL_ERROR(__FILE__, __LINE__,
2443d54cfbdroberto				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2444d54cfbdroberto				ISC_MSG_FAILED,
2445d54cfbdroberto				"Can't set thread priority: %s"),
2446d54cfbdroberto				strbuf);
2447d54cfbdroberto	}
2448d54cfbdroberto
2449d54cfbdroberto	/*
2450d54cfbdroberto	 * Loop forever waiting on I/O Completions and then processing them
2451d54cfbdroberto	 */
2452d54cfbdroberto	while (TRUE) {
2453047f369cy		wait_again:
2454d54cfbdroberto		bSuccess = GetQueuedCompletionStatus(manager->hIoCompletionPort,
2455d54cfbdroberto						     &nbytes, (LPDWORD)&sock,
2456d54cfbdroberto						     (LPWSAOVERLAPPED *)&lpo,
2457d54cfbdroberto						     INFINITE);
2458d54cfbdroberto		if (lpo == NULL) /* Received request to exit */
2459d54cfbdroberto			break;
2460d54cfbdroberto
2461d54cfbdroberto		REQUIRE(VALID_SOCKET(sock));
2462d54cfbdroberto
2463d54cfbdroberto		request = lpo->request_type;
2464d54cfbdroberto
2465d54cfbdroberto		errstatus = 0;
2466d54cfbdroberto		if (!bSuccess) {
2467d54cfbdroberto			isc_result_t isc_result;
2468d54cfbdroberto
2469d54cfbdroberto			/*
2470d54cfbdroberto			 * Did the I/O operation complete?
2471d54cfbdroberto			 */
2472047f369cy			errstatus = GetLastError();
2473d54cfbdroberto			isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2474d54cfbdroberto
2475d54cfbdroberto			LOCK(&sock->lock);
2476d54cfbdroberto			CONSISTENT(sock);
2477d54cfbdroberto			switch (request) {
2478d54cfbdroberto			case SOCKET_RECV:
2479d54cfbdroberto				INSIST(sock->pending_iocp > 0);
2480d54cfbdroberto				sock->pending_iocp--;
2481d54cfbdroberto				INSIST(sock->pending_recv > 0);
2482d54cfbdroberto				sock->pending_recv--;
2483047f369cy				if (!sock->connected &&
2484047f369cy				    ((errstatus == ERROR_HOST_UNREACHABLE) ||
2485047f369cy				     (errstatus == WSAENETRESET) ||
2486047f369cy				     (errstatus == WSAECONNRESET))) {
2487047f369cy					/* ignore soft errors */
2488047f369cy					queue_receive_request(sock);
2489047f369cy					break;
2490047f369cy				}
2491d54cfbdroberto				send_recvdone_abort(sock, isc_result);
2492d54cfbdroberto				if (isc_result == ISC_R_UNEXPECTED) {
2493d54cfbdroberto					UNEXPECTED_ERROR(__FILE__, __LINE__,
2494d54cfbdroberto						"SOCKET_RECV: Windows error code: %d, returning ISC error %d",
2495d54cfbdroberto						errstatus, isc_result);
2496d54cfbdroberto				}
2497d54cfbdroberto				break;
2498d54cfbdroberto
2499d54cfbdroberto			case SOCKET_SEND:
2500d54cfbdroberto				INSIST(sock->pending_iocp > 0);
2501d54cfbdroberto				sock->pending_iocp--;
2502d54cfbdroberto				INSIST(sock->pending_send > 0);
2503d54cfbdroberto				sock->pending_send--;
2504d54cfbdroberto				if (senddone_is_active(sock, lpo->dev)) {
2505d54cfbdroberto					lpo->dev->result = isc_result;
2506d54cfbdroberto					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2507d54cfbdroberto						"canceled_send");
2508d54cfbdroberto					send_senddone_event(sock, &lpo->dev);
2509d54cfbdroberto				}
2510d54cfbdroberto				break;
2511d54cfbdroberto
2512d54cfbdroberto			case SOCKET_ACCEPT:
2513d54cfbdroberto				INSIST(sock->pending_iocp > 0);
2514d54cfbdroberto				INSIST(sock->pending_accept > 0);
2515047f369cy
2516047f369cy				socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2517047f369cy					"Accept: errstatus=%d isc_result=%d", errstatus, isc_result);
2518047f369cy
2519047f369cy				if (acceptdone_is_active(sock, lpo->adev)) {
2520047f369cy					if (restart_accept(sock, lpo) == ISC_R_SUCCESS) {
2521047f369cy						UNLOCK(&sock->lock);
2522047f369cy						goto wait_again;
2523047f369cy					} else {
2524047f369cy						errstatus = GetLastError();
2525047f369cy						isc_result = isc__errno2resultx(errstatus, __FILE__, __LINE__);
2526047f369cy						socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2527047f369cy							"restart_accept() failed: errstatus=%d isc_result=%d",
2528047f369cy							errstatus, isc_result);
2529047f369cy					}
2530047f369cy				}
2531047f369cy
2532047f369cy				sock->pending_iocp--;
2533d54cfbdroberto				sock->pending_accept--;
2534d54cfbdroberto				if (acceptdone_is_active(sock, lpo->adev)) {
2535d54cfbdroberto					closesocket(lpo->adev->newsocket->fd);
2536d54cfbdroberto					lpo->adev->newsocket->fd = INVALID_SOCKET;
2537d54cfbdroberto					lpo->adev->newsocket->references--;
2538d54cfbdroberto					free_socket(&lpo->adev->newsocket, __LINE__);
2539d54cfbdroberto					lpo->adev->result = isc_result;
2540d54cfbdroberto					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2541d54cfbdroberto						"canceled_accept");
2542d54cfbdroberto					send_acceptdone_event(sock, &lpo->adev);
2543d54cfbdroberto				}
2544d54cfbdroberto				break;
2545d54cfbdroberto
2546d54cfbdroberto			case SOCKET_CONNECT:
2547d54cfbdroberto				INSIST(sock->pending_iocp > 0);
2548d54cfbdroberto				sock->pending_iocp--;
2549d54cfbdroberto				INSIST(sock->pending_connect == 1);
2550d54cfbdroberto				sock->pending_connect = 0;
2551d54cfbdroberto				if (connectdone_is_active(sock, lpo->cdev)) {
2552d54cfbdroberto					lpo->cdev->result = isc_result;
2553d54cfbdroberto					socket_log(__LINE__, sock, NULL, EVENT, NULL, 0, 0,
2554d54cfbdroberto						"canceled_connect");
2555d54cfbdroberto					send_connectdone_event(sock, &lpo->cdev);
2556d54cfbdroberto				}
2557d54cfbdroberto				break;
2558d54cfbdroberto			}
2559d54cfbdroberto			maybe_free_socket(&sock, __LINE__);
2560d54cfbdroberto
2561d54cfbdroberto			if (lpo != NULL)
2562d54cfbdroberto				HeapFree(hHeapHandle, 0, lpo);
2563d54cfbdroberto			continue;
2564d54cfbdroberto		}
2565d54cfbdroberto
2566d54cfbdroberto		messagehdr = &lpo->messagehdr;
2567d54cfbdroberto
2568d54cfbdroberto		switch (request) {
2569d54cfbdroberto		case SOCKET_RECV:
2570d54cfbdroberto			internal_recv(sock, nbytes);
2571d54cfbdroberto			break;
2572d54cfbdroberto		case SOCKET_SEND:
2573d54cfbdroberto			internal_send(sock, lpo->dev, messagehdr, nbytes, errstatus, lpo);
2574d54cfbdroberto			break;
2575d54cfbdroberto		case SOCKET_ACCEPT:
2576d54cfbdroberto			internal_accept(sock, lpo, errstatus);
2577d54cfbdroberto			break;
2578d54cfbdroberto		case SOCKET_CONNECT:
2579d54cfbdroberto			internal_connect(sock, lpo, errstatus);
2580d54cfbdroberto			break;
2581d54cfbdroberto		}
2582d54cfbdroberto
2583d54cfbdroberto		if (lpo != NULL)
2584d54cfbdroberto			HeapFree(hHeapHandle, 0, lpo);
2585d54cfbdroberto	}
2586d54cfbdroberto
2587d54cfbdroberto	/*
2588d54cfbdroberto	 * Exit Completion Port Thread
2589d54cfbdroberto	 */
2590d54cfbdroberto	manager_log(manager, TRACE,
2591d54cfbdroberto		    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2592d54cfbdroberto				   ISC_MSG_EXITING, "SocketIoThread exiting"));
2593d54cfbdroberto	return ((isc_threadresult_t)0);
2594d54cfbdroberto}
2595d54cfbdroberto
2596d54cfbdroberto/*
2597d54cfbdroberto * Create a new socket manager.
2598d54cfbdroberto */
2599d54cfbdrobertoisc_result_t
2600047f369cyisc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
2601d54cfbdroberto	return (isc_socketmgr_create2(mctx, managerp, 0));
2602d54cfbdroberto}
2603d54cfbdroberto
2604d54cfbdrobertoisc_result_t
2605047f369cyisc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
2606047f369cy		       unsigned int maxsocks)
2607d54cfbdroberto{
2608d54cfbdroberto	isc_socketmgr_t *manager;
2609d54cfbdroberto	isc_result_t result;
2610d54cfbdroberto
2611d54cfbdroberto	REQUIRE(managerp != NULL && *managerp == NULL);
2612d54cfbdroberto
2613d54cfbdroberto	if (maxsocks != 0)
2614d54cfbdroberto		return (ISC_R_NOTIMPLEMENTED);
2615d54cfbdroberto
2616d54cfbdroberto	manager = isc_mem_get(mctx, sizeof(*manager));
2617d54cfbdroberto	if (manager == NULL)
2618d54cfbdroberto		return (ISC_R_NOMEMORY);
2619d54cfbdroberto
2620d54cfbdroberto	InitSockets();
2621d54cfbdroberto
2622d54cfbdroberto	manager->magic = SOCKET_MANAGER_MAGIC;
2623d54cfbdroberto	manager->mctx = NULL;
2624d54cfbdroberto	manager->stats = NULL;
2625d54cfbdroberto	ISC_LIST_INIT(manager->socklist);
2626d54cfbdroberto	result = isc_mutex_init(&manager->lock);
2627d54cfbdroberto	if (result != ISC_R_SUCCESS) {
2628d54cfbdroberto		isc_mem_put(mctx, manager, sizeof(*manager));
2629d54cfbdroberto		return (result);
2630d54cfbdroberto	}
2631d54cfbdroberto	if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
2632d54cfbdroberto		DESTROYLOCK(&manager->lock);
2633d54cfbdroberto		isc_mem_put(mctx, manager, sizeof(*manager));
2634d54cfbdroberto		UNEXPECTED_ERROR(__FILE__, __LINE__,
2635d54cfbdroberto				 "isc_condition_init() %s",
2636d54cfbdroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2637d54cfbdroberto						ISC_MSG_FAILED, "failed"));
2638d54cfbdroberto		return (ISC_R_UNEXPECTED);
2639d54cfbdroberto	}
2640d54cfbdroberto
2641d54cfbdroberto	isc_mem_attach(mctx, &manager->mctx);
2642d54cfbdroberto
2643d54cfbdroberto	iocompletionport_init(manager);	/* Create the Completion Ports */
2644d54cfbdroberto
2645d54cfbdroberto	manager->bShutdown = ISC_FALSE;
2646d54cfbdroberto	manager->totalSockets = 0;
2647d54cfbdroberto	manager->iocp_total = 0;
2648d54cfbdroberto
2649d54cfbdroberto	*managerp = manager;
2650d54cfbdroberto
2651d54cfbdroberto	return (ISC_R_SUCCESS);
2652d54cfbdroberto}
2653d54cfbdroberto
2654d54cfbdrobertoisc_result_t
2655047f369cyisc__socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
2656d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
2657d54cfbdroberto	REQUIRE(nsockp != NULL);
2658d54cfbdroberto
2659d54cfbdroberto	return (ISC_R_NOTIMPLEMENTED);
2660d54cfbdroberto}
2661d54cfbdroberto
2662d54cfbdrobertovoid
2663047f369cyisc__socketmgr_setstats(isc_socketmgr_t *manager, isc_stats_t *stats) {
2664d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
2665d54cfbdroberto	REQUIRE(ISC_LIST_EMPTY(manager->socklist));
2666d54cfbdroberto	REQUIRE(manager->stats == NULL);
2667d54cfbdroberto	REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
2668d54cfbdroberto
2669d54cfbdroberto	isc_stats_attach(stats, &manager->stats);
2670d54cfbdroberto}
2671d54cfbdroberto
2672d54cfbdrobertovoid
2673047f369cyisc__socketmgr_destroy(isc_socketmgr_t **managerp) {
2674d54cfbdroberto	isc_socketmgr_t *manager;
2675d54cfbdroberto	int i;
2676d54cfbdroberto	isc_mem_t *mctx;
2677d54cfbdroberto
2678d54cfbdroberto	/*
2679d54cfbdroberto	 * Destroy a socket manager.
2680d54cfbdroberto	 */
2681d54cfbdroberto
2682d54cfbdroberto	REQUIRE(managerp != NULL);
2683d54cfbdroberto	manager = *managerp;
2684d54cfbdroberto	REQUIRE(VALID_MANAGER(manager));
2685d54cfbdroberto
2686d54cfbdroberto	LOCK(&manager->lock);
2687d54cfbdroberto
2688d54cfbdroberto	/*
2689d54cfbdroberto	 * Wait for all sockets to be destroyed.
2690d54cfbdroberto	 */
2691d54cfbdroberto	while (!ISC_LIST_EMPTY(manager->socklist)) {
2692d54cfbdroberto		manager_log(manager, CREATION,
2693d54cfbdroberto			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
2694d54cfbdroberto					   ISC_MSG_SOCKETSREMAIN,
2695d54cfbdroberto					   "sockets exist"));
2696d54cfbdroberto		WAIT(&manager->shutdown_ok, &manager->lock);
2697d54cfbdroberto	}
2698d54cfbdroberto
2699d54cfbdroberto	UNLOCK(&manager->lock);
2700d54cfbdroberto
2701d54cfbdroberto	/*
2702d54cfbdroberto	 * Here, we need to had some wait code for the completion port
2703d54cfbdroberto	 * thread.
2704d54cfbdroberto	 */
2705d54cfbdroberto	signal_iocompletionport_exit(manager);
2706d54cfbdroberto	manager->bShutdown = ISC_TRUE;
2707d54cfbdroberto
2708d54cfbdroberto	/*
2709d54cfbdroberto	 * Wait for threads to exit.
2710d54cfbdroberto	 */
2711d54cfbdroberto	for (i = 0; i < manager->maxIOCPThreads; i++) {
2712d54cfbdroberto		if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i],
2713d54cfbdroberto			NULL) != ISC_R_SUCCESS)
2714d54cfbdroberto			UNEXPECTED_ERROR(__FILE__, __LINE__,
2715d54cfbdroberto				 "isc_thread_join() for Completion Port %s",
2716d54cfbdroberto				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
2717d54cfbdroberto						ISC_MSG_FAILED, "failed"));
2718d54cfbdroberto	}
2719d54cfbdroberto	/*
2720d54cfbdroberto	 * Clean up.
2721d54cfbdroberto	 */
2722d54cfbdroberto
2723d54cfbdroberto	CloseHandle(manager->hIoCompletionPort);
2724d54cfbdroberto
2725d54cfbdroberto	(void)isc_condition_destroy(&manager->shutdown_ok);
2726d54cfbdroberto
2727d54cfbdroberto	DESTROYLOCK(&manager->lock);
2728d54cfbdroberto	if (manager->stats != NULL)
2729d54cfbdroberto		isc_stats_detach(&manager->stats);
2730d54cfbdroberto	manager->magic = 0;
2731d54cfbdroberto	mctx= manager->mctx;
2732d54cfbdroberto	isc_mem_put(mctx, manager, sizeof(*manager));
2733d54cfbdroberto
2734d54cfbdroberto	isc_mem_detach(&mctx);
2735d54cfbdroberto
2736d54cfbdroberto	*managerp = NULL;
2737d54cfbdroberto}
2738d54cfbdroberto
2739d54cfbdrobertostatic void
2740d54cfbdrobertoqueue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *