xref: /illumos-gate/usr/src/cmd/sendmail/db/db/db_region.c (revision 7c478bd9)
1*7c478bd9Sstevel@tonic-gate /*-
2*7c478bd9Sstevel@tonic-gate  * See the file LICENSE for redistribution information.
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * Copyright (c) 1996, 1997, 1998
5*7c478bd9Sstevel@tonic-gate  *	Sleepycat Software.  All rights reserved.
6*7c478bd9Sstevel@tonic-gate  */
7*7c478bd9Sstevel@tonic-gate 
8*7c478bd9Sstevel@tonic-gate #include "config.h"
9*7c478bd9Sstevel@tonic-gate 
10*7c478bd9Sstevel@tonic-gate #ifndef lint
11*7c478bd9Sstevel@tonic-gate static const char sccsid[] = "@(#)db_region.c	10.53 (Sleepycat) 11/10/98";
12*7c478bd9Sstevel@tonic-gate #endif /* not lint */
13*7c478bd9Sstevel@tonic-gate 
14*7c478bd9Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
15*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
16*7c478bd9Sstevel@tonic-gate 
17*7c478bd9Sstevel@tonic-gate #include <errno.h>
18*7c478bd9Sstevel@tonic-gate #include <string.h>
19*7c478bd9Sstevel@tonic-gate #include <unistd.h>
20*7c478bd9Sstevel@tonic-gate #endif
21*7c478bd9Sstevel@tonic-gate 
22*7c478bd9Sstevel@tonic-gate #include "db_int.h"
23*7c478bd9Sstevel@tonic-gate #include "common_ext.h"
24*7c478bd9Sstevel@tonic-gate 
25*7c478bd9Sstevel@tonic-gate static int __db_growregion __P((REGINFO *, size_t));
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate /*
28*7c478bd9Sstevel@tonic-gate  * __db_rattach --
29*7c478bd9Sstevel@tonic-gate  *	Optionally create and attach to a shared memory region.
30*7c478bd9Sstevel@tonic-gate  *
31*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __db_rattach __P((REGINFO *));
32*7c478bd9Sstevel@tonic-gate  */
33*7c478bd9Sstevel@tonic-gate int
__db_rattach(infop)34*7c478bd9Sstevel@tonic-gate __db_rattach(infop)
35*7c478bd9Sstevel@tonic-gate 	REGINFO *infop;
36*7c478bd9Sstevel@tonic-gate {
37*7c478bd9Sstevel@tonic-gate 	RLAYOUT *rlp, rl;
38*7c478bd9Sstevel@tonic-gate 	size_t grow_region, size;
39*7c478bd9Sstevel@tonic-gate 	ssize_t nr, nw;
40*7c478bd9Sstevel@tonic-gate 	u_int32_t flags, mbytes, bytes;
41*7c478bd9Sstevel@tonic-gate 	u_int8_t *p;
42*7c478bd9Sstevel@tonic-gate 	int malloc_possible, ret, retry_cnt;
43*7c478bd9Sstevel@tonic-gate 
44*7c478bd9Sstevel@tonic-gate 	grow_region = 0;
45*7c478bd9Sstevel@tonic-gate 	malloc_possible = 1;
46*7c478bd9Sstevel@tonic-gate 	ret = retry_cnt = 0;
47*7c478bd9Sstevel@tonic-gate 
48*7c478bd9Sstevel@tonic-gate 	/* Round off the requested size to the next page boundary. */
49*7c478bd9Sstevel@tonic-gate 	DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
50*7c478bd9Sstevel@tonic-gate 
51*7c478bd9Sstevel@tonic-gate 	/* Some architectures have hard limits on the maximum region size. */
52*7c478bd9Sstevel@tonic-gate #ifdef DB_REGIONSIZE_MAX
53*7c478bd9Sstevel@tonic-gate 	if (infop->size > DB_REGIONSIZE_MAX) {
54*7c478bd9Sstevel@tonic-gate 		__db_err(infop->dbenv, "__db_rattach: cache size too large");
55*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
56*7c478bd9Sstevel@tonic-gate 	}
57*7c478bd9Sstevel@tonic-gate #endif
58*7c478bd9Sstevel@tonic-gate 
59*7c478bd9Sstevel@tonic-gate 	/* Intialize the return information in the REGINFO structure. */
60*7c478bd9Sstevel@tonic-gate loop:	infop->addr = NULL;
61*7c478bd9Sstevel@tonic-gate 	infop->fd = -1;
62*7c478bd9Sstevel@tonic-gate 	infop->segid = INVALID_SEGID;
63*7c478bd9Sstevel@tonic-gate 	if (infop->name != NULL) {
64*7c478bd9Sstevel@tonic-gate 		__os_freestr(infop->name);
65*7c478bd9Sstevel@tonic-gate 		infop->name = NULL;
66*7c478bd9Sstevel@tonic-gate 	}
67*7c478bd9Sstevel@tonic-gate 	F_CLR(infop, REGION_CANGROW | REGION_CREATED);
68*7c478bd9Sstevel@tonic-gate 
69*7c478bd9Sstevel@tonic-gate #ifndef HAVE_SPINLOCKS
70*7c478bd9Sstevel@tonic-gate 	/*
71*7c478bd9Sstevel@tonic-gate 	 * XXX
72*7c478bd9Sstevel@tonic-gate 	 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
73*7c478bd9Sstevel@tonic-gate 	 * locking, which implies using mmap(2) to map in a regular file.
74*7c478bd9Sstevel@tonic-gate 	 * (Theoretically, we could probably get a file descriptor to lock
75*7c478bd9Sstevel@tonic-gate 	 * other types of shared regions, but I don't see any reason to
76*7c478bd9Sstevel@tonic-gate 	 * bother.)
77*7c478bd9Sstevel@tonic-gate 	 *
78*7c478bd9Sstevel@tonic-gate 	 * Since we may be using shared memory regions, e.g., shmget(2),
79*7c478bd9Sstevel@tonic-gate 	 * and not mmap of regular files, the backing file may be only a
80*7c478bd9Sstevel@tonic-gate 	 * few tens of bytes in length.  So, this depends on the ability
81*7c478bd9Sstevel@tonic-gate 	 * to fcntl lock file offsets much larger than the physical file.
82*7c478bd9Sstevel@tonic-gate 	 */
83*7c478bd9Sstevel@tonic-gate 	malloc_possible = 0;
84*7c478bd9Sstevel@tonic-gate #endif
85*7c478bd9Sstevel@tonic-gate 
86*7c478bd9Sstevel@tonic-gate #ifdef __hppa
87*7c478bd9Sstevel@tonic-gate 	/*
88*7c478bd9Sstevel@tonic-gate 	 * XXX
89*7c478bd9Sstevel@tonic-gate 	 * HP-UX won't permit mutexes to live in anything but shared memory.
90*7c478bd9Sstevel@tonic-gate 	 * Instantiate a shared region file on that architecture, regardless.
91*7c478bd9Sstevel@tonic-gate 	 */
92*7c478bd9Sstevel@tonic-gate 	malloc_possible = 0;
93*7c478bd9Sstevel@tonic-gate #endif
94*7c478bd9Sstevel@tonic-gate 	/*
95*7c478bd9Sstevel@tonic-gate 	 * If a region is truly private, malloc the memory.  That's faster
96*7c478bd9Sstevel@tonic-gate 	 * than either anonymous memory or a shared file.
97*7c478bd9Sstevel@tonic-gate 	 */
98*7c478bd9Sstevel@tonic-gate 	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
99*7c478bd9Sstevel@tonic-gate 		if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
100*7c478bd9Sstevel@tonic-gate 			return (ret);
101*7c478bd9Sstevel@tonic-gate 
102*7c478bd9Sstevel@tonic-gate 		/*
103*7c478bd9Sstevel@tonic-gate 		 * It's sometimes significantly faster to page-fault in all of
104*7c478bd9Sstevel@tonic-gate 		 * the region's pages before we run the application, as we see
105*7c478bd9Sstevel@tonic-gate 		 * nasty side-effects when we page-fault while holding various
106*7c478bd9Sstevel@tonic-gate 		 * locks, i.e., the lock takes a long time to acquire because
107*7c478bd9Sstevel@tonic-gate 		 * of the underlying page fault, and the other threads convoy
108*7c478bd9Sstevel@tonic-gate 		 * behind the lock holder.
109*7c478bd9Sstevel@tonic-gate 		 */
110*7c478bd9Sstevel@tonic-gate 		if (DB_GLOBAL(db_region_init))
111*7c478bd9Sstevel@tonic-gate 			for (p = infop->addr;
112*7c478bd9Sstevel@tonic-gate 			    p < (u_int8_t *)infop->addr + infop->size;
113*7c478bd9Sstevel@tonic-gate 			    p += DB_VMPAGESIZE)
114*7c478bd9Sstevel@tonic-gate 				p[0] = '\0';
115*7c478bd9Sstevel@tonic-gate 
116*7c478bd9Sstevel@tonic-gate 		F_SET(infop, REGION_CREATED | REGION_MALLOC);
117*7c478bd9Sstevel@tonic-gate 		goto region_init;
118*7c478bd9Sstevel@tonic-gate 	}
119*7c478bd9Sstevel@tonic-gate 
120*7c478bd9Sstevel@tonic-gate 	/*
121*7c478bd9Sstevel@tonic-gate 	 * Get the name of the region (creating the file if a temporary file
122*7c478bd9Sstevel@tonic-gate 	 * is being used).  The dbenv contains the current DB environment,
123*7c478bd9Sstevel@tonic-gate 	 * including naming information.  The path argument may be a file or
124*7c478bd9Sstevel@tonic-gate 	 * a directory.  If path is a directory, it must exist and file is the
125*7c478bd9Sstevel@tonic-gate 	 * file name to be created inside the directory.  If path is a file,
126*7c478bd9Sstevel@tonic-gate 	 * then file must be NULL.
127*7c478bd9Sstevel@tonic-gate 	 */
128*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
129*7c478bd9Sstevel@tonic-gate 	    infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
130*7c478bd9Sstevel@tonic-gate 		return (ret);
131*7c478bd9Sstevel@tonic-gate 	if (infop->fd != -1)
132*7c478bd9Sstevel@tonic-gate 		F_SET(infop, REGION_CREATED);
133*7c478bd9Sstevel@tonic-gate 
134*7c478bd9Sstevel@tonic-gate 	/*
135*7c478bd9Sstevel@tonic-gate 	 * Try to create the file, if we have authority.  We have to make sure
136*7c478bd9Sstevel@tonic-gate 	 * that multiple threads/processes attempting to simultaneously create
137*7c478bd9Sstevel@tonic-gate 	 * the region are properly ordered, so we open it using DB_CREATE and
138*7c478bd9Sstevel@tonic-gate 	 * DB_EXCL, so two attempts to create the region will return failure in
139*7c478bd9Sstevel@tonic-gate 	 * one.
140*7c478bd9Sstevel@tonic-gate 	 */
141*7c478bd9Sstevel@tonic-gate 	if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
142*7c478bd9Sstevel@tonic-gate 		flags = infop->dbflags;
143*7c478bd9Sstevel@tonic-gate 		LF_SET(DB_EXCL);
144*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_open(infop->name,
145*7c478bd9Sstevel@tonic-gate 		    flags, flags, infop->mode, &infop->fd)) == 0)
146*7c478bd9Sstevel@tonic-gate 			F_SET(infop, REGION_CREATED);
147*7c478bd9Sstevel@tonic-gate 		else
148*7c478bd9Sstevel@tonic-gate 			if (ret != EEXIST)
149*7c478bd9Sstevel@tonic-gate 				goto errmsg;
150*7c478bd9Sstevel@tonic-gate 	}
151*7c478bd9Sstevel@tonic-gate 
152*7c478bd9Sstevel@tonic-gate 	/* If we couldn't create the file, try and open it. */
153*7c478bd9Sstevel@tonic-gate 	if (infop->fd == -1) {
154*7c478bd9Sstevel@tonic-gate 		flags = infop->dbflags;
155*7c478bd9Sstevel@tonic-gate 		LF_CLR(DB_CREATE | DB_EXCL);
156*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_open(infop->name,
157*7c478bd9Sstevel@tonic-gate 		    flags, flags, infop->mode, &infop->fd)) != 0)
158*7c478bd9Sstevel@tonic-gate 			goto errmsg;
159*7c478bd9Sstevel@tonic-gate 	}
160*7c478bd9Sstevel@tonic-gate 
161*7c478bd9Sstevel@tonic-gate 	/*
162*7c478bd9Sstevel@tonic-gate 	 * There are three cases we support:
163*7c478bd9Sstevel@tonic-gate 	 *    1. Named anonymous memory (shmget(2)).
164*7c478bd9Sstevel@tonic-gate 	 *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
165*7c478bd9Sstevel@tonic-gate 	 *    3. Memory backed by a regular file (mmap(2)).
166*7c478bd9Sstevel@tonic-gate 	 *
167*7c478bd9Sstevel@tonic-gate 	 * We instantiate a backing file in all cases, which contains at least
168*7c478bd9Sstevel@tonic-gate 	 * the RLAYOUT structure, and in case #3, contains the actual region.
169*7c478bd9Sstevel@tonic-gate 	 * This is necessary for a couple of reasons:
170*7c478bd9Sstevel@tonic-gate 	 *
171*7c478bd9Sstevel@tonic-gate 	 * First, the mpool region uses temporary files to name regions, and
172*7c478bd9Sstevel@tonic-gate 	 * since you may have multiple regions in the same directory, we need
173*7c478bd9Sstevel@tonic-gate 	 * a filesystem name to ensure that they don't collide.
174*7c478bd9Sstevel@tonic-gate 	 *
175*7c478bd9Sstevel@tonic-gate 	 * Second, applications are allowed to forcibly remove regions, even
176*7c478bd9Sstevel@tonic-gate 	 * if they don't know anything about them other than the name.  If a
177*7c478bd9Sstevel@tonic-gate 	 * region is backed by anonymous memory, there has to be some way for
178*7c478bd9Sstevel@tonic-gate 	 * the application to find out that information, and, in some cases,
179*7c478bd9Sstevel@tonic-gate 	 * determine ID information for the anonymous memory.
180*7c478bd9Sstevel@tonic-gate 	 */
181*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_CREATED)) {
182*7c478bd9Sstevel@tonic-gate 		/*
183*7c478bd9Sstevel@tonic-gate 		 * If we're using anonymous memory to back this region, set
184*7c478bd9Sstevel@tonic-gate 		 * the flag.
185*7c478bd9Sstevel@tonic-gate 		 */
186*7c478bd9Sstevel@tonic-gate 		if (DB_GLOBAL(db_region_anon))
187*7c478bd9Sstevel@tonic-gate 			F_SET(infop, REGION_ANONYMOUS);
188*7c478bd9Sstevel@tonic-gate 
189*7c478bd9Sstevel@tonic-gate 		/*
190*7c478bd9Sstevel@tonic-gate 		 * If we're using a regular file to back a region we created,
191*7c478bd9Sstevel@tonic-gate 		 * grow it to the specified size.
192*7c478bd9Sstevel@tonic-gate 		 */
193*7c478bd9Sstevel@tonic-gate 		if (!DB_GLOBAL(db_region_anon) &&
194*7c478bd9Sstevel@tonic-gate 		    (ret = __db_growregion(infop, infop->size)) != 0)
195*7c478bd9Sstevel@tonic-gate 			goto err;
196*7c478bd9Sstevel@tonic-gate 	} else {
197*7c478bd9Sstevel@tonic-gate 		/*
198*7c478bd9Sstevel@tonic-gate 		 * If we're joining a region, figure out what it looks like.
199*7c478bd9Sstevel@tonic-gate 		 *
200*7c478bd9Sstevel@tonic-gate 		 * XXX
201*7c478bd9Sstevel@tonic-gate 		 * We have to figure out if the file is a regular file backing
202*7c478bd9Sstevel@tonic-gate 		 * a region that we want to map into our address space, or a
203*7c478bd9Sstevel@tonic-gate 		 * file with the information we need to find a shared anonymous
204*7c478bd9Sstevel@tonic-gate 		 * region that we want to map into our address space.
205*7c478bd9Sstevel@tonic-gate 		 *
206*7c478bd9Sstevel@tonic-gate 		 * All this noise is because some systems don't have a coherent
207*7c478bd9Sstevel@tonic-gate 		 * VM and buffer cache, and worse, if you mix operations on the
208*7c478bd9Sstevel@tonic-gate 		 * VM and buffer cache, half the time you hang the system.
209*7c478bd9Sstevel@tonic-gate 		 *
210*7c478bd9Sstevel@tonic-gate 		 * There are two possibilities.  If the file is the size of an
211*7c478bd9Sstevel@tonic-gate 		 * RLAYOUT structure, then we know that the real region is in
212*7c478bd9Sstevel@tonic-gate 		 * shared memory, because otherwise it would be bigger.  (As
213*7c478bd9Sstevel@tonic-gate 		 * the RLAYOUT structure size is smaller than a disk sector,
214*7c478bd9Sstevel@tonic-gate 		 * the only way it can be this size is if deliberately written
215*7c478bd9Sstevel@tonic-gate 		 * that way.)  In which case, retrieve the information we need
216*7c478bd9Sstevel@tonic-gate 		 * from the RLAYOUT structure and use it to acquire the shared
217*7c478bd9Sstevel@tonic-gate 		 * memory.
218*7c478bd9Sstevel@tonic-gate 		 *
219*7c478bd9Sstevel@tonic-gate 		 * If the structure is larger than an RLAYOUT structure, then
220*7c478bd9Sstevel@tonic-gate 		 * the file is backing the shared memory region, and we use
221*7c478bd9Sstevel@tonic-gate 		 * the current size of the file without reading any information
222*7c478bd9Sstevel@tonic-gate 		 * from the file itself so that we don't confuse the VM.
223*7c478bd9Sstevel@tonic-gate 		 *
224*7c478bd9Sstevel@tonic-gate 		 * And yes, this makes me want to take somebody and kill them,
225*7c478bd9Sstevel@tonic-gate 		 * but I can't think of any other solution.
226*7c478bd9Sstevel@tonic-gate 		 */
227*7c478bd9Sstevel@tonic-gate 		if ((ret = __os_ioinfo(infop->name,
228*7c478bd9Sstevel@tonic-gate 		    infop->fd, &mbytes, &bytes, NULL)) != 0)
229*7c478bd9Sstevel@tonic-gate 			goto errmsg;
230*7c478bd9Sstevel@tonic-gate 		size = mbytes * MEGABYTE + bytes;
231*7c478bd9Sstevel@tonic-gate 
232*7c478bd9Sstevel@tonic-gate 		if (size <= sizeof(RLAYOUT)) {
233*7c478bd9Sstevel@tonic-gate 			/*
234*7c478bd9Sstevel@tonic-gate 			 * If the size is too small, the read fails or the
235*7c478bd9Sstevel@tonic-gate 			 * valid flag is incorrect, assume it's because the
236*7c478bd9Sstevel@tonic-gate 			 * RLAYOUT information hasn't been written out yet,
237*7c478bd9Sstevel@tonic-gate 			 * and retry.
238*7c478bd9Sstevel@tonic-gate 			 */
239*7c478bd9Sstevel@tonic-gate 			if (size < sizeof(RLAYOUT))
240*7c478bd9Sstevel@tonic-gate 				goto retry;
241*7c478bd9Sstevel@tonic-gate 			if ((ret =
242*7c478bd9Sstevel@tonic-gate 			    __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
243*7c478bd9Sstevel@tonic-gate 				goto retry;
244*7c478bd9Sstevel@tonic-gate 			if (rl.valid != DB_REGIONMAGIC)
245*7c478bd9Sstevel@tonic-gate 				goto retry;
246*7c478bd9Sstevel@tonic-gate 
247*7c478bd9Sstevel@tonic-gate 			/* Copy the size, memory id and characteristics. */
248*7c478bd9Sstevel@tonic-gate 			size = rl.size;
249*7c478bd9Sstevel@tonic-gate 			infop->segid = rl.segid;
250*7c478bd9Sstevel@tonic-gate 			if (F_ISSET(&rl, REGION_ANONYMOUS))
251*7c478bd9Sstevel@tonic-gate 				F_SET(infop, REGION_ANONYMOUS);
252*7c478bd9Sstevel@tonic-gate 		}
253*7c478bd9Sstevel@tonic-gate 
254*7c478bd9Sstevel@tonic-gate 		/*
255*7c478bd9Sstevel@tonic-gate 		 * If the region is larger than we think, that's okay, use the
256*7c478bd9Sstevel@tonic-gate 		 * current size.  If it's smaller than we think, and we were
257*7c478bd9Sstevel@tonic-gate 		 * just using the default size, that's okay, use the current
258*7c478bd9Sstevel@tonic-gate 		 * size.  If it's smaller than we think and we really care,
259*7c478bd9Sstevel@tonic-gate 		 * save the size and we'll catch that further down -- we can't
260*7c478bd9Sstevel@tonic-gate 		 * correct it here because we have to have a lock to grow the
261*7c478bd9Sstevel@tonic-gate 		 * region.
262*7c478bd9Sstevel@tonic-gate 		 */
263*7c478bd9Sstevel@tonic-gate 		if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
264*7c478bd9Sstevel@tonic-gate 			grow_region = infop->size;
265*7c478bd9Sstevel@tonic-gate 		infop->size = size;
266*7c478bd9Sstevel@tonic-gate 	}
267*7c478bd9Sstevel@tonic-gate 
268*7c478bd9Sstevel@tonic-gate 	/*
269*7c478bd9Sstevel@tonic-gate 	 * Map the region into our address space.  If we're creating it, the
270*7c478bd9Sstevel@tonic-gate 	 * underlying routines will make it the right size.
271*7c478bd9Sstevel@tonic-gate 	 *
272*7c478bd9Sstevel@tonic-gate 	 * There are at least two cases where we can "reasonably" fail when
273*7c478bd9Sstevel@tonic-gate 	 * we attempt to map in the region.  On Windows/95, closing the last
274*7c478bd9Sstevel@tonic-gate 	 * reference to a region causes it to be zeroed out.  On UNIX, when
275*7c478bd9Sstevel@tonic-gate 	 * using the shmget(2) interfaces, the region will no longer exist
276*7c478bd9Sstevel@tonic-gate 	 * if the system was rebooted.  In these cases, the underlying map call
277*7c478bd9Sstevel@tonic-gate 	 * returns EAGAIN, and we *remove* our file and try again.  There are
278*7c478bd9Sstevel@tonic-gate 	 * obvious races in doing this, but it should eventually settle down
279*7c478bd9Sstevel@tonic-gate 	 * to a winner and then things should proceed normally.
280*7c478bd9Sstevel@tonic-gate 	 */
281*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_mapregion(infop->name, infop)) != 0)
282*7c478bd9Sstevel@tonic-gate 		if (ret == EAGAIN) {
283*7c478bd9Sstevel@tonic-gate 			/*
284*7c478bd9Sstevel@tonic-gate 			 * Pretend we created the region even if we didn't so
285*7c478bd9Sstevel@tonic-gate 			 * that our error processing unlinks it.
286*7c478bd9Sstevel@tonic-gate 			 */
287*7c478bd9Sstevel@tonic-gate 			F_SET(infop, REGION_CREATED);
288*7c478bd9Sstevel@tonic-gate 			ret = 0;
289*7c478bd9Sstevel@tonic-gate 			goto retry;
290*7c478bd9Sstevel@tonic-gate 		} else
291*7c478bd9Sstevel@tonic-gate 			goto err;
292*7c478bd9Sstevel@tonic-gate 
293*7c478bd9Sstevel@tonic-gate region_init:
294*7c478bd9Sstevel@tonic-gate 	/*
295*7c478bd9Sstevel@tonic-gate 	 * Initialize the common region information.
296*7c478bd9Sstevel@tonic-gate 	 *
297*7c478bd9Sstevel@tonic-gate 	 * !!!
298*7c478bd9Sstevel@tonic-gate 	 * We have to order the region creates so that two processes don't try
299*7c478bd9Sstevel@tonic-gate 	 * to simultaneously create the region.  This is handled by using the
300*7c478bd9Sstevel@tonic-gate 	 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
301*7c478bd9Sstevel@tonic-gate 	 *
302*7c478bd9Sstevel@tonic-gate 	 * We also have to order region joins so that processes joining regions
303*7c478bd9Sstevel@tonic-gate 	 * never see inconsistent data.  We'd like to play permissions games
304*7c478bd9Sstevel@tonic-gate 	 * with the backing file, but we can't because WNT filesystems won't
305*7c478bd9Sstevel@tonic-gate 	 * open a file mode 0.
306*7c478bd9Sstevel@tonic-gate 	 */
307*7c478bd9Sstevel@tonic-gate 	rlp = (RLAYOUT *)infop->addr;
308*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_CREATED)) {
309*7c478bd9Sstevel@tonic-gate 		/*
310*7c478bd9Sstevel@tonic-gate 		 * The process creating the region acquires a lock before it
311*7c478bd9Sstevel@tonic-gate 		 * sets the valid flag.  Any processes joining the region will
312*7c478bd9Sstevel@tonic-gate 		 * check the valid flag before acquiring the lock.
313*7c478bd9Sstevel@tonic-gate 		 *
314*7c478bd9Sstevel@tonic-gate 		 * Check the return of __db_mutex_init() and __db_mutex_lock(),
315*7c478bd9Sstevel@tonic-gate 		 * even though we don't usually check elsewhere.  This is the
316*7c478bd9Sstevel@tonic-gate 		 * first lock we initialize and acquire, and we have to know if
317*7c478bd9Sstevel@tonic-gate 		 * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
318*7c478bd9Sstevel@tonic-gate 		 * for locking, with an in-memory filesystem specified as the
319*7c478bd9Sstevel@tonic-gate 		 * database home.)
320*7c478bd9Sstevel@tonic-gate 		 */
321*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_mutex_init(&rlp->lock,
322*7c478bd9Sstevel@tonic-gate 		    MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
323*7c478bd9Sstevel@tonic-gate 		    (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
324*7c478bd9Sstevel@tonic-gate 			goto err;
325*7c478bd9Sstevel@tonic-gate 
326*7c478bd9Sstevel@tonic-gate 		/* Initialize the remaining region information. */
327*7c478bd9Sstevel@tonic-gate 		rlp->refcnt = 1;
328*7c478bd9Sstevel@tonic-gate 		rlp->size = infop->size;
329*7c478bd9Sstevel@tonic-gate 		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
330*7c478bd9Sstevel@tonic-gate 		rlp->panic = 0;
331*7c478bd9Sstevel@tonic-gate 		rlp->segid = infop->segid;
332*7c478bd9Sstevel@tonic-gate 		rlp->flags = 0;
333*7c478bd9Sstevel@tonic-gate 		if (F_ISSET(infop, REGION_ANONYMOUS))
334*7c478bd9Sstevel@tonic-gate 			F_SET(rlp, REGION_ANONYMOUS);
335*7c478bd9Sstevel@tonic-gate 
336*7c478bd9Sstevel@tonic-gate 		/*
337*7c478bd9Sstevel@tonic-gate 		 * Fill in the valid field last -- use a magic number, memory
338*7c478bd9Sstevel@tonic-gate 		 * may not be zero-filled, and we want to minimize the chance
339*7c478bd9Sstevel@tonic-gate 		 * for collision.
340*7c478bd9Sstevel@tonic-gate 		 */
341*7c478bd9Sstevel@tonic-gate 		rlp->valid = DB_REGIONMAGIC;
342*7c478bd9Sstevel@tonic-gate 
343*7c478bd9Sstevel@tonic-gate 		/*
344*7c478bd9Sstevel@tonic-gate 		 * If the region is anonymous, write the RLAYOUT information
345*7c478bd9Sstevel@tonic-gate 		 * into the backing file so that future region join and unlink
346*7c478bd9Sstevel@tonic-gate 		 * calls can find it.
347*7c478bd9Sstevel@tonic-gate 		 *
348*7c478bd9Sstevel@tonic-gate 		 * XXX
349*7c478bd9Sstevel@tonic-gate 		 * We MUST do the seek before we do the write.  On Win95, while
350*7c478bd9Sstevel@tonic-gate 		 * closing the last reference to an anonymous shared region
351*7c478bd9Sstevel@tonic-gate 		 * doesn't discard the region, it does zero it out.  So, the
352*7c478bd9Sstevel@tonic-gate 		 * REGION_CREATED may be set, but the file may have already
353*7c478bd9Sstevel@tonic-gate 		 * been written and the file descriptor may be at the end of
354*7c478bd9Sstevel@tonic-gate 		 * the file.
355*7c478bd9Sstevel@tonic-gate 		 */
356*7c478bd9Sstevel@tonic-gate 		if (F_ISSET(infop, REGION_ANONYMOUS)) {
357*7c478bd9Sstevel@tonic-gate 			if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
358*7c478bd9Sstevel@tonic-gate 				goto err;
359*7c478bd9Sstevel@tonic-gate 			if ((ret =
360*7c478bd9Sstevel@tonic-gate 			    __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
361*7c478bd9Sstevel@tonic-gate 				goto err;
362*7c478bd9Sstevel@tonic-gate 		}
363*7c478bd9Sstevel@tonic-gate 	} else {
364*7c478bd9Sstevel@tonic-gate 		/* Check to see if the region has had catastrophic failure. */
365*7c478bd9Sstevel@tonic-gate 		if (rlp->panic) {
366*7c478bd9Sstevel@tonic-gate 			ret = DB_RUNRECOVERY;
367*7c478bd9Sstevel@tonic-gate 			goto err;
368*7c478bd9Sstevel@tonic-gate 		}
369*7c478bd9Sstevel@tonic-gate 
370*7c478bd9Sstevel@tonic-gate 		/*
371*7c478bd9Sstevel@tonic-gate 		 * Check the valid flag to ensure the region is initialized.
372*7c478bd9Sstevel@tonic-gate 		 * If the valid flag has not been set, the mutex may not have
373*7c478bd9Sstevel@tonic-gate 		 * been initialized, and an attempt to get it could lead to
374*7c478bd9Sstevel@tonic-gate 		 * random behavior.
375*7c478bd9Sstevel@tonic-gate 		 */
376*7c478bd9Sstevel@tonic-gate 		if (rlp->valid != DB_REGIONMAGIC)
377*7c478bd9Sstevel@tonic-gate 			goto retry;
378*7c478bd9Sstevel@tonic-gate 
379*7c478bd9Sstevel@tonic-gate 		/* Get the region lock. */
380*7c478bd9Sstevel@tonic-gate 		(void)__db_mutex_lock(&rlp->lock, infop->fd);
381*7c478bd9Sstevel@tonic-gate 
382*7c478bd9Sstevel@tonic-gate 		/*
383*7c478bd9Sstevel@tonic-gate 		 * We now own the region.  There are a couple of things that
384*7c478bd9Sstevel@tonic-gate 		 * may have gone wrong, however.
385*7c478bd9Sstevel@tonic-gate 		 *
386*7c478bd9Sstevel@tonic-gate 		 * Problem #1: while we were waiting for the lock, the region
387*7c478bd9Sstevel@tonic-gate 		 * was deleted.  Detected by re-checking the valid flag, since
388*7c478bd9Sstevel@tonic-gate 		 * it's cleared by the delete region routines.
389*7c478bd9Sstevel@tonic-gate 		 */
390*7c478bd9Sstevel@tonic-gate 		if (rlp->valid != DB_REGIONMAGIC) {
391*7c478bd9Sstevel@tonic-gate 			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
392*7c478bd9Sstevel@tonic-gate 			goto retry;
393*7c478bd9Sstevel@tonic-gate 		}
394*7c478bd9Sstevel@tonic-gate 
395*7c478bd9Sstevel@tonic-gate 		/*
396*7c478bd9Sstevel@tonic-gate 		 * Problem #3: when we checked the size of the file, it was
397*7c478bd9Sstevel@tonic-gate 		 * still growing as part of creation.  Detected by the fact
398*7c478bd9Sstevel@tonic-gate 		 * that infop->size isn't the same size as the region.
399*7c478bd9Sstevel@tonic-gate 		 */
400*7c478bd9Sstevel@tonic-gate 		if (infop->size != rlp->size) {
401*7c478bd9Sstevel@tonic-gate 			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
402*7c478bd9Sstevel@tonic-gate 			goto retry;
403*7c478bd9Sstevel@tonic-gate 		}
404*7c478bd9Sstevel@tonic-gate 
405*7c478bd9Sstevel@tonic-gate 		/* Increment the reference count. */
406*7c478bd9Sstevel@tonic-gate 		++rlp->refcnt;
407*7c478bd9Sstevel@tonic-gate 	}
408*7c478bd9Sstevel@tonic-gate 
409*7c478bd9Sstevel@tonic-gate 	/* Return the region in a locked condition. */
410*7c478bd9Sstevel@tonic-gate 
411*7c478bd9Sstevel@tonic-gate 	if (0) {
412*7c478bd9Sstevel@tonic-gate errmsg:		__db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
413*7c478bd9Sstevel@tonic-gate 
414*7c478bd9Sstevel@tonic-gate err:
415*7c478bd9Sstevel@tonic-gate retry:		/* Discard the region. */
416*7c478bd9Sstevel@tonic-gate 		if (infop->addr != NULL) {
417*7c478bd9Sstevel@tonic-gate 			(void)__db_unmapregion(infop);
418*7c478bd9Sstevel@tonic-gate 			infop->addr = NULL;
419*7c478bd9Sstevel@tonic-gate 		}
420*7c478bd9Sstevel@tonic-gate 
421*7c478bd9Sstevel@tonic-gate 		/* Discard the backing file. */
422*7c478bd9Sstevel@tonic-gate 		if (infop->fd != -1) {
423*7c478bd9Sstevel@tonic-gate 			(void)__os_close(infop->fd);
424*7c478bd9Sstevel@tonic-gate 			infop->fd = -1;
425*7c478bd9Sstevel@tonic-gate 
426*7c478bd9Sstevel@tonic-gate 			if (F_ISSET(infop, REGION_CREATED))
427*7c478bd9Sstevel@tonic-gate 				(void)__os_unlink(infop->name);
428*7c478bd9Sstevel@tonic-gate 		}
429*7c478bd9Sstevel@tonic-gate 
430*7c478bd9Sstevel@tonic-gate 		/* Discard the name. */
431*7c478bd9Sstevel@tonic-gate 		if (infop->name != NULL) {
432*7c478bd9Sstevel@tonic-gate 			__os_freestr(infop->name);
433*7c478bd9Sstevel@tonic-gate 			infop->name = NULL;
434*7c478bd9Sstevel@tonic-gate 		}
435*7c478bd9Sstevel@tonic-gate 
436*7c478bd9Sstevel@tonic-gate 		/*
437*7c478bd9Sstevel@tonic-gate 		 * If we had a temporary error, wait a few seconds and
438*7c478bd9Sstevel@tonic-gate 		 * try again.
439*7c478bd9Sstevel@tonic-gate 		 */
440*7c478bd9Sstevel@tonic-gate 		if (ret == 0) {
441*7c478bd9Sstevel@tonic-gate 			if (++retry_cnt <= 3) {
442*7c478bd9Sstevel@tonic-gate 				__os_sleep(retry_cnt * 2, 0);
443*7c478bd9Sstevel@tonic-gate 				goto loop;
444*7c478bd9Sstevel@tonic-gate 			}
445*7c478bd9Sstevel@tonic-gate 			ret = EAGAIN;
446*7c478bd9Sstevel@tonic-gate 		}
447*7c478bd9Sstevel@tonic-gate 	}
448*7c478bd9Sstevel@tonic-gate 
449*7c478bd9Sstevel@tonic-gate 	/*
450*7c478bd9Sstevel@tonic-gate 	 * XXX
451*7c478bd9Sstevel@tonic-gate 	 * HP-UX won't permit mutexes to live in anything but shared memory.
452*7c478bd9Sstevel@tonic-gate 	 * Instantiate a shared region file on that architecture, regardless.
453*7c478bd9Sstevel@tonic-gate 	 *
454*7c478bd9Sstevel@tonic-gate 	 * XXX
455*7c478bd9Sstevel@tonic-gate 	 * There's a problem in cleaning this up on application exit, or on
456*7c478bd9Sstevel@tonic-gate 	 * application failure.  If an application opens a database without
457*7c478bd9Sstevel@tonic-gate 	 * an environment, we create a temporary backing mpool region for it.
458*7c478bd9Sstevel@tonic-gate 	 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
459*7c478bd9Sstevel@tonic-gate 	 * mutexes to live in anything but shared memory, we instantiate a
460*7c478bd9Sstevel@tonic-gate 	 * real file plus a memory region of some form.  If the application
461*7c478bd9Sstevel@tonic-gate 	 * crashes, the necessary information to delete the backing file and
462*7c478bd9Sstevel@tonic-gate 	 * any system region (e.g., the shmget(2) segment ID) is no longer
463*7c478bd9Sstevel@tonic-gate 	 * available.  We can't completely fix the problem, but we try.
464*7c478bd9Sstevel@tonic-gate 	 *
465*7c478bd9Sstevel@tonic-gate 	 * The underlying UNIX __db_mapregion() code preferentially uses the
466*7c478bd9Sstevel@tonic-gate 	 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
467*7c478bd9Sstevel@tonic-gate 	 * that are marked REGION_PRIVATE.  This means that we normally aren't
468*7c478bd9Sstevel@tonic-gate 	 * holding any system resources when we get here, in which case we can
469*7c478bd9Sstevel@tonic-gate 	 * delete the backing file.  This results in a short race, from the
470*7c478bd9Sstevel@tonic-gate 	 * __db_open() call above to here.
471*7c478bd9Sstevel@tonic-gate 	 *
472*7c478bd9Sstevel@tonic-gate 	 * If, for some reason, we are holding system resources when we get
473*7c478bd9Sstevel@tonic-gate 	 * here, we don't have any choice -- we can't delete the backing file
474*7c478bd9Sstevel@tonic-gate 	 * because we may need it to detach from the resources.  Set the
475*7c478bd9Sstevel@tonic-gate 	 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
476*7c478bd9Sstevel@tonic-gate 	 * the application closes the region.
477*7c478bd9Sstevel@tonic-gate 	 */
478*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
479*7c478bd9Sstevel@tonic-gate 		if (F_ISSET(infop, REGION_HOLDINGSYS))
480*7c478bd9Sstevel@tonic-gate 			F_SET(infop, REGION_LASTDETACH);
481*7c478bd9Sstevel@tonic-gate 		else {
482*7c478bd9Sstevel@tonic-gate 			F_SET(infop, REGION_REMOVED);
483*7c478bd9Sstevel@tonic-gate 			F_CLR(infop, REGION_CANGROW);
484*7c478bd9Sstevel@tonic-gate 
485*7c478bd9Sstevel@tonic-gate 			(void)__os_close(infop->fd);
486*7c478bd9Sstevel@tonic-gate 			(void)__os_unlink(infop->name);
487*7c478bd9Sstevel@tonic-gate 		}
488*7c478bd9Sstevel@tonic-gate 
489*7c478bd9Sstevel@tonic-gate 	return (ret);
490*7c478bd9Sstevel@tonic-gate }
491*7c478bd9Sstevel@tonic-gate 
492*7c478bd9Sstevel@tonic-gate /*
493*7c478bd9Sstevel@tonic-gate  * __db_rdetach --
494*7c478bd9Sstevel@tonic-gate  *	De-attach from a shared memory region.
495*7c478bd9Sstevel@tonic-gate  *
496*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __db_rdetach __P((REGINFO *));
497*7c478bd9Sstevel@tonic-gate  */
498*7c478bd9Sstevel@tonic-gate int
__db_rdetach(infop)499*7c478bd9Sstevel@tonic-gate __db_rdetach(infop)
500*7c478bd9Sstevel@tonic-gate 	REGINFO *infop;
501*7c478bd9Sstevel@tonic-gate {
502*7c478bd9Sstevel@tonic-gate 	RLAYOUT *rlp;
503*7c478bd9Sstevel@tonic-gate 	int detach, ret, t_ret;
504*7c478bd9Sstevel@tonic-gate 
505*7c478bd9Sstevel@tonic-gate 	ret = 0;
506*7c478bd9Sstevel@tonic-gate 
507*7c478bd9Sstevel@tonic-gate 	/*
508*7c478bd9Sstevel@tonic-gate 	 * If the region was removed when it was created, no further action
509*7c478bd9Sstevel@tonic-gate 	 * is required.
510*7c478bd9Sstevel@tonic-gate 	 */
511*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_REMOVED))
512*7c478bd9Sstevel@tonic-gate 		goto done;
513*7c478bd9Sstevel@tonic-gate 	/*
514*7c478bd9Sstevel@tonic-gate 	 * If the region was created in memory returned by malloc, the only
515*7c478bd9Sstevel@tonic-gate 	 * action required is freeing the memory.
516*7c478bd9Sstevel@tonic-gate 	 */
517*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_MALLOC)) {
518*7c478bd9Sstevel@tonic-gate 		__os_free(infop->addr, 0);
519*7c478bd9Sstevel@tonic-gate 		goto done;
520*7c478bd9Sstevel@tonic-gate 	}
521*7c478bd9Sstevel@tonic-gate 
522*7c478bd9Sstevel@tonic-gate 	/* Otherwise, attach to the region and optionally delete it. */
523*7c478bd9Sstevel@tonic-gate 	rlp = infop->addr;
524*7c478bd9Sstevel@tonic-gate 
525*7c478bd9Sstevel@tonic-gate 	/* Get the lock. */
526*7c478bd9Sstevel@tonic-gate 	(void)__db_mutex_lock(&rlp->lock, infop->fd);
527*7c478bd9Sstevel@tonic-gate 
528*7c478bd9Sstevel@tonic-gate 	/* Decrement the reference count. */
529*7c478bd9Sstevel@tonic-gate 	if (rlp->refcnt == 0)
530*7c478bd9Sstevel@tonic-gate 		__db_err(infop->dbenv,
531*7c478bd9Sstevel@tonic-gate 		    "region rdetach: reference count went to zero!");
532*7c478bd9Sstevel@tonic-gate 	else
533*7c478bd9Sstevel@tonic-gate 		--rlp->refcnt;
534*7c478bd9Sstevel@tonic-gate 
535*7c478bd9Sstevel@tonic-gate 	/*
536*7c478bd9Sstevel@tonic-gate 	 * If we're going to remove the region, clear the valid flag so
537*7c478bd9Sstevel@tonic-gate 	 * that any region join that's blocked waiting for us will know
538*7c478bd9Sstevel@tonic-gate 	 * what happened.
539*7c478bd9Sstevel@tonic-gate 	 */
540*7c478bd9Sstevel@tonic-gate 	detach = 0;
541*7c478bd9Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_LASTDETACH))
542*7c478bd9Sstevel@tonic-gate 		if (rlp->refcnt == 0) {
543*7c478bd9Sstevel@tonic-gate 			detach = 1;
544*7c478bd9Sstevel@tonic-gate 			rlp->valid = 0;
545*7c478bd9Sstevel@tonic-gate 		} else
546*7c478bd9Sstevel@tonic-gate 			ret = EBUSY;
547*7c478bd9Sstevel@tonic-gate 
548*7c478bd9Sstevel@tonic-gate 	/* Release the lock. */
549*7c478bd9Sstevel@tonic-gate 	(void)__db_mutex_unlock(&rlp->lock, infop->fd);
550*7c478bd9Sstevel@tonic-gate 
551*7c478bd9Sstevel@tonic-gate 	/* Close the backing file descriptor. */
552*7c478bd9Sstevel@tonic-gate 	(void)__os_close(infop->fd);
553*7c478bd9Sstevel@tonic-gate 	infop->fd = -1;
554*7c478bd9Sstevel@tonic-gate 
555*7c478bd9Sstevel@tonic-gate 	/* Discard our mapping of the region. */
556*7c478bd9Sstevel@tonic-gate 	if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
557*7c478bd9Sstevel@tonic-gate 		ret = t_ret;
558*7c478bd9Sstevel@tonic-gate 
559*7c478bd9Sstevel@tonic-gate 	/* Discard the region itself. */
560*7c478bd9Sstevel@tonic-gate 	if (detach) {
561*7c478bd9Sstevel@tonic-gate 		if ((t_ret =
562*7c478bd9Sstevel@tonic-gate 		    __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
563*7c478bd9Sstevel@tonic-gate 			ret = t_ret;
564*7c478bd9Sstevel@tonic-gate 		if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
565*7c478bd9Sstevel@tonic-gate 			ret = t_ret;
566*7c478bd9Sstevel@tonic-gate 	}
567*7c478bd9Sstevel@tonic-gate 
568*7c478bd9Sstevel@tonic-gate done:	/* Discard the name. */
569*7c478bd9Sstevel@tonic-gate 	if (infop->name != NULL) {
570*7c478bd9Sstevel@tonic-gate 		__os_freestr(infop->name);
571*7c478bd9Sstevel@tonic-gate 		infop->name = NULL;
572*7c478bd9Sstevel@tonic-gate 	}
573*7c478bd9Sstevel@tonic-gate 
574*7c478bd9Sstevel@tonic-gate 	return (ret);
575*7c478bd9Sstevel@tonic-gate }
576*7c478bd9Sstevel@tonic-gate 
577*7c478bd9Sstevel@tonic-gate /*
578*7c478bd9Sstevel@tonic-gate  * __db_runlink --
579*7c478bd9Sstevel@tonic-gate  *	Remove a region.
580*7c478bd9Sstevel@tonic-gate  *
581*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __db_runlink __P((REGINFO *, int));
582*7c478bd9Sstevel@tonic-gate  */
583*7c478bd9Sstevel@tonic-gate int
__db_runlink(infop,force)584*7c478bd9Sstevel@tonic-gate __db_runlink(infop, force)
585*7c478bd9Sstevel@tonic-gate 	REGINFO *infop;
586*7c478bd9Sstevel@tonic-gate 	int force;
587*7c478bd9Sstevel@tonic-gate {
588*7c478bd9Sstevel@tonic-gate 	RLAYOUT rl, *rlp;
589*7c478bd9Sstevel@tonic-gate 	size_t size;
590*7c478bd9Sstevel@tonic-gate 	ssize_t nr;
591*7c478bd9Sstevel@tonic-gate 	u_int32_t mbytes, bytes;
592*7c478bd9Sstevel@tonic-gate 	int fd, ret, t_ret;
593*7c478bd9Sstevel@tonic-gate 	char *name;
594*7c478bd9Sstevel@tonic-gate 
595*7c478bd9Sstevel@tonic-gate 	/*
596*7c478bd9Sstevel@tonic-gate 	 * XXX
597*7c478bd9Sstevel@tonic-gate 	 * We assume that we've created a new REGINFO structure for this
598*7c478bd9Sstevel@tonic-gate 	 * call, not used one that was already initialized.  Regardless,
599*7c478bd9Sstevel@tonic-gate 	 * if anyone is planning to use it after we're done, they're going
600*7c478bd9Sstevel@tonic-gate 	 * to be sorely disappointed.
601*7c478bd9Sstevel@tonic-gate 	 *
602*7c478bd9Sstevel@tonic-gate 	 * If force isn't set, we attach to the region, set a flag to delete
603*7c478bd9Sstevel@tonic-gate 	 * the region on last close, and let the region delete code do the
604*7c478bd9Sstevel@tonic-gate 	 * work.
605*7c478bd9Sstevel@tonic-gate 	 */
606*7c478bd9Sstevel@tonic-gate 	if (!force) {
607*7c478bd9Sstevel@tonic-gate 		if ((ret = __db_rattach(infop)) != 0)
608*7c478bd9Sstevel@tonic-gate 			return (ret);
609*7c478bd9Sstevel@tonic-gate 
610*7c478bd9Sstevel@tonic-gate 		rlp = (RLAYOUT *)infop->addr;
611*7c478bd9Sstevel@tonic-gate 		(void)__db_mutex_unlock(&rlp->lock, infop->fd);
612*7c478bd9Sstevel@tonic-gate 
613*7c478bd9Sstevel@tonic-gate 		F_SET(infop, REGION_LASTDETACH);
614*7c478bd9Sstevel@tonic-gate 
615*7c478bd9Sstevel@tonic-gate 		return (__db_rdetach(infop));
616*7c478bd9Sstevel@tonic-gate 	}
617*7c478bd9Sstevel@tonic-gate 
618*7c478bd9Sstevel@tonic-gate 	/*
619*7c478bd9Sstevel@tonic-gate 	 * Otherwise, we don't want to attach to the region.  We may have been
620*7c478bd9Sstevel@tonic-gate 	 * called to clean up if a process died leaving a region locked and/or
621*7c478bd9Sstevel@tonic-gate 	 * corrupted, which could cause the attach to hang.
622*7c478bd9Sstevel@tonic-gate 	 */
623*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_appname(infop->dbenv, infop->appname,
624*7c478bd9Sstevel@tonic-gate 	    infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
625*7c478bd9Sstevel@tonic-gate 		return (ret);
626*7c478bd9Sstevel@tonic-gate 
627*7c478bd9Sstevel@tonic-gate 	/*
628*7c478bd9Sstevel@tonic-gate 	 * An underlying file is created for all regions other than private
629*7c478bd9Sstevel@tonic-gate 	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
630*7c478bd9Sstevel@tonic-gate 	 * back the region.  If that file doesn't exist, we're done.
631*7c478bd9Sstevel@tonic-gate 	 */
632*7c478bd9Sstevel@tonic-gate 	if (__os_exists(name, NULL) != 0) {
633*7c478bd9Sstevel@tonic-gate 		__os_freestr(name);
634*7c478bd9Sstevel@tonic-gate 		return (0);
635*7c478bd9Sstevel@tonic-gate 	}
636*7c478bd9Sstevel@tonic-gate 
637*7c478bd9Sstevel@tonic-gate 	/*
638*7c478bd9Sstevel@tonic-gate 	 * See the comments in __db_rattach -- figure out if this is a regular
639*7c478bd9Sstevel@tonic-gate 	 * file backing a region or if it's a regular file with information
640*7c478bd9Sstevel@tonic-gate 	 * about a region.
641*7c478bd9Sstevel@tonic-gate 	 */
642*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
643*7c478bd9Sstevel@tonic-gate 		goto errmsg;
644*7c478bd9Sstevel@tonic-gate 	if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
645*7c478bd9Sstevel@tonic-gate 		goto errmsg;
646*7c478bd9Sstevel@tonic-gate 	size = mbytes * MEGABYTE + bytes;
647*7c478bd9Sstevel@tonic-gate 
648*7c478bd9Sstevel@tonic-gate 	if (size <= sizeof(RLAYOUT)) {
649*7c478bd9Sstevel@tonic-gate 		if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
650*7c478bd9Sstevel@tonic-gate 			goto errmsg;
651*7c478bd9Sstevel@tonic-gate 		if (rl.valid != DB_REGIONMAGIC) {
652*7c478bd9Sstevel@tonic-gate 			__db_err(infop->dbenv,
653*7c478bd9Sstevel@tonic-gate 			    "%s: illegal region magic number", name);
654*7c478bd9Sstevel@tonic-gate 			ret = EINVAL;
655*7c478bd9Sstevel@tonic-gate 			goto err;
656*7c478bd9Sstevel@tonic-gate 		}
657*7c478bd9Sstevel@tonic-gate 
658*7c478bd9Sstevel@tonic-gate 		/* Set the size, memory id and characteristics. */
659*7c478bd9Sstevel@tonic-gate 		infop->size = rl.size;
660*7c478bd9Sstevel@tonic-gate 		infop->segid = rl.segid;
661*7c478bd9Sstevel@tonic-gate 		if (F_ISSET(&rl, REGION_ANONYMOUS))
662*7c478bd9Sstevel@tonic-gate 			F_SET(infop, REGION_ANONYMOUS);
663*7c478bd9Sstevel@tonic-gate 	} else {
664*7c478bd9Sstevel@tonic-gate 		infop->size = size;
665*7c478bd9Sstevel@tonic-gate 		infop->segid = INVALID_SEGID;
666*7c478bd9Sstevel@tonic-gate 	}
667*7c478bd9Sstevel@tonic-gate 
668*7c478bd9Sstevel@tonic-gate 	/* Remove the underlying region. */
669*7c478bd9Sstevel@tonic-gate 	ret = __db_unlinkregion(name, infop);
670*7c478bd9Sstevel@tonic-gate 
671*7c478bd9Sstevel@tonic-gate 	/*
672*7c478bd9Sstevel@tonic-gate 	 * Unlink the backing file.  Close the open file descriptor first,
673*7c478bd9Sstevel@tonic-gate 	 * because some architectures (e.g., Win32) won't unlink a file if
674*7c478bd9Sstevel@tonic-gate 	 * open file descriptors remain.
675*7c478bd9Sstevel@tonic-gate 	 */
676*7c478bd9Sstevel@tonic-gate 	(void)__os_close(fd);
677*7c478bd9Sstevel@tonic-gate 	if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
678*7c478bd9Sstevel@tonic-gate 		ret = t_ret;
679*7c478bd9Sstevel@tonic-gate 
680*7c478bd9Sstevel@tonic-gate 	if (0) {
681*7c478bd9Sstevel@tonic-gate errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
682*7c478bd9Sstevel@tonic-gate err:		(void)__os_close(fd);
683*7c478bd9Sstevel@tonic-gate 	}
684*7c478bd9Sstevel@tonic-gate 
685*7c478bd9Sstevel@tonic-gate 	__os_freestr(name);
686*7c478bd9Sstevel@tonic-gate 	return (ret);
687*7c478bd9Sstevel@tonic-gate }
688*7c478bd9Sstevel@tonic-gate 
689*7c478bd9Sstevel@tonic-gate /*
690*7c478bd9Sstevel@tonic-gate  * __db_rgrow --
691*7c478bd9Sstevel@tonic-gate  *	Extend a region.
692*7c478bd9Sstevel@tonic-gate  *
693*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
694*7c478bd9Sstevel@tonic-gate  */
695*7c478bd9Sstevel@tonic-gate int
__db_rgrow(infop,new_size)696*7c478bd9Sstevel@tonic-gate __db_rgrow(infop, new_size)
697*7c478bd9Sstevel@tonic-gate 	REGINFO *infop;
698*7c478bd9Sstevel@tonic-gate 	size_t new_size;
699*7c478bd9Sstevel@tonic-gate {
700*7c478bd9Sstevel@tonic-gate 	RLAYOUT *rlp;
701*7c478bd9Sstevel@tonic-gate 	size_t increment;
702*7c478bd9Sstevel@tonic-gate 	int ret;
703*7c478bd9Sstevel@tonic-gate 
704*7c478bd9Sstevel@tonic-gate 	/*
705*7c478bd9Sstevel@tonic-gate 	 * !!!
706*7c478bd9Sstevel@tonic-gate 	 * This routine MUST be called with the region already locked.
707*7c478bd9Sstevel@tonic-gate 	 */
708*7c478bd9Sstevel@tonic-gate 
709*7c478bd9Sstevel@tonic-gate 	/* The underlying routines have flagged if this region can grow. */
710*7c478bd9Sstevel@tonic-gate 	if (!F_ISSET(infop, REGION_CANGROW))
711*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
712*7c478bd9Sstevel@tonic-gate 
713*7c478bd9Sstevel@tonic-gate 	/*
714*7c478bd9Sstevel@tonic-gate 	 * Round off the requested size to the next page boundary, and
715*7c478bd9Sstevel@tonic-gate 	 * determine the additional space required.
716*7c478bd9Sstevel@tonic-gate 	 */
717*7c478bd9Sstevel@tonic-gate 	rlp = (RLAYOUT *)infop->addr;
718*7c478bd9Sstevel@tonic-gate 	DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
719*7c478bd9Sstevel@tonic-gate 	increment = new_size - rlp->size;
720*7c478bd9Sstevel@tonic-gate 
721*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_growregion(infop, increment)) != 0)
722*7c478bd9Sstevel@tonic-gate 		return (ret);
723*7c478bd9Sstevel@tonic-gate 
724*7c478bd9Sstevel@tonic-gate 	/* Update the on-disk region size. */
725*7c478bd9Sstevel@tonic-gate 	rlp->size = new_size;
726*7c478bd9Sstevel@tonic-gate 
727*7c478bd9Sstevel@tonic-gate 	/* Detach from and reattach to the region. */
728*7c478bd9Sstevel@tonic-gate 	return (__db_rreattach(infop, new_size));
729*7c478bd9Sstevel@tonic-gate }
730*7c478bd9Sstevel@tonic-gate 
731*7c478bd9Sstevel@tonic-gate /*
732*7c478bd9Sstevel@tonic-gate  * __db_growregion --
733*7c478bd9Sstevel@tonic-gate  *	Grow a shared memory region.
734*7c478bd9Sstevel@tonic-gate  */
735*7c478bd9Sstevel@tonic-gate static int
__db_growregion(infop,increment)736*7c478bd9Sstevel@tonic-gate __db_growregion(infop, increment)
737*7c478bd9Sstevel@tonic-gate 	REGINFO *infop;
738*7c478bd9Sstevel@tonic-gate 	size_t increment;
739*7c478bd9Sstevel@tonic-gate {
740*7c478bd9Sstevel@tonic-gate 	db_pgno_t pages;
741*7c478bd9Sstevel@tonic-gate 	size_t i;
742*7c478bd9Sstevel@tonic-gate 	ssize_t nr, nw;
743*7c478bd9Sstevel@tonic-gate 	u_int32_t relative;
744*7c478bd9Sstevel@tonic-gate 	int ret;
745*7c478bd9Sstevel@tonic-gate 	char buf[DB_VMPAGESIZE];
746*7c478bd9Sstevel@tonic-gate 
747*7c478bd9Sstevel@tonic-gate 	/* Seek to the end of the region. */
748*7c478bd9Sstevel@tonic-gate 	if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
749*7c478bd9Sstevel@tonic-gate 		goto err;
750*7c478bd9Sstevel@tonic-gate 
751*7c478bd9Sstevel@tonic-gate 	/* Write nuls to the new bytes. */
752*7c478bd9Sstevel@tonic-gate 	memset(buf, 0, sizeof(buf));
753*7c478bd9Sstevel@tonic-gate 
754*7c478bd9Sstevel@tonic-gate 	/*
755*7c478bd9Sstevel@tonic-gate 	 * Some systems require that all of the bytes of the region be
756*7c478bd9Sstevel@tonic-gate 	 * written before it can be mapped and accessed randomly, and
757*7c478bd9Sstevel@tonic-gate 	 * other systems don't zero out the pages.
758*7c478bd9Sstevel@tonic-gate 	 */
759*7c478bd9Sstevel@tonic-gate 	if (__db_mapinit())
760*7c478bd9Sstevel@tonic-gate 		/* Extend the region by writing each new page. */
761*7c478bd9Sstevel@tonic-gate 		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
762*7c478bd9Sstevel@tonic-gate 			if ((ret =
763*7c478bd9Sstevel@tonic-gate 			    __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
764*7c478bd9Sstevel@tonic-gate 				goto err;
765*7c478bd9Sstevel@tonic-gate 			if (nw != sizeof(buf))
766*7c478bd9Sstevel@tonic-gate 				goto eio;
767*7c478bd9Sstevel@tonic-gate 		}
768*7c478bd9Sstevel@tonic-gate 	else {
769*7c478bd9Sstevel@tonic-gate 		/*
770*7c478bd9Sstevel@tonic-gate 		 * Extend the region by writing the last page.  If the region
771*7c478bd9Sstevel@tonic-gate 		 * is >4Gb, increment may be larger than the maximum possible
772*7c478bd9Sstevel@tonic-gate 		 * seek "relative" argument, as it's an unsigned 32-bit value.
773*7c478bd9Sstevel@tonic-gate 		 * Break the offset into pages of 1MB each so that we don't
774*7c478bd9Sstevel@tonic-gate 		 * overflow (2^20 + 2^32 is bigger than any memory I expect
775*7c478bd9Sstevel@tonic-gate 		 * to see for awhile).
776*7c478bd9Sstevel@tonic-gate 		 */
777*7c478bd9Sstevel@tonic-gate 		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
778*7c478bd9Sstevel@tonic-gate 		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
779*7c478bd9Sstevel@tonic-gate 		if ((ret = __os_seek(infop->fd,
780*7c478bd9Sstevel@tonic-gate 		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
781*7c478bd9Sstevel@tonic-gate 			goto err;
782*7c478bd9Sstevel@tonic-gate 		if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
783*7c478bd9Sstevel@tonic-gate 			goto err;
784*7c478bd9Sstevel@tonic-gate 		if (nw != sizeof(buf))
785*7c478bd9Sstevel@tonic-gate 			goto eio;
786*7c478bd9Sstevel@tonic-gate 
787*7c478bd9Sstevel@tonic-gate 		/*
788*7c478bd9Sstevel@tonic-gate 		 * It's sometimes significantly faster to page-fault in all of
789*7c478bd9Sstevel@tonic-gate 		 * the region's pages before we run the application, as we see
790*7c478bd9Sstevel@tonic-gate 		 * nasty side-effects when we page-fault while holding various
791*7c478bd9Sstevel@tonic-gate 		 * locks, i.e., the lock takes a long time to acquire because
792*7c478bd9Sstevel@tonic-gate 		 * of the underlying page fault, and the other threads convoy
793*7c478bd9Sstevel@tonic-gate 		 * behind the lock holder.
794*7c478bd9Sstevel@tonic-gate 		 *
795*7c478bd9Sstevel@tonic-gate 		 * We also use REGION_INIT to guarantee that there is enough
796*7c478bd9Sstevel@tonic-gate 		 * disk space for the region, so we also write a byte to each
797*7c478bd9Sstevel@tonic-gate 		 * page.  Reading the byte is insufficient as some systems
798*7c478bd9Sstevel@tonic-gate 		 * (e.g., Solaris) do not instantiate disk pages to satisfy
799*7c478bd9Sstevel@tonic-gate 		 * a read, and so we don't know if there is enough disk space
800*7c478bd9Sstevel@tonic-gate 		 * or not.
801*7c478bd9Sstevel@tonic-gate 		 */
802*7c478bd9Sstevel@tonic-gate 		if (DB_GLOBAL(db_region_init)) {
803*7c478bd9Sstevel@tonic-gate 			pages = increment / MEGABYTE;
804*7c478bd9Sstevel@tonic-gate 			relative = increment % MEGABYTE;
805*7c478bd9Sstevel@tonic-gate 			if ((ret = __os_seek(infop->fd,
806*7c478bd9Sstevel@tonic-gate 			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
807*7c478bd9Sstevel@tonic-gate 				goto err;
808*7c478bd9Sstevel@tonic-gate 
809*7c478bd9Sstevel@tonic-gate 			/* Write a byte to each page. */
810*7c478bd9Sstevel@tonic-gate 			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
811*7c478bd9Sstevel@tonic-gate 				if ((ret =
812*7c478bd9Sstevel@tonic-gate 				    __os_write(infop->fd, buf, 1, &nr)) != 0)
813*7c478bd9Sstevel@tonic-gate 					goto err;
814*7c478bd9Sstevel@tonic-gate 				if (nr != 1)
815*7c478bd9Sstevel@tonic-gate 					goto eio;
816*7c478bd9Sstevel@tonic-gate 				if ((ret = __os_seek(infop->fd,
817*7c478bd9Sstevel@tonic-gate 				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
818*7c478bd9Sstevel@tonic-gate 					goto err;
819*7c478bd9Sstevel@tonic-gate 			}
820*7c478bd9Sstevel@tonic-gate 		}
821*7c478bd9Sstevel@tonic-gate 	}
822*7c478bd9Sstevel@tonic-gate 	return (0);
823*7c478bd9Sstevel@tonic-gate 
824*7c478bd9Sstevel@tonic-gate eio:	ret = EIO;
825*7c478bd9Sstevel@tonic-gate err:	__db_err(infop->dbenv, "region grow: %s", strerror(ret));
826*7c478bd9Sstevel@tonic-gate 	return (ret);
827*7c478bd9Sstevel@tonic-gate }
828*7c478bd9Sstevel@tonic-gate 
829*7c478bd9Sstevel@tonic-gate /*
830*7c478bd9Sstevel@tonic-gate  * __db_rreattach --
831*7c478bd9Sstevel@tonic-gate  *	Detach from and reattach to a region.
832*7c478bd9Sstevel@tonic-gate  *
833*7c478bd9Sstevel@tonic-gate  * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
834*7c478bd9Sstevel@tonic-gate  */
835*7c478bd9Sstevel@tonic-gate int
__db_rreattach(infop,new_size)836*7c478bd9Sstevel@tonic-gate __db_rreattach(infop, new_size)
837*7c478bd9Sstevel@tonic-gate 	REGINFO *infop;
838*7c478bd9Sstevel@tonic-gate 	size_t new_size;
839*7c478bd9Sstevel@tonic-gate {
840*7c478bd9Sstevel@tonic-gate 	int ret;
841*7c478bd9Sstevel@tonic-gate 
842*7c478bd9Sstevel@tonic-gate #ifdef DIAGNOSTIC
843*7c478bd9Sstevel@tonic-gate 	if (infop->name == NULL) {
844*7c478bd9Sstevel@tonic-gate 		__db_err(infop->dbenv, "__db_rreattach: name was NULL");
845*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
846*7c478bd9Sstevel@tonic-gate 	}
847*7c478bd9Sstevel@tonic-gate #endif
848*7c478bd9Sstevel@tonic-gate 	/*
849*7c478bd9Sstevel@tonic-gate 	 * If we're growing an already mapped region, we have to unmap it
850*7c478bd9Sstevel@tonic-gate 	 * and get it back.  We have it locked, so nobody else can get in,
851*7c478bd9Sstevel@tonic-gate 	 * which makes it fairly straight-forward to do, as everybody else
852*7c478bd9Sstevel@tonic-gate 	 * is going to block while we do the unmap/remap.  NB: if we fail
853*7c478bd9Sstevel@tonic-gate 	 * to get it back, the pooch is genuinely screwed, because we can
854*7c478bd9Sstevel@tonic-gate 	 * never release the lock we're holding.
855*7c478bd9Sstevel@tonic-gate 	 *
856*7c478bd9Sstevel@tonic-gate 	 * Detach from the region.  We have to do this first so architectures
857*7c478bd9Sstevel@tonic-gate 	 * that don't permit a file to be mapped into different places in the
858*7c478bd9Sstevel@tonic-gate 	 * address space simultaneously, e.g., HP's PaRisc, will work.
859*7c478bd9Sstevel@tonic-gate 	 */
860*7c478bd9Sstevel@tonic-gate 	if ((ret = __db_unmapregion(infop)) != 0)
861*7c478bd9Sstevel@tonic-gate 		return (ret);
862*7c478bd9Sstevel@tonic-gate 
863*7c478bd9Sstevel@tonic-gate 	/* Update the caller's REGINFO size to the new map size. */
864*7c478bd9Sstevel@tonic-gate 	infop->size = new_size;
865*7c478bd9Sstevel@tonic-gate 
866*7c478bd9Sstevel@tonic-gate 	/* Attach to the region. */
867*7c478bd9Sstevel@tonic-gate 	ret = __db_mapregion(infop->name, infop);
868*7c478bd9Sstevel@tonic-gate 
869*7c478bd9Sstevel@tonic-gate 	return (ret);
870*7c478bd9Sstevel@tonic-gate }
871