/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */

#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)db_region.c 10.53 (Sleepycat) 11/10/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
/*
 * NOTE(review): the header names were missing from this copy of the file.
 * These four supply what the code below uses (size_t/ssize_t, the errno
 * constants, memset/strerror, SEEK_CUR/SEEK_END) -- confirm against the
 * project's canonical source.
 */
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#include <unistd.h>
#endif

#include "db_int.h"
#include "common_ext.h"

static int __db_growregion __P((REGINFO *, size_t));

/*
 * __db_rattach --
 *	Optionally create and attach to a shared memory region.
 *
 *	On success (return of 0) infop->addr refers to the region and the
 *	region's lock is held by the caller.  On failure the region mapping,
 *	the backing file descriptor and the name are all released and a
 *	non-zero error value is returned; transient failures are retried a
 *	few times before EAGAIN is returned.
 *
 * PUBLIC: int __db_rattach __P((REGINFO *));
 */
int
__db_rattach(infop)
	REGINFO *infop;
{
	RLAYOUT *rlp, rl;
	size_t grow_region, size;
	ssize_t nr, nw;
	u_int32_t flags, mbytes, bytes;
	u_int8_t *p;
	int malloc_possible, ret, retry_cnt;

	grow_region = 0;
	malloc_possible = 1;
	ret = retry_cnt = 0;

	/* Round off the requested size to the next page boundary. */
	DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);

	/* Some architectures have hard limits on the maximum region size. */
#ifdef DB_REGIONSIZE_MAX
	if (infop->size > DB_REGIONSIZE_MAX) {
		__db_err(infop->dbenv, "__db_rattach: cache size too large");
		return (EINVAL);
	}
#endif

	/* Initialize the return information in the REGINFO structure. */
loop:	infop->addr = NULL;
	infop->fd = -1;
	infop->segid = INVALID_SEGID;
	if (infop->name != NULL) {
		__os_freestr(infop->name);
		infop->name = NULL;
	}
	F_CLR(infop, REGION_CANGROW | REGION_CREATED);

#ifndef HAVE_SPINLOCKS
	/*
	 * XXX
	 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
	 * locking, which implies using mmap(2) to map in a regular file.
	 * (Theoretically, we could probably get a file descriptor to lock
	 * other types of shared regions, but I don't see any reason to
	 * bother.)
	 *
	 * Since we may be using shared memory regions, e.g., shmget(2),
	 * and not mmap of regular files, the backing file may be only a
	 * few tens of bytes in length.  So, this depends on the ability
	 * to fcntl lock file offsets much larger than the physical file.
	 */
	malloc_possible = 0;
#endif

#ifdef __hppa
	/*
	 * XXX
	 * HP-UX won't permit mutexes to live in anything but shared memory.
	 * Instantiate a shared region file on that architecture, regardless.
	 */
	malloc_possible = 0;
#endif
	/*
	 * If a region is truly private, malloc the memory.  That's faster
	 * than either anonymous memory or a shared file.
	 */
	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
		if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
			return (ret);

		/*
		 * It's sometimes significantly faster to page-fault in all of
		 * the region's pages before we run the application, as we see
		 * nasty side-effects when we page-fault while holding various
		 * locks, i.e., the lock takes a long time to acquire because
		 * of the underlying page fault, and the other threads convoy
		 * behind the lock holder.
		 */
		if (DB_GLOBAL(db_region_init))
			for (p = infop->addr;
			    p < (u_int8_t *)infop->addr + infop->size;
			    p += DB_VMPAGESIZE)
				p[0] = '\0';

		F_SET(infop, REGION_CREATED | REGION_MALLOC);
		goto region_init;
	}

	/*
	 * Get the name of the region (creating the file if a temporary file
	 * is being used).  The dbenv contains the current DB environment,
	 * including naming information.  The path argument may be a file or
	 * a directory.  If path is a directory, it must exist and file is the
	 * file name to be created inside the directory.  If path is a file,
	 * then file must be NULL.
	 */
	if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
	    infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
		return (ret);
	if (infop->fd != -1)
		F_SET(infop, REGION_CREATED);

	/*
	 * Try to create the file, if we have authority.  We have to make sure
	 * that multiple threads/processes attempting to simultaneously create
	 * the region are properly ordered, so we open it using DB_CREATE and
	 * DB_EXCL, so two attempts to create the region will return failure in
	 * one.
	 */
	if (infop->fd == -1 && (infop->dbflags & DB_CREATE)) {
		flags = infop->dbflags;
		LF_SET(DB_EXCL);
		if ((ret = __db_open(infop->name,
		    flags, flags, infop->mode, &infop->fd)) == 0)
			F_SET(infop, REGION_CREATED);
		else if (ret != EEXIST)
			goto errmsg;
	}

	/* If we couldn't create the file, try and open it. */
	if (infop->fd == -1) {
		flags = infop->dbflags;
		LF_CLR(DB_CREATE | DB_EXCL);
		if ((ret = __db_open(infop->name,
		    flags, flags, infop->mode, &infop->fd)) != 0)
			goto errmsg;
	}

	/*
	 * There are three cases we support:
	 *    1. Named anonymous memory (shmget(2)).
	 *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
	 *    3. Memory backed by a regular file (mmap(2)).
	 *
	 * We instantiate a backing file in all cases, which contains at least
	 * the RLAYOUT structure, and in case #3, contains the actual region.
	 * This is necessary for a couple of reasons:
	 *
	 * First, the mpool region uses temporary files to name regions, and
	 * since you may have multiple regions in the same directory, we need
	 * a filesystem name to ensure that they don't collide.
	 *
	 * Second, applications are allowed to forcibly remove regions, even
	 * if they don't know anything about them other than the name.  If a
	 * region is backed by anonymous memory, there has to be some way for
	 * the application to find out that information, and, in some cases,
	 * determine ID information for the anonymous memory.
	 */
	if (F_ISSET(infop, REGION_CREATED)) {
		/*
		 * If we're using anonymous memory to back this region, set
		 * the flag.
		 */
		if (DB_GLOBAL(db_region_anon))
			F_SET(infop, REGION_ANONYMOUS);

		/*
		 * If we're using a regular file to back a region we created,
		 * grow it to the specified size.
		 */
		if (!DB_GLOBAL(db_region_anon) &&
		    (ret = __db_growregion(infop, infop->size)) != 0)
			goto err;
	} else {
		/*
		 * If we're joining a region, figure out what it looks like.
		 *
		 * XXX
		 * We have to figure out if the file is a regular file backing
		 * a region that we want to map into our address space, or a
		 * file with the information we need to find a shared anonymous
		 * region that we want to map into our address space.
		 *
		 * All this noise is because some systems don't have a coherent
		 * VM and buffer cache, and worse, if you mix operations on the
		 * VM and buffer cache, half the time you hang the system.
		 *
		 * There are two possibilities.  If the file is the size of an
		 * RLAYOUT structure, then we know that the real region is in
		 * shared memory, because otherwise it would be bigger.  (As
		 * the RLAYOUT structure size is smaller than a disk sector,
		 * the only way it can be this size is if deliberately written
		 * that way.)  In which case, retrieve the information we need
		 * from the RLAYOUT structure and use it to acquire the shared
		 * memory.
		 *
		 * If the structure is larger than an RLAYOUT structure, then
		 * the file is backing the shared memory region, and we use
		 * the current size of the file without reading any information
		 * from the file itself so that we don't confuse the VM.
		 *
		 * And yes, this makes me want to take somebody and kill them,
		 * but I can't think of any other solution.
		 */
		if ((ret = __os_ioinfo(infop->name,
		    infop->fd, &mbytes, &bytes, NULL)) != 0)
			goto errmsg;
		size = mbytes * MEGABYTE + bytes;

		if (size <= sizeof(RLAYOUT)) {
			/*
			 * If the size is too small, the read fails or the
			 * valid flag is incorrect, assume it's because the
			 * RLAYOUT information hasn't been written out yet,
			 * and retry.
			 */
			if (size < sizeof(RLAYOUT))
				goto retry;
			if ((ret =
			    __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
				goto retry;
			/*
			 * A short read leaves part of rl unset; treat it
			 * like a not-yet-written RLAYOUT (ret is 0 here, so
			 * the retry path will loop).
			 */
			if (nr < 0 || (size_t)nr != sizeof(rl))
				goto retry;
			if (rl.valid != DB_REGIONMAGIC)
				goto retry;

			/* Copy the size, memory id and characteristics. */
			size = rl.size;
			infop->segid = rl.segid;
			if (F_ISSET(&rl, REGION_ANONYMOUS))
				F_SET(infop, REGION_ANONYMOUS);
		}

		/*
		 * If the region is larger than we think, that's okay, use the
		 * current size.  If it's smaller than we think, and we were
		 * just using the default size, that's okay, use the current
		 * size.  If it's smaller than we think and we really care,
		 * save the size and we'll catch that further down -- we can't
		 * correct it here because we have to have a lock to grow the
		 * region.
		 *
		 * NOTE(review): grow_region is recorded here but nothing later
		 * in this function reads it, and the post-lock checks below
		 * are numbered #1 and #3 -- a "grow the region" step appears
		 * to be missing.  Confirm against the canonical source before
		 * relying on the region being grown on join.
		 */
		if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
			grow_region = infop->size;
		infop->size = size;
	}

	/*
	 * Map the region into our address space.  If we're creating it, the
	 * underlying routines will make it the right size.
	 *
	 * There are at least two cases where we can "reasonably" fail when
	 * we attempt to map in the region.  On Windows/95, closing the last
	 * reference to a region causes it to be zeroed out.  On UNIX, when
	 * using the shmget(2) interfaces, the region will no longer exist
	 * if the system was rebooted.  In these cases, the underlying map call
	 * returns EAGAIN, and we *remove* our file and try again.  There are
	 * obvious races in doing this, but it should eventually settle down
	 * to a winner and then things should proceed normally.
	 */
	if ((ret = __db_mapregion(infop->name, infop)) != 0) {
		if (ret == EAGAIN) {
			/*
			 * Pretend we created the region even if we didn't so
			 * that our error processing unlinks it.
			 */
			F_SET(infop, REGION_CREATED);
			ret = 0;
			goto retry;
		}
		goto err;
	}

region_init:
	/*
	 * Initialize the common region information.
	 *
	 * !!!
	 * We have to order the region creates so that two processes don't try
	 * to simultaneously create the region.  This is handled by using the
	 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
	 *
	 * We also have to order region joins so that processes joining regions
	 * never see inconsistent data.  We'd like to play permissions games
	 * with the backing file, but we can't because WNT filesystems won't
	 * open a file mode 0.
	 */
	rlp = (RLAYOUT *)infop->addr;
	if (F_ISSET(infop, REGION_CREATED)) {
		/*
		 * The process creating the region acquires a lock before it
		 * sets the valid flag.  Any processes joining the region will
		 * check the valid flag before acquiring the lock.
		 *
		 * Check the return of __db_mutex_init() and __db_mutex_lock(),
		 * even though we don't usually check elsewhere.  This is the
		 * first lock we initialize and acquire, and we have to know if
		 * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
		 * for locking, with an in-memory filesystem specified as the
		 * database home.)
		 */
		if ((ret = __db_mutex_init(&rlp->lock,
		    MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
		    (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
			goto err;

		/* Initialize the remaining region information. */
		rlp->refcnt = 1;
		rlp->size = infop->size;
		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
		rlp->panic = 0;
		rlp->segid = infop->segid;
		rlp->flags = 0;
		if (F_ISSET(infop, REGION_ANONYMOUS))
			F_SET(rlp, REGION_ANONYMOUS);

		/*
		 * Fill in the valid field last -- use a magic number, memory
		 * may not be zero-filled, and we want to minimize the chance
		 * for collision.
		 */
		rlp->valid = DB_REGIONMAGIC;

		/*
		 * If the region is anonymous, write the RLAYOUT information
		 * into the backing file so that future region join and unlink
		 * calls can find it.
		 *
		 * XXX
		 * We MUST do the seek before we do the write.  On Win95, while
		 * closing the last reference to an anonymous shared region
		 * doesn't discard the region, it does zero it out.  So, the
		 * REGION_CREATED may be set, but the file may have already
		 * been written and the file descriptor may be at the end of
		 * the file.
		 */
		if (F_ISSET(infop, REGION_ANONYMOUS)) {
			if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
				goto err;
			if ((ret =
			    __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
				goto err;
		}
	} else {
		/* Check to see if the region has had catastrophic failure. */
		if (rlp->panic) {
			ret = DB_RUNRECOVERY;
			goto err;
		}

		/*
		 * Check the valid flag to ensure the region is initialized.
		 * If the valid flag has not been set, the mutex may not have
		 * been initialized, and an attempt to get it could lead to
		 * random behavior.
		 */
		if (rlp->valid != DB_REGIONMAGIC)
			goto retry;

		/* Get the region lock. */
		(void)__db_mutex_lock(&rlp->lock, infop->fd);

		/*
		 * We now own the region.  There are a couple of things that
		 * may have gone wrong, however.
		 *
		 * Problem #1: while we were waiting for the lock, the region
		 * was deleted.  Detected by re-checking the valid flag, since
		 * it's cleared by the delete region routines.
		 */
		if (rlp->valid != DB_REGIONMAGIC) {
			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
			goto retry;
		}

		/*
		 * Problem #3: when we checked the size of the file, it was
		 * still growing as part of creation.  Detected by the fact
		 * that infop->size isn't the same size as the region.
		 */
		if (infop->size != rlp->size) {
			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
			goto retry;
		}

		/* Increment the reference count. */
		++rlp->refcnt;
	}

	/* Return the region in a locked condition. */

	/*
	 * The block below is only ever entered through one of its labels.
	 * "errmsg" reports ret against the region name; "err" and "retry"
	 * share the cleanup, and are distinguished afterwards by whether
	 * ret is non-zero (hard error) or zero (transient, so loop).
	 */
	if (0) {
errmsg:		__db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));

err:
retry:		/* Discard the region. */
		if (infop->addr != NULL) {
			(void)__db_unmapregion(infop);
			infop->addr = NULL;
		}

		/* Discard the backing file. */
		if (infop->fd != -1) {
			(void)__os_close(infop->fd);
			infop->fd = -1;

			if (F_ISSET(infop, REGION_CREATED))
				(void)__os_unlink(infop->name);
		}

		/* Discard the name. */
		if (infop->name != NULL) {
			__os_freestr(infop->name);
			infop->name = NULL;
		}

		/*
		 * If we had a temporary error, wait a few seconds and
		 * try again.
		 */
		if (ret == 0) {
			if (++retry_cnt <= 3) {
				__os_sleep(retry_cnt * 2, 0);
				goto loop;
			}
			ret = EAGAIN;
		}
	}

	/*
	 * XXX
	 * HP-UX won't permit mutexes to live in anything but shared memory.
	 * Instantiate a shared region file on that architecture, regardless.
	 *
	 * XXX
	 * There's a problem in cleaning this up on application exit, or on
	 * application failure.  If an application opens a database without
	 * an environment, we create a temporary backing mpool region for it.
	 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
	 * mutexes to live in anything but shared memory, we instantiate a
	 * real file plus a memory region of some form.  If the application
	 * crashes, the necessary information to delete the backing file and
	 * any system region (e.g., the shmget(2) segment ID) is no longer
	 * available.  We can't completely fix the problem, but we try.
	 *
	 * The underlying UNIX __db_mapregion() code preferentially uses the
	 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
	 * that are marked REGION_PRIVATE.  This means that we normally aren't
	 * holding any system resources when we get here, in which case we can
	 * delete the backing file.  This results in a short race, from the
	 * __db_open() call above to here.
	 *
	 * If, for some reason, we are holding system resources when we get
	 * here, we don't have any choice -- we can't delete the backing file
	 * because we may need it to detach from the resources.  Set the
	 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
	 * the application closes the region.
	 */
	if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC)) {
		if (F_ISSET(infop, REGION_HOLDINGSYS))
			F_SET(infop, REGION_LASTDETACH);
		else {
			F_SET(infop, REGION_REMOVED);
			F_CLR(infop, REGION_CANGROW);

			/*
			 * This code is also reached after the error cleanup
			 * above, which has already closed the descriptor and
			 * freed the name; only release what is still held.
			 */
			if (infop->fd != -1)
				(void)__os_close(infop->fd);
			if (infop->name != NULL)
				(void)__os_unlink(infop->name);
		}
	}

	return (ret);
}

/*
 * __db_rdetach --
 *	De-attach from a shared memory region.
 *
 *	Returns 0 on success.  Returns EBUSY if REGION_LASTDETACH was set
 *	but other references to the region remain; otherwise returns the
 *	first error reported while unmapping or unlinking the region.
 *
 * PUBLIC: int __db_rdetach __P((REGINFO *));
 */
int
__db_rdetach(infop)
	REGINFO *infop;
{
	RLAYOUT *rlp;
	int detach, ret, t_ret;

	ret = 0;

	/*
	 * If the region was removed when it was created, no further action
	 * is required.
	 */
	if (F_ISSET(infop, REGION_REMOVED))
		goto done;
	/*
	 * If the region was created in memory returned by malloc, the only
	 * action required is freeing the memory.
	 */
	if (F_ISSET(infop, REGION_MALLOC)) {
		__os_free(infop->addr, 0);
		goto done;
	}

	/* Otherwise, detach from the region and optionally delete it. */
	rlp = infop->addr;

	/* Get the lock. */
	(void)__db_mutex_lock(&rlp->lock, infop->fd);

	/* Decrement the reference count. */
	if (rlp->refcnt == 0)
		__db_err(infop->dbenv,
		    "region rdetach: reference count went to zero!");
	else
		--rlp->refcnt;

	/*
	 * If we're going to remove the region, clear the valid flag so
	 * that any region join that's blocked waiting for us will know
	 * what happened.
	 */
	detach = 0;
	if (F_ISSET(infop, REGION_LASTDETACH)) {
		if (rlp->refcnt == 0) {
			detach = 1;
			rlp->valid = 0;
		} else
			ret = EBUSY;
	}

	/* Release the lock. */
	(void)__db_mutex_unlock(&rlp->lock, infop->fd);

	/* Close the backing file descriptor. */
	(void)__os_close(infop->fd);
	infop->fd = -1;

	/* Discard our mapping of the region. */
	if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * Discard the region itself.  The assignment must be parenthesized
	 * on its own so that t_ret holds the callee's error value rather
	 * than the 0/1 result of the comparison.
	 */
	if (detach) {
		if ((t_ret =
		    __db_unlinkregion(infop->name, infop)) != 0 && ret == 0)
			ret = t_ret;
		if ((t_ret = __os_unlink(infop->name)) != 0 && ret == 0)
			ret = t_ret;
	}

done:	/* Discard the name. */
	if (infop->name != NULL) {
		__os_freestr(infop->name);
		infop->name = NULL;
	}

	return (ret);
}

/*
 * __db_runlink --
 *	Remove a region.
 *
 *	If force is zero, the region is attached, flagged for removal on
 *	last detach, and detached.  If force is non-zero, the region is
 *	removed without attaching to it.  Returns 0 on success (including
 *	the case where the backing file does not exist), else an error.
 *
 * PUBLIC: int __db_runlink __P((REGINFO *, int));
 */
int
__db_runlink(infop, force)
	REGINFO *infop;
	int force;
{
	RLAYOUT rl, *rlp;
	size_t size;
	ssize_t nr;
	u_int32_t mbytes, bytes;
	int fd, ret, t_ret;
	char *name;

	/*
	 * XXX
	 * We assume that we've created a new REGINFO structure for this
	 * call, not used one that was already initialized.  Regardless,
	 * if anyone is planning to use it after we're done, they're going
	 * to be sorely disappointed.
	 *
	 * If force isn't set, we attach to the region, set a flag to delete
	 * the region on last close, and let the region delete code do the
	 * work.
	 */
	if (!force) {
		if ((ret = __db_rattach(infop)) != 0)
			return (ret);

		rlp = (RLAYOUT *)infop->addr;
		(void)__db_mutex_unlock(&rlp->lock, infop->fd);

		F_SET(infop, REGION_LASTDETACH);

		return (__db_rdetach(infop));
	}

	/*
	 * Otherwise, we don't want to attach to the region.  We may have been
	 * called to clean up if a process died leaving a region locked and/or
	 * corrupted, which could cause the attach to hang.
	 */
	if ((ret = __db_appname(infop->dbenv, infop->appname,
	    infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
		return (ret);

	/*
	 * An underlying file is created for all regions other than private
	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
	 * back the region.  If that file doesn't exist, we're done.
	 */
	if (__os_exists(name, NULL) != 0) {
		__os_freestr(name);
		return (0);
	}

	/*
	 * See the comments in __db_rattach -- figure out if this is a regular
	 * file backing a region or if it's a regular file with information
	 * about a region.
	 *
	 * Start with fd at -1 so that the shared error path below never
	 * closes a descriptor that was not successfully opened.
	 */
	fd = -1;
	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
		goto errmsg;
	if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
		goto errmsg;
	size = mbytes * MEGABYTE + bytes;

	if (size <= sizeof(RLAYOUT)) {
		if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
			goto errmsg;
		/*
		 * A short read means the file does not hold a complete
		 * RLAYOUT; reject it without examining the unset fields.
		 */
		if (nr < 0 || (size_t)nr != sizeof(rl) ||
		    rl.valid != DB_REGIONMAGIC) {
			__db_err(infop->dbenv,
			    "%s: illegal region magic number", name);
			ret = EINVAL;
			goto err;
		}

		/* Set the size, memory id and characteristics. */
		infop->size = rl.size;
		infop->segid = rl.segid;
		if (F_ISSET(&rl, REGION_ANONYMOUS))
			F_SET(infop, REGION_ANONYMOUS);
	} else {
		infop->size = size;
		infop->segid = INVALID_SEGID;
	}

	/* Remove the underlying region. */
	ret = __db_unlinkregion(name, infop);

	/*
	 * Unlink the backing file.  Close the open file descriptor first,
	 * because some architectures (e.g., Win32) won't unlink a file if
	 * open file descriptors remain.
	 */
	(void)__os_close(fd);
	if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
		ret = t_ret;

	/* Entered only through the labels, on failure. */
	if (0) {
errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
err:		if (fd != -1)
			(void)__os_close(fd);
	}

	__os_freestr(name);
	return (ret);
}

/*
 * __db_rgrow --
 *	Extend a region.
 *
 *	new_size is rounded to a page boundary.  Returns EINVAL if the
 *	region cannot grow or if new_size is smaller than the current
 *	region size, else the result of growing and remapping the region.
 *
 * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
 */
int
__db_rgrow(infop, new_size)
	REGINFO *infop;
	size_t new_size;
{
	RLAYOUT *rlp;
	size_t increment;
	int ret;

	/*
	 * !!!
	 * This routine MUST be called with the region already locked.
	 */

	/* The underlying routines have flagged if this region can grow. */
	if (!F_ISSET(infop, REGION_CANGROW))
		return (EINVAL);

	/*
	 * Round off the requested size to the next page boundary, and
	 * determine the additional space required.
	 */
	rlp = (RLAYOUT *)infop->addr;
	DB_ROUNDOFF(new_size, DB_VMPAGESIZE);

	/*
	 * This routine only extends regions; a smaller size would make the
	 * unsigned subtraction below wrap to an enormous increment.
	 */
	if (new_size < rlp->size)
		return (EINVAL);
	increment = new_size - rlp->size;

	if ((ret = __db_growregion(infop, increment)) != 0)
		return (ret);

	/* Update the on-disk region size. */
	rlp->size = new_size;

	/* Detach from and reattach to the region. */
	return (__db_rreattach(infop, new_size));
}

/*
 * __db_growregion --
 *	Grow a shared memory region.
 *
 *	Extends the backing file (infop->fd) by increment bytes, starting
 *	at its current end.  Returns 0 on success, EIO on a short write,
 *	else the error from the failing seek or write; errors are reported
 *	through __db_err.
 */
static int
__db_growregion(infop, increment)
	REGINFO *infop;
	size_t increment;
{
	db_pgno_t pages;
	size_t i;
	ssize_t nw;
	u_int32_t relative;
	int ret;
	char buf[DB_VMPAGESIZE];

	/* Seek to the end of the region. */
	if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
		goto err;

	/*
	 * Nothing to add.  Without this check the "last page" offset
	 * computed below (increment - DB_VMPAGESIZE) would wrap.
	 */
	if (increment == 0)
		return (0);

	/* Write nuls to the new bytes. */
	memset(buf, 0, sizeof(buf));

	/*
	 * Some systems require that all of the bytes of the region be
	 * written before it can be mapped and accessed randomly, and
	 * other systems don't zero out the pages.
	 */
	if (__db_mapinit())
		/* Extend the region by writing each new page. */
		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
			if ((ret =
			    __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
				goto err;
			if (nw != sizeof(buf))
				goto eio;
		}
	else {
		/*
		 * Extend the region by writing the last page.  If the region
		 * is >4Gb, increment may be larger than the maximum possible
		 * seek "relative" argument, as it's an unsigned 32-bit value.
		 * Break the offset into pages of 1MB each so that we don't
		 * overflow (2^20 + 2^32 is bigger than any memory I expect
		 * to see for awhile).
		 */
		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
		if ((ret = __os_seek(infop->fd,
		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
			goto err;
		if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
			goto err;
		if (nw != sizeof(buf))
			goto eio;

		/*
		 * It's sometimes significantly faster to page-fault in all of
		 * the region's pages before we run the application, as we see
		 * nasty side-effects when we page-fault while holding various
		 * locks, i.e., the lock takes a long time to acquire because
		 * of the underlying page fault, and the other threads convoy
		 * behind the lock holder.
		 *
		 * We also use REGION_INIT to guarantee that there is enough
		 * disk space for the region, so we also write a byte to each
		 * page.  Reading the byte is insufficient as some systems
		 * (e.g., Solaris) do not instantiate disk pages to satisfy
		 * a read, and so we don't know if there is enough disk space
		 * or not.
		 */
		if (DB_GLOBAL(db_region_init)) {
			pages = increment / MEGABYTE;
			relative = increment % MEGABYTE;
			if ((ret = __os_seek(infop->fd,
			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
				goto err;

			/* Write a byte to each page. */
			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
				if ((ret =
				    __os_write(infop->fd, buf, 1, &nw)) != 0)
					goto err;
				if (nw != 1)
					goto eio;
				if ((ret = __os_seek(infop->fd,
				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
					goto err;
			}
		}
	}
	return (0);

eio:	ret = EIO;
err:	__db_err(infop->dbenv, "region grow: %s", strerror(ret));
	return (ret);
}

/*
 * __db_rreattach --
 *	Detach from and reattach to a region.
 *
 *	Unmaps the region, records new_size in infop->size, and maps the
 *	region again.  Returns 0 on success, else the error from the
 *	unmap or map call.
 *
 * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
 */
int
__db_rreattach(infop, new_size)
	REGINFO *infop;
	size_t new_size;
{
	int ret;

#ifdef DIAGNOSTIC
	if (infop->name == NULL) {
		__db_err(infop->dbenv, "__db_rreattach: name was NULL");
		return (EINVAL);
	}
#endif
	/*
	 * If we're growing an already mapped region, we have to unmap it
	 * and get it back.  We have it locked, so nobody else can get in,
	 * which makes it fairly straight-forward to do, as everybody else
	 * is going to block while we do the unmap/remap.  NB: if we fail
	 * to get it back, the pooch is genuinely screwed, because we can
	 * never release the lock we're holding.
	 *
	 * Detach from the region.  We have to do this first so architectures
	 * that don't permit a file to be mapped into different places in the
	 * address space simultaneously, e.g., HP's PaRisc, will work.
	 */
	if ((ret = __db_unmapregion(infop)) != 0)
		return (ret);

	/* Update the caller's REGINFO size to the new map size. */
	infop->size = new_size;

	/* Attach to the region. */
	ret = __db_mapregion(infop->name, infop);

	return (ret);
}