/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #if SHARE #include #include #define ERR -1 #endif #include "invlib.h" #include "library.h" #define DEBUG 0 /* debugging code and realloc messages */ #define BLOCKSIZE 2 * BUFSIZ /* logical block size */ #define LINEMAX 1000 /* sorted posting line max size */ #define POSTINC 10000 /* posting buffer size increment */ #define SEP ' ' /* sorted posting field separator */ #define SETINC 100 /* posting set size increment */ #define STATS 0 /* print statistics */ #define SUPERINC 10000 /* super index size increment */ #define TERMMAX 512 /* term max size */ #define VERSION 1 /* inverted index format version */ #define ZIPFSIZE 200 /* zipf curve size */ #define FREAD "r" /* fopen for reading */ #define FREADP "r+" /* fopen for update */ #define FWRITE "w" /* fopen truncate or create for writing */ #define FWRITEP "w+" /* fopen truncate or create for update */ extern char *argv0; /* command name (must be set in main function) */ int invbreak; #if STATS int showzipf; /* show postings per term distribution */ #endif static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL; static unsigned setsize1, setsize2; static long numitems, totterm, zerolong; static char *indexfile, *postingfile; static FILE *outfile, *fpost; static unsigned supersize = SUPERINC, supintsize; static int numpost, numlogblk, amtused, nextpost, lastinblk, numinvitems; static POSTING *POST, *postptr; static unsigned long *SUPINT, *supint, nextsupfing; static char *SUPFING, *supfing; static char thisterm[TERMMAX]; static union { long invblk[BLOCKSIZE / sizeof (long)]; char chrblk[BLOCKSIZE]; } logicalblk; #if DEBUG || STATS static long totpost; #endif #if STATS static int zipf[ZIPFSIZE + 1]; #endif static void invcannotalloc(size_t n); static void invcannotopen(char *file); static void invcannotwrite(char *file); static int invnewterm(void); static int boolready(void); long invmake(char *invname, char *invpost, FILE *infile) { unsigned char *s; long num; int i; long fileindex; unsigned postsize = POSTINC * sizeof (POSTING); unsigned long *intptr; char line[LINEMAX]; long tlong; PARAM param; POSTING posting; #if STATS int j; unsigned maxtermlen = 0; #endif /* output file */ if ((outfile = vpfopen(invname, FWRITEP)) == NULL) { invcannotopen(invname); return (0); } indexfile = invname; (void) fseek(outfile, (long)BUFSIZ, 0); /* posting file */ if ((fpost = vpfopen(invpost, FWRITE)) == NULL) { invcannotopen(invpost); return (0); } postingfile = invpost; nextpost = 0; /* get space for the postings list */ if ((POST = (POSTING *)malloc(postsize)) == NULL) { invcannotalloc(postsize); return (0); } postptr = POST; /* get space for the superfinger (superindex) */ if ((SUPFING = malloc(supersize)) == NULL) { invcannotalloc(supersize); return (0); } supfing = SUPFING; supintsize = supersize / 40; /* also for the superfinger index */ if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) { invcannotalloc(supintsize * sizeof (long)); return (0); } supint = SUPINT; supint++; /* leave first term open for a count */ /* initialize using an empty term */ (void) strcpy(thisterm, ""); *supint++ = 0; *supfing++ = ' '; *supfing++ = '\0'; nextsupfing = 2; #if DEBUG || STATS totpost = 0L; #endif totterm = 0L; numpost = 1; /* * set up as though a block had come and gone, i.e., set up for * new block */ amtused = 16; /* leave no space - init 3 words + one for luck */ numinvitems = 0; numlogblk = 0; lastinblk = BLOCKSIZE; /* now loop as long as more to read (till eof) */ while (fgets(line, LINEMAX, infile) != NULL) { #if DEBUG || STATS ++totpost; #endif s = (unsigned char *) strchr(line, SEP); if (s == NULL) /* where did this line come from ??? */ continue; /* workaround: just skip it */ *s = '\0'; #if STATS if ((i = strlen(line)) > maxtermlen) { maxtermlen = i; } #endif #if DEBUG (void) printf("%ld: %s ", totpost, line); (void) fflush(stdout); #endif if (strcmp(thisterm, line) == 0) { if (postptr + 10 > POST + postsize / sizeof (POSTING)) { i = postptr - POST; postsize += POSTINC * sizeof (POSTING); if ((POST = realloc(POST, postsize)) == NULL) { invcannotalloc(postsize); return (0); } postptr = i + POST; #if DEBUG (void) printf("reallocated post space to %u, " "totpost=%ld\n", postsize, totpost); #endif } numpost++; } else { /* have a new term */ if (!invnewterm()) { return (0); } (void) strcpy(thisterm, line); numpost = 1; postptr = POST; fileindex = 0; } /* get the new posting */ num = *++s - '!'; i = 1; do { num = BASE * num + *++s - '!'; } while (++i < PRECISION); posting.lineoffset = num; while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) { ; } posting.fileindex = --fileindex; posting.type = *++s; num = *++s - '!'; if (*s != '\n') { num = *++s - '!'; while (*++s != '\n') { num = BASE * num + *s - '!'; } posting.fcnoffset = num; } else { posting.fcnoffset = 0; } *postptr++ = posting; #if DEBUG (void) printf("%ld %ld %ld %ld\n", posting.fileindex, posting.fcnoffset, posting.lineoffset, posting.type); (void) fflush(stdout); #endif } if (!invnewterm()) { return (0); } /* now clean up final block */ logicalblk.invblk[0] = numinvitems; /* loops pointer around to start */ logicalblk.invblk[1] = 0; logicalblk.invblk[2] = numlogblk - 1; if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) { goto cannotwrite; } numlogblk++; /* write out block to save space. what in it doesn't matter */ if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) { goto cannotwrite; } /* finish up the super finger */ *SUPINT = numlogblk; /* add to the offsets the size of the offset pointers */ intptr = (SUPINT + 1); i = (char *)supint - (char *)SUPINT; while (intptr < supint) *intptr++ += i; /* write out the offsets (1 for the N at start) and the super finger */ if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1, outfile) == 0 || fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) { goto cannotwrite; } /* save the size for reference later */ nextsupfing = sizeof (long) + sizeof (long) * numlogblk + (supfing - SUPFING); /* * make sure the file ends at a logical block boundary. This is * necessary for invinsert to correctly create extended blocks */ i = nextsupfing % BLOCKSIZE; /* write out junk to fill log blk */ if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 || fflush(outfile) == EOF) { /* rewind doesn't check for write failure */ goto cannotwrite; } /* write the control area */ rewind(outfile); param.version = VERSION; param.filestat = 0; param.sizeblk = BLOCKSIZE; param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ; param.supsize = nextsupfing; param.cntlsize = BUFSIZ; param.share = 0; if (fwrite((char *)¶m, sizeof (param), 1, outfile) == 0) { goto cannotwrite; } for (i = 0; i < 10; i++) /* for future use */ if (fwrite((char *)&zerolong, sizeof (zerolong), 1, outfile) == 0) { goto cannotwrite; } /* make first block loop backwards to last block */ if (fflush(outfile) == EOF) { /* fseek doesn't check for write failure */ goto cannotwrite; } /* get to second word first block */ (void) fseek(outfile, (long)BUFSIZ + 8, 0); tlong = numlogblk - 1; if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 || fclose(outfile) == EOF) { cannotwrite: invcannotwrite(invname); return (0); } if (fclose(fpost) == EOF) { invcannotwrite(postingfile); return (0); } --totterm; /* don't count null term */ #if STATS (void) printf("logical blocks = %d, postings = %ld, terms = %ld, " "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen); if (showzipf) { (void) printf( "\n************* ZIPF curve ****************\n"); for (j = ZIPFSIZE; j > 1; j--) if (zipf[j]) break; for (i = 1; i < j; ++i) { (void) printf("%3d -%6d ", i, zipf[i]); if (i % 6 == 0) (void) putchar('\n'); } (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]); } #endif /* free all malloc'd memory */ free(POST); free(SUPFING); free(SUPINT); return (totterm); } /* add a term to the data base */ static int invnewterm(void) { int backupflag, i, j, maxback, holditems, gooditems, howfar; int len, numwilluse, wdlen; char *tptr, *tptr2, *tptr3; union { unsigned long packword[2]; ENTRY e; } iteminfo; totterm++; #if STATS /* keep zipfian info on the distribution */ if (numpost <= ZIPFSIZE) zipf[numpost]++; else zipf[0]++; #endif len = strlen(thisterm); wdlen = (len + (sizeof (long) - 1)) / sizeof (long); numwilluse = (wdlen + 3) * sizeof (long); /* new block if at least 1 item in block */ if (numinvitems && numwilluse + amtused > BLOCKSIZE) { /* set up new block */ if (supfing + 500 > SUPFING + supersize) { i = supfing - SUPFING; supersize += 20000; if ((SUPFING = realloc(SUPFING, supersize)) == NULL) { invcannotalloc(supersize); return (0); } supfing = i + SUPFING; #if DEBUG (void) printf("reallocated superfinger space to %d, " "totpost=%ld\n", supersize, totpost); #endif } /* check that room for the offset as well */ if ((numlogblk + 10) > supintsize) { i = supint - SUPINT; supintsize += SUPERINC; if ((SUPINT = realloc((char *)SUPINT, supintsize * sizeof (long))) == NULL) { invcannotalloc(supintsize * sizeof (long)); return (0); } supint = i + SUPINT; #if DEBUG (void) printf("reallocated superfinger offset to %d, " "totpost = %ld\n", supintsize * sizeof (long), totpost); #endif } /* See if backup is efficatious */ backupflag = 0; maxback = strlen(thisterm) / 10; holditems = numinvitems; if (maxback > numinvitems) maxback = numinvitems - 2; howfar = 0; while (--maxback > 0) { howfar++; iteminfo.packword[0] = logicalblk.invblk[--holditems * 2 + (sizeof (long) - 1)]; if ((i = iteminfo.e.size / 10) < maxback) { maxback = i; backupflag = howfar; gooditems = holditems; tptr2 = logicalblk.chrblk + iteminfo.e.offset; } } /* see if backup will occur */ if (backupflag) { numinvitems = gooditems; } logicalblk.invblk[0] = numinvitems; /* set forward pointer pointing to next */ logicalblk.invblk[1] = numlogblk + 1; /* set back pointer to last block */ logicalblk.invblk[2] = numlogblk - 1; if (fwrite((char *)logicalblk.chrblk, 1, BLOCKSIZE, outfile) == 0) { invcannotwrite(indexfile); return (0); } amtused = 16; numlogblk++; /* check if had to back up, if so do it */ if (backupflag) { /* find out where the end of the new block is */ iteminfo.packword[0] = logicalblk.invblk[numinvitems * 2 + 1]; tptr3 = logicalblk.chrblk + iteminfo.e.offset; /* move the index for this block */ for (i = 3; i <= (backupflag * 2 + 2); i++) { logicalblk.invblk[i] = logicalblk.invblk[numinvitems * 2+i]; } /* move the word into the super index */ iteminfo.packword[0] = logicalblk.invblk[3]; iteminfo.packword[1] = logicalblk.invblk[4]; tptr2 = logicalblk.chrblk + iteminfo.e.offset; (void) strncpy(supfing, tptr2, (int)iteminfo.e.size); *(supfing + iteminfo.e.size) = '\0'; #if DEBUG (void) printf("backup %d at term=%s to term=%s\n", backupflag, thisterm, supfing); #endif *supint++ = nextsupfing; nextsupfing += strlen(supfing) + 1; supfing += strlen(supfing) + 1; /* now fix up the logical block */ tptr = logicalblk.chrblk + lastinblk; lastinblk = BLOCKSIZE; tptr2 = logicalblk.chrblk + lastinblk; j = tptr3 - tptr; while (tptr3 > tptr) *--tptr2 = *--tptr3; lastinblk -= j; amtused += (8 * backupflag + j); for (i = 3; i < (backupflag * 2 + 2); i += 2) { iteminfo.packword[0] = logicalblk.invblk[i]; iteminfo.e.offset += (tptr2 - tptr3); logicalblk.invblk[i] = iteminfo.packword[0]; } numinvitems = backupflag; } else { /* no backup needed */ numinvitems = 0; lastinblk = BLOCKSIZE; /* add new term to superindex */ (void) strcpy(supfing, thisterm); supfing += strlen(thisterm) + 1; *supint++ = nextsupfing; nextsupfing += strlen(thisterm) + 1; } } lastinblk -= (numwilluse - 8); iteminfo.e.offset = lastinblk; iteminfo.e.size = (char)len; iteminfo.e.space = 0; iteminfo.e.post = numpost; (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len); amtused += numwilluse; logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost; if ((i = postptr - POST) > 0) { if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) { invcannotwrite(postingfile); return (0); } nextpost += i * sizeof (POSTING); } logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0]; logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1]; return (1); } static void swap_ints(void *p, size_t sz) { uint32_t *s; uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t)); for (s = p; s < e; s++) *s = BSWAP_32(*s); } static void write_param(INVCONTROL *invcntl) { if (invcntl->swap) swap_ints(&invcntl->param, sizeof (invcntl->param)); rewind(invcntl->invfile); (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1, invcntl->invfile); if (invcntl->swap) swap_ints(&invcntl->param, sizeof (invcntl->param)); } static void read_superfinger(INVCONTROL *invcntl) { size_t count; (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET); (void) fread(invcntl->iindex, (int)invcntl->param.supsize, 1, invcntl->invfile); if (invcntl->swap) { /* * The superfinger consists of a count, followed by * count offsets, followed by a string table (which * the offsets reference). * * We need to swap the count and the offsets. */ count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex); swap_ints(invcntl->iindex, count * sizeof (unsigned long)); } } static void read_logblock(INVCONTROL *invcntl, int block) { /* note always fetch it if the file is busy */ if ((block != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) { (void) fseek(invcntl->invfile, (block * invcntl->param.sizeblk) + invcntl->param.cntlsize, SEEK_SET); invcntl->numblk = block; (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile); if (invcntl->swap) { size_t count; ENTRY *ecur, *eend; uint32_t *postptr; /* * A logblock consists of a count, a next block id, * and a previous block id, followed by count * ENTRYs, followed by alternating strings and * offsets. */ swap_ints(invcntl->logblk, 3 * sizeof (unsigned long)); count = *(uint32_t *)invcntl->logblk; ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3); eend = ecur + count; for (; ecur < eend; ecur++) { ecur->offset = BSWAP_16(ecur->offset); ecur->post = BSWAP_32(ecur->post); /* * After the string is the posting offset. */ postptr = (uint32_t *)(invcntl->logblk + ecur->offset + P2ROUNDUP(ecur->size, sizeof (long))); *postptr = BSWAP_32(*postptr); } } } } void read_next_posting(INVCONTROL *invcntl, POSTING *posting) { (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile); if (invcntl->swap) { posting->lineoffset = BSWAP_32(posting->lineoffset); posting->fcnoffset = BSWAP_32(posting->fcnoffset); /* * fileindex is a 24-bit field, so shift it before swapping */ posting->fileindex = BSWAP_32(posting->fileindex << 8); } } int invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat) { int read_index; if ((invcntl->invfile = vpfopen(invname, ((stat == 0) ? FREAD : FREADP))) == NULL) { invcannotopen(invname); return (-1); } if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1, invcntl->invfile) == 0) { (void) fprintf(stderr, "%s: empty inverted file\n", argv0); goto closeinv; } if (invcntl->param.version != VERSION && BSWAP_32(invcntl->param.version) != VERSION) { (void) fprintf(stderr, "%s: cannot read old index format; use -U option to " "force database to rebuild\n", argv0); goto closeinv; } invcntl->swap = (invcntl->param.version != VERSION); if (invcntl->swap) swap_ints(&invcntl->param, sizeof (invcntl->param)); if (stat == 0 && invcntl->param.filestat == INVALONE) { (void) fprintf(stderr, "%s: inverted file is locked\n", argv0); goto closeinv; } if ((invcntl->postfile = vpfopen(invpost, ((stat == 0) ? FREAD : FREADP))) == NULL) { invcannotopen(invpost); goto closeinv; } /* allocate core for a logical block */ if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) { invcannotalloc((unsigned)invcntl->param.sizeblk); goto closeboth; } /* allocate for and read in superfinger */ read_index = 1; invcntl->iindex = NULL; #if SHARE if (invcntl->param.share == 1) { key_t ftok(), shm_key; struct shmid_ds shm_buf; char *shmat(); int shm_id; /* see if the shared segment exists */ shm_key = ftok(invname, 2); shm_id = shmget(shm_key, 0, 0); /* * Failure simply means (hopefully) that segment doesn't * exist */ if (shm_id == -1) { /* * Have to give general write permission due to AMdahl * not having protected segments */ shm_id = shmget(shm_key, invcntl->param.supsize + sizeof (long), IPC_CREAT | 0666); if (shm_id == -1) perror("Could not create shared " "memory segment"); } else read_index = 0; if (shm_id != -1) { invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY)); if (invcntl->iindex == (char *)ERR) { (void) fprintf(stderr, "%s: shared memory link failed\n", argv0); invcntl->iindex = NULL; read_index = 1; } } } #endif if (invcntl->iindex == NULL) invcntl->iindex = malloc(invcntl->param.supsize + 16); if (invcntl->iindex == NULL) { invcannotalloc((unsigned)invcntl->param.supsize); free(invcntl->logblk); goto closeboth; } if (read_index) { read_superfinger(invcntl); } invcntl->numblk = -1; if (boolready() == -1) { closeboth: (void) fclose(invcntl->postfile); closeinv: (void) fclose(invcntl->invfile); return (-1); } /* write back out the control block if anything changed */ invcntl->param.filestat = stat; if (stat > invcntl->param.filestat) write_param(invcntl); return (1); } /* invclose must be called to wrap things up and deallocate core */ void invclose(INVCONTROL *invcntl) { /* write out the control block in case anything changed */ if (invcntl->param.filestat > 0) { invcntl->param.filestat = 0; write_param(invcntl); } (void) fclose(invcntl->invfile); (void) fclose(invcntl->postfile); #if SHARE if (invcntl->param.share > 0) { shmdt(invcntl->iindex); invcntl->iindex = NULL; } #endif if (invcntl->iindex != NULL) free(invcntl->iindex); free(invcntl->logblk); } /* invstep steps the inverted file forward one item */ void invstep(INVCONTROL *invcntl) { if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) { invcntl->keypnt++; return; } /* move forward a block else wrap */ read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long))); invcntl->keypnt = 0; } /* invforward moves forward one term in the inverted file */ int invforward(INVCONTROL *invcntl) { invstep(invcntl); /* skip things with 0 postings */ while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) { invstep(invcntl); } /* Check for having wrapped - reached start of inverted file! */ if ((invcntl->numblk == 0) && (invcntl->keypnt == 0)) return (0); return (1); } /* invterm gets the present term from the present logical block */ int invterm(INVCONTROL *invcntl, char *term) { ENTRY * entryptr; entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt; (void) strncpy(term, entryptr->offset + invcntl->logblk, (int)entryptr->size); *(term + entryptr->size) = '\0'; return (entryptr->post); } /* invfind searches for an individual item in the inverted file */ long invfind(INVCONTROL *invcntl, char *searchterm) { int imid, ilow, ihigh; long num; int i; unsigned long *intptr, *intptr2; ENTRY *entryptr; /* make sure it is initialized via invready */ if (invcntl->invfile == 0) return (-1L); /* now search for the appropriate finger block */ intptr = (unsigned long *)invcntl->iindex; ilow = 0; ihigh = *intptr++ - 1; while (ilow <= ihigh) { imid = (ilow + ihigh) / 2; intptr2 = intptr + imid; i = strcmp(searchterm, (invcntl->iindex + *intptr2)); if (i < 0) ihigh = imid - 1; else if (i > 0) ilow = ++imid; else { ilow = imid + 1; break; } } /* be careful about case where searchterm is after last in this block */ imid = (ilow) ? ilow - 1 : 0; /* fetch the appropriate logical block if not in core */ read_logblock(invcntl, imid); srch_ext: /* now find the term in this block. tricky this */ intptr = (unsigned long *)invcntl->logblk; ilow = 0; ihigh = *intptr - 1; intptr += 3; num = 0; while (ilow <= ihigh) { imid = (ilow + ihigh) / 2; entryptr = (ENTRY *)intptr + imid; i = strncmp(searchterm, (invcntl->logblk + entryptr->offset), (int)entryptr->size); if (i == 0) i = strlen(searchterm) - entryptr->size; if (i < 0) ihigh = imid - 1; else if (i > 0) ilow = ++imid; else { num = entryptr->post; break; } } /* be careful about case where searchterm is after last in this block */ if (imid >= *(long *)invcntl->logblk) { invcntl->keypnt = *(long *)invcntl->logblk; invstep(invcntl); /* note if this happens the term could be in extended block */ if (invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk) goto srch_ext; } else invcntl->keypnt = imid; return (num); } #if DEBUG /* invdump dumps the block the term parameter is in */ void invdump(INVCONTROL *invcntl, char *term) { long i, j, n, *longptr; ENTRY * entryptr; char temp[512], *ptr; /* dump superindex if term is "-" */ if (*term == '-') { j = atoi(term + 1); longptr = (long *)invcntl->iindex; n = *longptr++; (void) printf("Superindex dump, num blocks=%ld\n", n); longptr += j; while ((longptr <= ((long *)invcntl->iindex) + n) && invbreak == 0) { (void) printf("%2ld %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr); longptr++; } return; } else if (*term == '#') { j = atoi(term + 1); /* fetch the appropriate logical block */ read_logblock(invcntl, j); } else i = abs((int)invfind(invcntl, term)); longptr = (long *)invcntl->logblk; n = *longptr++; (void) printf("Entry term to invdump=%s, postings=%ld, " "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr), *(longptr + 1)); entryptr = (ENTRY *)(invcntl->logblk + 12); (void) printf("%ld terms in this block, block=%ld\n", n, invcntl->numblk); (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n"); for (j = 0; j < n && invbreak == 0; j++) { ptr = invcntl->logblk + entryptr->offset; (void) strncpy(temp, ptr, (int)entryptr->size); temp[entryptr->size] = '\0'; ptr += (sizeof (long) * (long)((entryptr->size + (sizeof (long) - 1)) / sizeof (long))); (void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp, entryptr->post, entryptr->size, entryptr->offset, entryptr->space, *(long *)ptr); entryptr++; } } #endif static int boolready(void) { numitems = 0; if (item1 != NULL) free(item1); setsize1 = SETINC; if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) { invcannotalloc(SETINC); return (-1); } if (item2 != NULL) free(item2); setsize2 = SETINC; if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) { invcannotalloc(SETINC); return (-1); } item = item1; enditem = item; return (0); } void boolclear(void) { numitems = 0; item = item1; enditem = item; } POSTING * boolfile(INVCONTROL *invcntl, long *num, int bool) { ENTRY *entryptr; FILE *file; char *ptr; unsigned long *ptr2; POSTING *newitem; POSTING posting; unsigned u; POSTING *newsetp, *set1p; long newsetc, set1c, set2c; entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt; ptr = invcntl->logblk + entryptr->offset; ptr2 = ((unsigned long *)ptr) + (entryptr->size + (sizeof (long) - 1)) / sizeof (long); *num = entryptr->post; switch (bool) { case OR: case NOT: if (*num == 0) { *num = numitems; return (item); } } /* make room for the new set */ u = 0; switch (bool) { case AND: case NOT: newsetp = set1p = item; break; case OR: u = enditem - item; /* FALLTHROUGH */ case REVERSENOT: u += *num; if (item == item2) { if (u > setsize1) { u += SETINC; if ((item1 = (POSTING *) realloc(item1, u * sizeof (POSTING))) == NULL) { goto cannotalloc; } setsize1 = u; } newitem = item1; } else { if (u > setsize2) { u += SETINC; if ((item2 = (POSTING *)realloc(item2, u * sizeof (POSTING))) == NULL) { cannotalloc: invcannotalloc(u * sizeof (POSTING)); (void) boolready(); *num = -1; return (NULL); } setsize2 = u; } newitem = item2; } set1p = item; newsetp = newitem; } file = invcntl->postfile; (void) fseek(file, (long)*ptr2, SEEK_SET); read_next_posting(invcntl, &posting); newsetc = 0; switch (bool) { case OR: /* while something in both sets */ set1p = item; newsetp = newitem; for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; newsetc++) { if (set1p->lineoffset < posting.lineoffset) { *newsetp++ = *set1p++; set1c++; } else if (set1p->lineoffset > posting.lineoffset) { *newsetp++ = posting; read_next_posting(invcntl, &posting); set2c++; } else if (set1p->type < posting.type) { *newsetp++ = *set1p++; set1c++; } else if (set1p->type > posting.type) { *newsetp++ = posting; read_next_posting(invcntl, &posting); set2c++; } else { /* identical postings */ *newsetp++ = *set1p++; set1c++; read_next_posting(invcntl, &posting); set2c++; } } /* find out what ran out and move the rest in */ if (set1c < numitems) { newsetc += numitems - set1c; while (set1c++ < numitems) { *newsetp++ = *set1p++; } } else { while (set2c++ < *num) { *newsetp++ = posting; newsetc++; read_next_posting(invcntl, &posting); } } item = newitem; break; /* end of OR */ #if 0 case AND: set1c = 0; set2c = 0; while (set1c < numitems && set2c < *num) { if (set1p->lineoffset < posting.lineoffset) { set1p++; set1c++; } else if (set1p->lineoffset > posting.lineoffset) { read_next_posting(invcntl, &posting); set2c++; } else if (set1p->type < posting.type) { *set1p++; set1c++; } else if (set1p->type > posting.type) { read_next_posting(invcntl, &posting); set2c++; } else { /* identical postings */ *newsetp++ = *set1p++; newsetc++; set1c++; read_next_posting(invcntl, &posting); set2c++; } } break; /* end of AND */ case NOT: set1c = 0; set2c = 0; while (set1c < numitems && set2c < *num) { if (set1p->lineoffset < posting.lineoffset) { *newsetp++ = *set1p++; newsetc++; set1c++; } else if (set1p->lineoffset > posting.lineoffset) { read_next_posting(invcntl, &posting); set2c++; } else if (set1p->type < posting.type) { *newsetp++ = *set1p++; newsetc++; set1c++; } else if (set1p->type > posting.type) { read_next_posting(invcntl, &posting); set2c++; } else { /* identical postings */ set1c++; set1p++; read_next_posting(invcntl, &posting); set2c++; } } newsetc += numitems - set1c; while (set1c++ < numitems) { *newsetp++ = *set1p++; } break; /* end of NOT */ case REVERSENOT: /* core NOT incoming set */ set1c = 0; set2c = 0; while (set1c < numitems && set2c < *num) { if (set1p->lineoffset < posting.lineoffset) { set1p++; set1c++; } else if (set1p->lineoffset > posting.lineoffset) { *newsetp++ = posting; read_next_posting(invcntl, &posting); set2c++; } else if (set1p->type < posting.type) { set1p++; set1c++; } else if (set1p->type > posting.type) { *newsetp++ = posting; read_next_posting(invcntl, &posting); set2c++; } else { /* identical postings */ set1c++; set1p++; read_next_posting(invcntl, &posting); set2c++; } } while (set2c++ < *num) { *newsetp++ = posting; newsetc++; read_next_posting(invcntl, &posting); } item = newitem; break; /* end of REVERSENOT */ #endif } numitems = newsetc; *num = newsetc; enditem = (POSTING *)newsetp; return ((POSTING *)item); } #if 0 POSTING * boolsave(int clear) { int i; POSTING *ptr; POSTING *oldstuff, *newstuff; if (numitems == 0) { if (clear) boolclear(); return (NULL); } /* * if clear then give them what we have and use (void) * boolready to realloc */ if (clear) { ptr = item; /* free up the space we didn't give them */ if (item == item1) item1 = NULL; else item2 = NULL; (void) boolready(); return (ptr); } i = (enditem - item) * sizeof (POSTING) + 100; if ((ptr = (POSTING *)malloc(i))r == NULL) { invcannotalloc(i); return (ptr); } /* move present set into place */ oldstuff = item; newstuff = ptr; while (oldstuff < enditem) *newstuff++ = *oldstuff++; return (ptr); } #endif static void invcannotalloc(size_t n) { (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n); } static void invcannotopen(char *file) { (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file); } static void invcannotwrite(char *file) { (void) perror(argv0); /* must be first to preserve errno */ (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file); }