1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*	Copyright (c) 1988 AT&T	*/
23/*	  All Rights Reserved  	*/
24
25
26/*
27 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
28 * Use is subject to license terms.
29 */
30
31#pragma ident	"%Z%%M%	%I%	%E% SMI"
32
33#include <ctype.h>
34#include <stdio.h>
35#include <sys/types.h>
36#include <sys/sysmacros.h>
37#include <sys/byteorder.h>
38#if SHARE
39#include <sys/ipc.h>
40#include <sys/shm.h>
41#define	ERR  -1
42#endif
43#include "invlib.h"
44#include "library.h"
45
46#define	DEBUG		0	/* debugging code and realloc messages */
47#define	BLOCKSIZE	2 * BUFSIZ	/* logical block size */
48#define	LINEMAX		1000	/* sorted posting line max size */
49#define	POSTINC		10000	/* posting buffer size increment */
50#define	SEP		' '	/* sorted posting field separator */
51#define	SETINC		100	/* posting set size increment */
52#define	STATS		0	/* print statistics */
53#define	SUPERINC	10000	/* super index size increment */
54#define	TERMMAX		512	/* term max size */
55#define	VERSION		1	/* inverted index format version */
56#define	ZIPFSIZE	200	/* zipf curve size */
57#define	FREAD	"r"		/* fopen for reading */
58#define	FREADP	"r+"		/* fopen for update */
59#define	FWRITE	"w"		/* fopen truncate or create for writing */
60#define	FWRITEP	"w+"		/* fopen truncate or create for update */
61
62extern	char	*argv0;	/* command name (must be set in main function) */
63
64int	invbreak;
65
66#if STATS
67int	showzipf;	/* show postings per term distribution */
68#endif
69
70static	POSTING	*item, *enditem, *item1 = NULL, *item2 = NULL;
71static	unsigned setsize1, setsize2;
72static	long	numitems, totterm, zerolong;
73static	char	*indexfile, *postingfile;
74static	FILE	*outfile, *fpost;
75static	unsigned supersize = SUPERINC, supintsize;
76static	int	numpost, numlogblk, amtused, nextpost,
77		lastinblk, numinvitems;
78static	POSTING	*POST, *postptr;
79static	unsigned long	*SUPINT, *supint, nextsupfing;
80static	char	*SUPFING, *supfing;
81static	char	thisterm[TERMMAX];
82static	union {
83	long	invblk[BLOCKSIZE / sizeof (long)];
84	char	chrblk[BLOCKSIZE];
85} logicalblk;
86
87#if DEBUG || STATS
88static	long	totpost;
89#endif
90
91#if STATS
92static	int	zipf[ZIPFSIZE + 1];
93#endif
94
95static void invcannotalloc(size_t n);
96static void invcannotopen(char *file);
97static void invcannotwrite(char *file);
98static int invnewterm(void);
99static int boolready(void);
100
101long
102invmake(char *invname, char *invpost, FILE *infile)
103{
104	unsigned char	*s;
105	long	num;
106	int	i;
107	long	fileindex;
108	unsigned postsize = POSTINC * sizeof (POSTING);
109	unsigned long	*intptr;
110	char	line[LINEMAX];
111	long	tlong;
112	PARAM	param;
113	POSTING	posting;
114#if STATS
115	int	j;
116	unsigned maxtermlen = 0;
117#endif
118	/* output file */
119	if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
120		invcannotopen(invname);
121		return (0);
122	}
123	indexfile = invname;
124	(void) fseek(outfile, (long)BUFSIZ, 0);
125
126	/* posting file  */
127	if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
128		invcannotopen(invpost);
129		return (0);
130	}
131	postingfile = invpost;
132	nextpost = 0;
133	/* get space for the postings list */
134	if ((POST = (POSTING *)malloc(postsize)) == NULL) {
135		invcannotalloc(postsize);
136		return (0);
137	}
138	postptr = POST;
139	/* get space for the superfinger (superindex) */
140	if ((SUPFING = malloc(supersize)) == NULL) {
141		invcannotalloc(supersize);
142		return (0);
143	}
144	supfing = SUPFING;
145	supintsize = supersize / 40;
146	/* also for the superfinger index */
147	if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
148		invcannotalloc(supintsize * sizeof (long));
149		return (0);
150	}
151	supint = SUPINT;
152	supint++; /* leave first term open for a count */
153	/* initialize using an empty term */
154	(void) strcpy(thisterm, "");
155	*supint++ = 0;
156	*supfing++ = ' ';
157	*supfing++ = '\0';
158	nextsupfing = 2;
159#if DEBUG || STATS
160	totpost = 0L;
161#endif
162	totterm = 0L;
163	numpost = 1;
164
165	/*
166	 * set up as though a block had come and gone, i.e., set up for
167	 * new block
168	 */
169	amtused = 16; /* leave no space - init 3 words + one for luck */
170	numinvitems = 0;
171	numlogblk = 0;
172	lastinblk = BLOCKSIZE;
173
174	/* now loop as long as more to read (till eof)  */
175	while (fgets(line, LINEMAX, infile) != NULL) {
176#if DEBUG || STATS
177		++totpost;
178#endif
179		s = (unsigned char *) strchr(line, SEP);
180		if (s == NULL)		/* where did this line come from ??? */
181			continue;	/* workaround: just skip it */
182		*s = '\0';
183#if STATS
184		if ((i = strlen(line)) > maxtermlen) {
185			maxtermlen = i;
186		}
187#endif
188#if DEBUG
189		(void) printf("%ld: %s ", totpost, line);
190		(void) fflush(stdout);
191#endif
192		if (strcmp(thisterm, line) == 0) {
193			if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
194				i = postptr - POST;
195				postsize += POSTINC * sizeof (POSTING);
196				if ((POST = realloc(POST, postsize)) == NULL) {
197					invcannotalloc(postsize);
198					return (0);
199				}
200				postptr = i + POST;
201#if DEBUG
202				(void) printf("reallocated post space to %u, "
203				    "totpost=%ld\n", postsize, totpost);
204#endif
205			}
206			numpost++;
207		} else {
208			/* have a new term */
209			if (!invnewterm()) {
210				return (0);
211			}
212			(void) strcpy(thisterm, line);
213			numpost = 1;
214			postptr = POST;
215			fileindex = 0;
216		}
217		/* get the new posting */
218		num = *++s - '!';
219		i = 1;
220		do {
221			num = BASE * num + *++s - '!';
222		} while (++i < PRECISION);
223		posting.lineoffset = num;
224		while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
225			;
226		}
227		posting.fileindex = --fileindex;
228		posting.type = *++s;
229		num = *++s - '!';
230		if (*s != '\n') {
231			num = *++s - '!';
232			while (*++s != '\n') {
233				num = BASE * num + *s - '!';
234			}
235			posting.fcnoffset = num;
236		} else {
237			posting.fcnoffset = 0;
238		}
239		*postptr++ = posting;
240#if DEBUG
241		(void) printf("%ld %ld %ld %ld\n", posting.fileindex,
242		    posting.fcnoffset, posting.lineoffset, posting.type);
243		(void) fflush(stdout);
244#endif
245	}
246	if (!invnewterm()) {
247		return (0);
248	}
249	/* now clean up final block  */
250	logicalblk.invblk[0] = numinvitems;
251	/* loops pointer around to start */
252	logicalblk.invblk[1] = 0;
253	logicalblk.invblk[2] = numlogblk - 1;
254	if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
255		goto cannotwrite;
256	}
257	numlogblk++;
258	/* write out block to save space. what in it doesn't matter */
259	if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
260		goto cannotwrite;
261	}
262	/* finish up the super finger */
263	*SUPINT = numlogblk;
264	/* add to the offsets the size of the offset pointers */
265	intptr = (SUPINT + 1);
266	i = (char *)supint - (char *)SUPINT;
267	while (intptr < supint)
268		*intptr++ += i;
269	/* write out the offsets (1 for the N at start) and the super finger */
270	if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
271	    outfile) == 0 ||
272	    fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
273		goto cannotwrite;
274	}
275	/* save the size for reference later */
276	nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
277	    (supfing - SUPFING);
278	/*
279	 * make sure the file ends at a logical block boundary.  This is
280	 * necessary for invinsert to correctly create extended blocks
281	 */
282	i = nextsupfing % BLOCKSIZE;
283	/* write out junk to fill log blk */
284	if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
285	    fflush(outfile) == EOF) {
286		/* rewind doesn't check for write failure */
287		goto cannotwrite;
288	}
289	/* write the control area */
290	rewind(outfile);
291	param.version = VERSION;
292	param.filestat = 0;
293	param.sizeblk = BLOCKSIZE;
294	param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
295	param.supsize = nextsupfing;
296	param.cntlsize = BUFSIZ;
297	param.share = 0;
298	if (fwrite((char *)&param, sizeof (param), 1, outfile) == 0) {
299		goto cannotwrite;
300	}
301	for (i = 0; i < 10; i++)	/* for future use */
302		if (fwrite((char *)&zerolong, sizeof (zerolong),
303		    1, outfile) == 0) {
304			goto cannotwrite;
305		}
306
307	/* make first block loop backwards to last block */
308	if (fflush(outfile) == EOF) {
309		/* fseek doesn't check for write failure */
310		goto cannotwrite;
311	}
312	/* get to second word first block */
313	(void) fseek(outfile, (long)BUFSIZ + 8, 0);
314	tlong = numlogblk - 1;
315	if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
316	    fclose(outfile) == EOF) {
317cannotwrite:
318		invcannotwrite(invname);
319		return (0);
320	}
321	if (fclose(fpost) == EOF) {
322		invcannotwrite(postingfile);
323		return (0);
324	}
325	--totterm;	/* don't count null term */
326#if STATS
327	(void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
328	    "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
329	if (showzipf) {
330		(void) printf(
331		    "\n*************   ZIPF curve  ****************\n");
332		for (j = ZIPFSIZE; j > 1; j--)
333			if (zipf[j])
334				break;
335		for (i = 1; i < j; ++i) {
336			(void) printf("%3d -%6d ", i, zipf[i]);
337			if (i % 6 == 0) (void) putchar('\n');
338		}
339		(void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
340	}
341#endif
342	/* free all malloc'd memory */
343	free(POST);
344	free(SUPFING);
345	free(SUPINT);
346	return (totterm);
347}
348
349/* add a term to the data base */
350
351static int
352invnewterm(void)
353{
354	int	backupflag, i, j, maxback, holditems, gooditems, howfar;
355	int	len, numwilluse, wdlen;
356	char	*tptr, *tptr2, *tptr3;
357	union {
358		unsigned long	packword[2];
359		ENTRY		e;
360	} iteminfo;
361
362	totterm++;
363#if STATS
364	/* keep zipfian info on the distribution */
365	if (numpost <= ZIPFSIZE)
366		zipf[numpost]++;
367	else
368		zipf[0]++;
369#endif
370	len = strlen(thisterm);
371	wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
372	numwilluse = (wdlen + 3) * sizeof (long);
373	/* new block if at least 1 item in block */
374	if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
375		/* set up new block */
376		if (supfing + 500 > SUPFING + supersize) {
377			i = supfing - SUPFING;
378			supersize += 20000;
379			if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
380				invcannotalloc(supersize);
381				return (0);
382			}
383			supfing = i + SUPFING;
384#if DEBUG
385			(void) printf("reallocated superfinger space to %d, "
386			    "totpost=%ld\n", supersize, totpost);
387#endif
388		}
389		/* check that room for the offset as well */
390		if ((numlogblk + 10) > supintsize) {
391			i = supint - SUPINT;
392			supintsize += SUPERINC;
393			if ((SUPINT = realloc((char *)SUPINT,
394			    supintsize * sizeof (long))) == NULL) {
395				invcannotalloc(supintsize * sizeof (long));
396				return (0);
397			}
398			supint = i + SUPINT;
399#if DEBUG
400			(void) printf("reallocated superfinger offset to %d, "
401			    "totpost = %ld\n", supintsize * sizeof (long),
402			    totpost);
403#endif
404		}
405		/* See if backup is efficatious  */
406		backupflag = 0;
407		maxback = strlen(thisterm) / 10;
408		holditems = numinvitems;
409		if (maxback > numinvitems)
410			maxback = numinvitems - 2;
411		howfar = 0;
412		while (--maxback > 0) {
413			howfar++;
414			iteminfo.packword[0] =
415			    logicalblk.invblk[--holditems * 2 +
416			    (sizeof (long) - 1)];
417			if ((i = iteminfo.e.size / 10) < maxback) {
418				maxback = i;
419				backupflag = howfar;
420				gooditems = holditems;
421				tptr2 = logicalblk.chrblk + iteminfo.e.offset;
422			}
423		}
424		/* see if backup will occur  */
425		if (backupflag) {
426			numinvitems = gooditems;
427		}
428		logicalblk.invblk[0] = numinvitems;
429		/* set forward pointer pointing to next */
430		logicalblk.invblk[1] = numlogblk + 1;
431		/* set back pointer to last block */
432		logicalblk.invblk[2] = numlogblk - 1;
433		if (fwrite((char *)logicalblk.chrblk, 1,
434		    BLOCKSIZE, outfile) == 0) {
435			invcannotwrite(indexfile);
436			return (0);
437		}
438		amtused = 16;
439		numlogblk++;
440		/* check if had to back up, if so do it */
441		if (backupflag) {
442			/* find out where the end of the new block is */
443			iteminfo.packword[0] =
444			    logicalblk.invblk[numinvitems * 2 + 1];
445			tptr3 = logicalblk.chrblk + iteminfo.e.offset;
446			/* move the index for this block */
447			for (i = 3; i <= (backupflag * 2 + 2); i++) {
448				logicalblk.invblk[i] =
449				    logicalblk.invblk[numinvitems * 2+i];
450			}
451			/* move the word into the super index */
452			iteminfo.packword[0] = logicalblk.invblk[3];
453			iteminfo.packword[1] = logicalblk.invblk[4];
454			tptr2 = logicalblk.chrblk + iteminfo.e.offset;
455			(void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
456			*(supfing + iteminfo.e.size) = '\0';
457#if DEBUG
458			(void) printf("backup %d at term=%s to term=%s\n",
459			    backupflag, thisterm, supfing);
460#endif
461			*supint++ = nextsupfing;
462			nextsupfing += strlen(supfing) + 1;
463			supfing += strlen(supfing) + 1;
464			/* now fix up the logical block */
465			tptr = logicalblk.chrblk + lastinblk;
466			lastinblk = BLOCKSIZE;
467			tptr2 = logicalblk.chrblk + lastinblk;
468			j = tptr3 - tptr;
469			while (tptr3 > tptr)
470				*--tptr2 = *--tptr3;
471			lastinblk -= j;
472			amtused += (8 * backupflag + j);
473			for (i = 3; i < (backupflag * 2 + 2); i += 2) {
474				iteminfo.packword[0] = logicalblk.invblk[i];
475				iteminfo.e.offset += (tptr2 - tptr3);
476				logicalblk.invblk[i] = iteminfo.packword[0];
477			}
478			numinvitems = backupflag;
479		} else { /* no backup needed */
480			numinvitems = 0;
481			lastinblk = BLOCKSIZE;
482			/* add new term to superindex */
483			(void) strcpy(supfing, thisterm);
484			supfing += strlen(thisterm) + 1;
485			*supint++ = nextsupfing;
486			nextsupfing += strlen(thisterm) + 1;
487		}
488	}
489	lastinblk -= (numwilluse - 8);
490	iteminfo.e.offset = lastinblk;
491	iteminfo.e.size = (char)len;
492	iteminfo.e.space = 0;
493	iteminfo.e.post = numpost;
494	(void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
495	amtused += numwilluse;
496	logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
497	if ((i = postptr - POST) > 0) {
498		if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
499			invcannotwrite(postingfile);
500			return (0);
501		}
502		nextpost += i * sizeof (POSTING);
503	}
504	logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
505	logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
506	return (1);
507}
508
509static void
510swap_ints(void *p, size_t sz)
511{
512	uint32_t *s;
513	uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
514
515	for (s = p; s < e; s++)
516		*s = BSWAP_32(*s);
517}
518
519static void
520write_param(INVCONTROL *invcntl)
521{
522	if (invcntl->swap)
523		swap_ints(&invcntl->param, sizeof (invcntl->param));
524
525	rewind(invcntl->invfile);
526	(void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
527	    invcntl->invfile);
528
529	if (invcntl->swap)
530		swap_ints(&invcntl->param, sizeof (invcntl->param));
531}
532
533static void
534read_superfinger(INVCONTROL *invcntl)
535{
536	size_t count;
537
538	(void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
539	(void) fread(invcntl->iindex, (int)invcntl->param.supsize,
540	    1, invcntl->invfile);
541
542	if (invcntl->swap) {
543		/*
544		 * The superfinger consists of a count, followed by
545		 * count offsets, followed by a string table (which
546		 * the offsets reference).
547		 *
548		 * We need to swap the count and the offsets.
549		 */
550		count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
551		swap_ints(invcntl->iindex, count * sizeof (unsigned long));
552	}
553}
554
555static void
556read_logblock(INVCONTROL *invcntl, int block)
557{
558	/* note always fetch it if the file is busy */
559	if ((block != invcntl->numblk) ||
560	    (invcntl->param.filestat >= INVBUSY)) {
561		(void) fseek(invcntl->invfile,
562		    (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
563		    SEEK_SET);
564		invcntl->numblk = block;
565		(void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
566		    1, invcntl->invfile);
567
568		if (invcntl->swap) {
569			size_t count;
570			ENTRY *ecur, *eend;
571			uint32_t *postptr;
572
573			/*
574			 * A logblock consists of a count, a next block id,
575			 * and a previous block id, followed by count
576			 * ENTRYs, followed by alternating strings and
577			 * offsets.
578			 */
579			swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
580
581			count = *(uint32_t *)invcntl->logblk;
582
583			ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
584			eend = ecur + count;
585
586			for (; ecur < eend; ecur++) {
587				ecur->offset = BSWAP_16(ecur->offset);
588				ecur->post = BSWAP_32(ecur->post);
589
590				/*
591				 * After the string is the posting offset.
592				 */
593				postptr = (uint32_t *)(invcntl->logblk +
594				    ecur->offset +
595				    P2ROUNDUP(ecur->size, sizeof (long)));
596
597				*postptr = BSWAP_32(*postptr);
598			}
599		}
600	}
601}
602
603void
604read_next_posting(INVCONTROL *invcntl, POSTING *posting)
605{
606	(void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
607	if (invcntl->swap) {
608		posting->lineoffset = BSWAP_32(posting->lineoffset);
609		posting->fcnoffset = BSWAP_32(posting->fcnoffset);
610		/*
611		 * fileindex is a 24-bit field, so shift it before swapping
612		 */
613		posting->fileindex = BSWAP_32(posting->fileindex << 8);
614	}
615}
616
617int
618invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
619{
620	int	read_index;
621
622	if ((invcntl->invfile = vpfopen(invname,
623	    ((stat == 0) ? FREAD : FREADP))) == NULL) {
624		invcannotopen(invname);
625		return (-1);
626	}
627	if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
628	    invcntl->invfile) == 0) {
629		(void) fprintf(stderr, "%s: empty inverted file\n", argv0);
630		goto closeinv;
631	}
632	if (invcntl->param.version != VERSION &&
633	    BSWAP_32(invcntl->param.version) != VERSION) {
634		(void) fprintf(stderr,
635		    "%s: cannot read old index format; use -U option to "
636		    "force database to rebuild\n", argv0);
637		goto closeinv;
638	}
639	invcntl->swap = (invcntl->param.version != VERSION);
640	if (invcntl->swap)
641		swap_ints(&invcntl->param, sizeof (invcntl->param));
642
643	if (stat == 0 && invcntl->param.filestat == INVALONE) {
644		(void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
645		goto closeinv;
646	}
647	if ((invcntl->postfile = vpfopen(invpost,
648	    ((stat == 0) ? FREAD : FREADP))) == NULL) {
649		invcannotopen(invpost);
650		goto closeinv;
651	}
652	/* allocate core for a logical block  */
653	if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
654		invcannotalloc((unsigned)invcntl->param.sizeblk);
655		goto closeboth;
656	}
657	/* allocate for and read in superfinger  */
658	read_index = 1;
659	invcntl->iindex = NULL;
660#if SHARE
661	if (invcntl->param.share == 1) {
662		key_t ftok(), shm_key;
663		struct shmid_ds shm_buf;
664		char	*shmat();
665		int	shm_id;
666
667		/* see if the shared segment exists */
668		shm_key = ftok(invname, 2);
669		shm_id = shmget(shm_key, 0, 0);
670		/*
671		 * Failure simply means (hopefully) that segment doesn't
672		 * exist
673		 */
674		if (shm_id == -1) {
675			/*
676			 * Have to give general write permission due to AMdahl
677			 * not having protected segments
678			 */
679			shm_id = shmget(shm_key,
680			    invcntl->param.supsize + sizeof (long),
681			    IPC_CREAT | 0666);
682			if (shm_id == -1)
683				perror("Could not create shared "
684				    "memory segment");
685		} else
686			read_index = 0;
687
688		if (shm_id != -1) {
689			invcntl->iindex = shmat(shm_id, 0,
690			    ((read_index) ? 0 : SHM_RDONLY));
691			if (invcntl->iindex == (char *)ERR) {
692				(void) fprintf(stderr,
693				    "%s: shared memory link failed\n", argv0);
694				invcntl->iindex = NULL;
695				read_index = 1;
696			}
697		}
698	}
699#endif
700	if (invcntl->iindex == NULL)
701		invcntl->iindex = malloc(invcntl->param.supsize + 16);
702	if (invcntl->iindex == NULL) {
703		invcannotalloc((unsigned)invcntl->param.supsize);
704		free(invcntl->logblk);
705		goto closeboth;
706	}
707	if (read_index) {
708		read_superfinger(invcntl);
709	}
710	invcntl->numblk = -1;
711	if (boolready() == -1) {
712	closeboth:
713		(void) fclose(invcntl->postfile);
714	closeinv:
715		(void) fclose(invcntl->invfile);
716		return (-1);
717	}
718	/* write back out the control block if anything changed */
719	invcntl->param.filestat = stat;
720	if (stat > invcntl->param.filestat)
721		write_param(invcntl);
722	return (1);
723}
724
725/* invclose must be called to wrap things up and deallocate core */
726void
727invclose(INVCONTROL *invcntl)
728{
729	/* write out the control block in case anything changed */
730	if (invcntl->param.filestat > 0) {
731		invcntl->param.filestat = 0;
732		write_param(invcntl);
733	}
734	(void) fclose(invcntl->invfile);
735	(void) fclose(invcntl->postfile);
736#if SHARE
737	if (invcntl->param.share > 0) {
738		shmdt(invcntl->iindex);
739		invcntl->iindex = NULL;
740	}
741#endif
742	if (invcntl->iindex != NULL)
743		free(invcntl->iindex);
744	free(invcntl->logblk);
745}
746
747/* invstep steps the inverted file forward one item */
748void
749invstep(INVCONTROL *invcntl)
750{
751	if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
752		invcntl->keypnt++;
753		return;
754	}
755
756	/* move forward a block else wrap */
757	read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
758
759	invcntl->keypnt = 0;
760}
761
762/* invforward moves forward one term in the inverted file  */
763int
764invforward(INVCONTROL *invcntl)
765{
766	invstep(invcntl);
767	/* skip things with 0 postings */
768	while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
769		invstep(invcntl);
770	}
771	/* Check for having wrapped - reached start of inverted file! */
772	if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
773		return (0);
774	return (1);
775}
776
777/*  invterm gets the present term from the present logical block  */
778int
779invterm(INVCONTROL *invcntl, char *term)
780{
781	ENTRY * entryptr;
782
783	entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
784	(void) strncpy(term, entryptr->offset + invcntl->logblk,
785	    (int)entryptr->size);
786	*(term + entryptr->size) = '\0';
787	return (entryptr->post);
788}
789
790/* invfind searches for an individual item in the inverted file */
791long
792invfind(INVCONTROL *invcntl, char *searchterm)
793{
794	int	imid, ilow, ihigh;
795	long	num;
796	int	i;
797	unsigned long	*intptr, *intptr2;
798	ENTRY *entryptr;
799
800	/* make sure it is initialized via invready  */
801	if (invcntl->invfile == 0)
802		return (-1L);
803
804	/* now search for the appropriate finger block */
805	intptr = (unsigned long *)invcntl->iindex;
806
807	ilow = 0;
808	ihigh = *intptr++ - 1;
809	while (ilow <= ihigh) {
810		imid = (ilow + ihigh) / 2;
811		intptr2 = intptr + imid;
812		i = strcmp(searchterm, (invcntl->iindex + *intptr2));
813		if (i < 0)
814			ihigh = imid - 1;
815		else if (i > 0)
816			ilow = ++imid;
817		else {
818			ilow = imid + 1;
819			break;
820		}
821	}
822	/* be careful about case where searchterm is after last in this block */
823	imid = (ilow) ? ilow - 1 : 0;
824
825	/* fetch the appropriate logical block if not in core  */
826	read_logblock(invcntl, imid);
827
828srch_ext:
829	/* now find the term in this block. tricky this  */
830	intptr = (unsigned long *)invcntl->logblk;
831
832	ilow = 0;
833	ihigh = *intptr - 1;
834	intptr += 3;
835	num = 0;
836	while (ilow <= ihigh) {
837		imid = (ilow + ihigh) / 2;
838		entryptr = (ENTRY *)intptr + imid;
839		i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
840		    (int)entryptr->size);
841		if (i == 0)
842			i = strlen(searchterm) - entryptr->size;
843		if (i < 0)
844			ihigh = imid - 1;
845		else if (i > 0)
846			ilow = ++imid;
847		else {
848			num = entryptr->post;
849			break;
850		}
851	}
852	/* be careful about case where searchterm is after last in this block */
853	if (imid >= *(long *)invcntl->logblk) {
854		invcntl->keypnt = *(long *)invcntl->logblk;
855		invstep(invcntl);
856		/* note if this happens the term could be in extended block */
857		if (invcntl->param.startbyte <
858		    invcntl->numblk * invcntl->param.sizeblk)
859			goto srch_ext;
860	} else
861		invcntl->keypnt = imid;
862	return (num);
863}
864
865#if DEBUG
866
867/* invdump dumps the block the term parameter is in */
868void
869invdump(INVCONTROL *invcntl, char *term)
870{
871	long	i, j, n, *longptr;
872	ENTRY * entryptr;
873	char	temp[512], *ptr;
874
875	/* dump superindex if term is "-"  */
876	if (*term == '-') {
877		j = atoi(term + 1);
878		longptr = (long *)invcntl->iindex;
879		n = *longptr++;
880		(void) printf("Superindex dump, num blocks=%ld\n", n);
881		longptr += j;
882		while ((longptr <= ((long *)invcntl->iindex) + n) &&
883		    invbreak == 0) {
884			(void) printf("%2ld  %6ld %s\n", j++, *longptr,
885			    invcntl->iindex + *longptr);
886			longptr++;
887		}
888		return;
889	} else if (*term == '#') {
890		j = atoi(term + 1);
891		/* fetch the appropriate logical block */
892		read_logblock(invcntl, j);
893	} else
894		i = abs((int)invfind(invcntl, term));
895	longptr = (long *)invcntl->logblk;
896	n = *longptr++;
897	(void) printf("Entry term to invdump=%s, postings=%ld, "
898	    "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
899	    *(longptr + 1));
900	entryptr = (ENTRY *)(invcntl->logblk + 12);
901	(void) printf("%ld terms in this block, block=%ld\n", n,
902	    invcntl->numblk);
903	(void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
904	for (j = 0; j < n && invbreak == 0; j++) {
905		ptr = invcntl->logblk + entryptr->offset;
906		(void) strncpy(temp, ptr, (int)entryptr->size);
907		temp[entryptr->size] = '\0';
908		ptr += (sizeof (long) *
909		    (long)((entryptr->size +
910		    (sizeof (long) - 1)) / sizeof (long)));
911		(void) printf("%2ld  %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
912		    entryptr->post, entryptr->size, entryptr->offset,
913		    entryptr->space, *(long *)ptr);
914		entryptr++;
915	}
916}
917#endif
918
919static int
920boolready(void)
921{
922	numitems = 0;
923	if (item1 != NULL)
924		free(item1);
925	setsize1 = SETINC;
926	if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
927		invcannotalloc(SETINC);
928		return (-1);
929	}
930	if (item2 != NULL)
931		free(item2);
932	setsize2 = SETINC;
933	if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
934		invcannotalloc(SETINC);
935		return (-1);
936	}
937	item = item1;
938	enditem = item;
939	return (0);
940}
941
942void
943boolclear(void)
944{
945	numitems = 0;
946	item = item1;
947	enditem = item;
948}
949
950POSTING *
951boolfile(INVCONTROL *invcntl, long *num, int bool)
952{
953	ENTRY	*entryptr;
954	FILE	*file;
955	char	*ptr;
956	unsigned long	*ptr2;
957	POSTING	*newitem;
958	POSTING	posting;
959	unsigned u;
960	POSTING *newsetp, *set1p;
961	long	newsetc, set1c, set2c;
962
963	entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
964	ptr = invcntl->logblk + entryptr->offset;
965	ptr2 = ((unsigned long *)ptr) +
966	    (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
967	*num = entryptr->post;
968	switch (bool) {
969	case OR:
970	case NOT:
971		if (*num == 0) {
972			*num = numitems;
973			return (item);
974		}
975	}
976	/* make room for the new set */
977	u = 0;
978	switch (bool) {
979	case AND:
980	case NOT:
981		newsetp = set1p = item;
982		break;
983
984	case OR:
985		u = enditem - item;
986		/* FALLTHROUGH */
987	case REVERSENOT:
988		u += *num;
989		if (item == item2) {
990			if (u > setsize1) {
991				u += SETINC;
992				if ((item1 = (POSTING *) realloc(item1,
993				    u * sizeof (POSTING))) == NULL) {
994					goto cannotalloc;
995				}
996				setsize1 = u;
997			}
998			newitem = item1;
999		} else {
1000			if (u > setsize2) {
1001				u += SETINC;
1002				if ((item2 = (POSTING *)realloc(item2,
1003				    u * sizeof (POSTING))) == NULL) {
1004				cannotalloc:
1005					invcannotalloc(u * sizeof (POSTING));
1006					(void) boolready();
1007					*num = -1;
1008					return (NULL);
1009				}
1010				setsize2 = u;
1011			}
1012			newitem = item2;
1013		}
1014		set1p = item;
1015		newsetp = newitem;
1016	}
1017	file = invcntl->postfile;
1018	(void) fseek(file, (long)*ptr2, SEEK_SET);
1019	read_next_posting(invcntl, &posting);
1020	newsetc = 0;
1021	switch (bool) {
1022	case OR:
1023		/* while something in both sets */
1024		set1p = item;
1025		newsetp = newitem;
1026		for (set1c = 0, set2c = 0;
1027		    set1c < numitems && set2c < *num; newsetc++) {
1028			if (set1p->lineoffset < posting.lineoffset) {
1029				*newsetp++ = *set1p++;
1030				set1c++;
1031			} else if (set1p->lineoffset > posting.lineoffset) {
1032				*newsetp++ = posting;
1033				read_next_posting(invcntl, &posting);
1034				set2c++;
1035			} else if (set1p->type < posting.type) {
1036				*newsetp++ = *set1p++;
1037				set1c++;
1038			} else if (set1p->type > posting.type) {
1039				*newsetp++ = posting;
1040				read_next_posting(invcntl, &posting);
1041				set2c++;
1042			} else {	/* identical postings */
1043				*newsetp++ = *set1p++;
1044				set1c++;
1045				read_next_posting(invcntl, &posting);
1046				set2c++;
1047			}
1048		}
1049		/* find out what ran out and move the rest in */
1050		if (set1c < numitems) {
1051			newsetc += numitems - set1c;
1052			while (set1c++ < numitems) {
1053				*newsetp++ = *set1p++;
1054			}
1055		} else {
1056			while (set2c++ < *num) {
1057				*newsetp++ = posting;
1058				newsetc++;
1059				read_next_posting(invcntl, &posting);
1060			}
1061		}
1062		item = newitem;
1063		break; /* end of OR */
1064#if 0
1065	case AND:
1066		set1c = 0;
1067		set2c = 0;
1068		while (set1c < numitems && set2c < *num) {
1069			if (set1p->lineoffset < posting.lineoffset) {
1070				set1p++;
1071				set1c++;
1072			} else if (set1p->lineoffset > posting.lineoffset) {
1073				read_next_posting(invcntl, &posting);
1074				set2c++;
1075			} else if (set1p->type < posting.type)  {
1076				*set1p++;
1077				set1c++;
1078			} else if (set1p->type > posting.type) {
1079				read_next_posting(invcntl, &posting);
1080				set2c++;
1081			} else {	/* identical postings */
1082				*newsetp++ = *set1p++;
1083				newsetc++;
1084				set1c++;
1085				read_next_posting(invcntl, &posting);
1086				set2c++;
1087			}
1088		}
1089		break; /* end of AND */
1090
1091	case NOT:
1092		set1c = 0;
1093		set2c = 0;
1094		while (set1c < numitems && set2c < *num) {
1095			if (set1p->lineoffset < posting.lineoffset) {
1096				*newsetp++ = *set1p++;
1097				newsetc++;
1098				set1c++;
1099			} else if (set1p->lineoffset > posting.lineoffset) {
1100				read_next_posting(invcntl, &posting);
1101				set2c++;
1102			} else if (set1p->type < posting.type) {
1103				*newsetp++ = *set1p++;
1104				newsetc++;
1105				set1c++;
1106			} else if (set1p->type > posting.type) {
1107				read_next_posting(invcntl, &posting);
1108				set2c++;
1109			} else {	/* identical postings */
1110				set1c++;
1111				set1p++;
1112				read_next_posting(invcntl, &posting);
1113				set2c++;
1114			}
1115		}
1116		newsetc += numitems - set1c;
1117		while (set1c++ < numitems) {
1118			*newsetp++ = *set1p++;
1119		}
1120		break; /* end of NOT */
1121
1122	case REVERSENOT:  /* core NOT incoming set */
1123		set1c = 0;
1124		set2c = 0;
1125		while (set1c < numitems && set2c < *num) {
1126			if (set1p->lineoffset < posting.lineoffset) {
1127				set1p++;
1128				set1c++;
1129			} else if (set1p->lineoffset > posting.lineoffset) {
1130				*newsetp++ = posting;
1131				read_next_posting(invcntl, &posting);
1132				set2c++;
1133			} else if (set1p->type < posting.type) {
1134				set1p++;
1135				set1c++;
1136			} else if (set1p->type > posting.type) {
1137				*newsetp++ = posting;
1138				read_next_posting(invcntl, &posting);
1139				set2c++;
1140			} else {	/* identical postings */
1141				set1c++;
1142				set1p++;
1143				read_next_posting(invcntl, &posting);
1144				set2c++;
1145			}
1146		}
1147		while (set2c++ < *num) {
1148			*newsetp++ = posting;
1149			newsetc++;
1150			read_next_posting(invcntl, &posting);
1151		}
1152		item = newitem;
1153		break; /* end of REVERSENOT  */
1154#endif
1155	}
1156	numitems = newsetc;
1157	*num = newsetc;
1158	enditem = (POSTING *)newsetp;
1159	return ((POSTING *)item);
1160}
1161
1162#if 0
1163POSTING *
1164boolsave(int clear)
1165{
1166	int	i;
1167	POSTING	*ptr;
1168	POSTING	*oldstuff, *newstuff;
1169
1170	if (numitems == 0) {
1171		if (clear)
1172			boolclear();
1173		return (NULL);
1174	}
1175	/*
1176	 * if clear then give them what we have and use (void)
1177	 * boolready to realloc
1178	 */
1179	if (clear) {
1180		ptr = item;
1181		/* free up the space we didn't give them */
1182		if (item == item1)
1183			item1 = NULL;
1184		else
1185			item2 = NULL;
1186		(void) boolready();
1187		return (ptr);
1188	}
1189	i = (enditem - item) * sizeof (POSTING) + 100;
1190	if ((ptr = (POSTING *)malloc(i))r == NULL) {
1191		invcannotalloc(i);
1192		return (ptr);
1193	}
1194	/* move present set into place  */
1195	oldstuff = item;
1196	newstuff = ptr;
1197	while (oldstuff < enditem)
1198		*newstuff++ = *oldstuff++;
1199	return (ptr);
1200}
1201#endif
1202
1203static void
1204invcannotalloc(size_t n)
1205{
1206	(void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1207}
1208
1209static void
1210invcannotopen(char *file)
1211{
1212	(void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1213}
1214
1215static void
1216invcannotwrite(char *file)
1217{
1218	(void) perror(argv0);	/* must be first to preserve errno */
1219	(void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1220}
1221