1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*	Copyright (c) 1988 AT&T	*/
23 /*	  All Rights Reserved  	*/
24 
25 
26 /*
27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #include <ctype.h>
32 #include <stdio.h>
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/byteorder.h>
36 #if SHARE
37 #include <sys/ipc.h>
38 #include <sys/shm.h>
39 #define	ERR  -1
40 #endif
41 #include "invlib.h"
42 #include "library.h"
43 
44 #define	DEBUG		0	/* debugging code and realloc messages */
45 #define	BLOCKSIZE	2 * BUFSIZ	/* logical block size */
46 #define	LINEMAX		1000	/* sorted posting line max size */
47 #define	POSTINC		10000	/* posting buffer size increment */
48 #define	SEP		' '	/* sorted posting field separator */
49 #define	SETINC		100	/* posting set size increment */
50 #define	STATS		0	/* print statistics */
51 #define	SUPERINC	10000	/* super index size increment */
52 #define	TERMMAX		512	/* term max size */
53 #define	VERSION		1	/* inverted index format version */
54 #define	ZIPFSIZE	200	/* zipf curve size */
55 #define	FREAD	"r"		/* fopen for reading */
56 #define	FREADP	"r+"		/* fopen for update */
57 #define	FWRITE	"w"		/* fopen truncate or create for writing */
58 #define	FWRITEP	"w+"		/* fopen truncate or create for update */
59 
60 extern	char	*argv0;	/* command name (must be set in main function) */
61 
62 int	invbreak;
63 
64 #if STATS
65 int	showzipf;	/* show postings per term distribution */
66 #endif
67 
68 static	POSTING	*item, *enditem, *item1 = NULL, *item2 = NULL;
69 static	unsigned setsize1, setsize2;
70 static	long	numitems, totterm, zerolong;
71 static	char	*indexfile, *postingfile;
72 static	FILE	*outfile, *fpost;
73 static	unsigned supersize = SUPERINC, supintsize;
74 static	int	numpost, numlogblk, amtused, nextpost,
75 		lastinblk, numinvitems;
76 static	POSTING	*POST, *postptr;
77 static	unsigned long	*SUPINT, *supint, nextsupfing;
78 static	char	*SUPFING, *supfing;
79 static	char	thisterm[TERMMAX];
80 static	union {
81 	long	invblk[BLOCKSIZE / sizeof (long)];
82 	char	chrblk[BLOCKSIZE];
83 } logicalblk;
84 
85 #if DEBUG || STATS
86 static	long	totpost;
87 #endif
88 
89 #if STATS
90 static	int	zipf[ZIPFSIZE + 1];
91 #endif
92 
93 static void invcannotalloc(size_t n);
94 static void invcannotopen(char *file);
95 static void invcannotwrite(char *file);
96 static int invnewterm(void);
97 static int boolready(void);
98 
99 long
invmake(char * invname,char * invpost,FILE * infile)100 invmake(char *invname, char *invpost, FILE *infile)
101 {
102 	unsigned char	*s;
103 	long	num;
104 	int	i;
105 	long	fileindex;
106 	unsigned postsize = POSTINC * sizeof (POSTING);
107 	unsigned long	*intptr;
108 	char	line[LINEMAX];
109 	long	tlong;
110 	PARAM	param;
111 	POSTING	posting;
112 #if STATS
113 	int	j;
114 	unsigned maxtermlen = 0;
115 #endif
116 	/* output file */
117 	if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
118 		invcannotopen(invname);
119 		return (0);
120 	}
121 	indexfile = invname;
122 	(void) fseek(outfile, (long)BUFSIZ, 0);
123 
124 	/* posting file  */
125 	if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
126 		invcannotopen(invpost);
127 		return (0);
128 	}
129 	postingfile = invpost;
130 	nextpost = 0;
131 	/* get space for the postings list */
132 	if ((POST = (POSTING *)malloc(postsize)) == NULL) {
133 		invcannotalloc(postsize);
134 		return (0);
135 	}
136 	postptr = POST;
137 	/* get space for the superfinger (superindex) */
138 	if ((SUPFING = malloc(supersize)) == NULL) {
139 		invcannotalloc(supersize);
140 		return (0);
141 	}
142 	supfing = SUPFING;
143 	supintsize = supersize / 40;
144 	/* also for the superfinger index */
145 	if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
146 		invcannotalloc(supintsize * sizeof (long));
147 		return (0);
148 	}
149 	supint = SUPINT;
150 	supint++; /* leave first term open for a count */
151 	/* initialize using an empty term */
152 	(void) strcpy(thisterm, "");
153 	*supint++ = 0;
154 	*supfing++ = ' ';
155 	*supfing++ = '\0';
156 	nextsupfing = 2;
157 #if DEBUG || STATS
158 	totpost = 0L;
159 #endif
160 	totterm = 0L;
161 	numpost = 1;
162 
163 	/*
164 	 * set up as though a block had come and gone, i.e., set up for
165 	 * new block
166 	 */
167 	amtused = 16; /* leave no space - init 3 words + one for luck */
168 	numinvitems = 0;
169 	numlogblk = 0;
170 	lastinblk = BLOCKSIZE;
171 
172 	/* now loop as long as more to read (till eof)  */
173 	while (fgets(line, LINEMAX, infile) != NULL) {
174 #if DEBUG || STATS
175 		++totpost;
176 #endif
177 		s = (unsigned char *) strchr(line, SEP);
178 		if (s == NULL)		/* where did this line come from ??? */
179 			continue;	/* workaround: just skip it */
180 		*s = '\0';
181 #if STATS
182 		if ((i = strlen(line)) > maxtermlen) {
183 			maxtermlen = i;
184 		}
185 #endif
186 #if DEBUG
187 		(void) printf("%ld: %s ", totpost, line);
188 		(void) fflush(stdout);
189 #endif
190 		if (strcmp(thisterm, line) == 0) {
191 			if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
192 				i = postptr - POST;
193 				postsize += POSTINC * sizeof (POSTING);
194 				if ((POST = realloc(POST, postsize)) == NULL) {
195 					invcannotalloc(postsize);
196 					return (0);
197 				}
198 				postptr = i + POST;
199 #if DEBUG
200 				(void) printf("reallocated post space to %u, "
201 				    "totpost=%ld\n", postsize, totpost);
202 #endif
203 			}
204 			numpost++;
205 		} else {
206 			/* have a new term */
207 			if (!invnewterm()) {
208 				return (0);
209 			}
210 			(void) strcpy(thisterm, line);
211 			numpost = 1;
212 			postptr = POST;
213 			fileindex = 0;
214 		}
215 		/* get the new posting */
216 		num = *++s - '!';
217 		i = 1;
218 		do {
219 			num = BASE * num + *++s - '!';
220 		} while (++i < PRECISION);
221 		posting.lineoffset = num;
222 		while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
223 			;
224 		}
225 		posting.fileindex = --fileindex;
226 		posting.type = *++s;
227 		num = *++s - '!';
228 		if (*s != '\n') {
229 			num = *++s - '!';
230 			while (*++s != '\n') {
231 				num = BASE * num + *s - '!';
232 			}
233 			posting.fcnoffset = num;
234 		} else {
235 			posting.fcnoffset = 0;
236 		}
237 		*postptr++ = posting;
238 #if DEBUG
239 		(void) printf("%ld %ld %ld %ld\n", posting.fileindex,
240 		    posting.fcnoffset, posting.lineoffset, posting.type);
241 		(void) fflush(stdout);
242 #endif
243 	}
244 	if (!invnewterm()) {
245 		return (0);
246 	}
247 	/* now clean up final block  */
248 	logicalblk.invblk[0] = numinvitems;
249 	/* loops pointer around to start */
250 	logicalblk.invblk[1] = 0;
251 	logicalblk.invblk[2] = numlogblk - 1;
252 	if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
253 		goto cannotwrite;
254 	}
255 	numlogblk++;
256 	/* write out block to save space. what in it doesn't matter */
257 	if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
258 		goto cannotwrite;
259 	}
260 	/* finish up the super finger */
261 	*SUPINT = numlogblk;
262 	/* add to the offsets the size of the offset pointers */
263 	intptr = (SUPINT + 1);
264 	i = (char *)supint - (char *)SUPINT;
265 	while (intptr < supint)
266 		*intptr++ += i;
267 	/* write out the offsets (1 for the N at start) and the super finger */
268 	if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
269 	    outfile) == 0 ||
270 	    fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
271 		goto cannotwrite;
272 	}
273 	/* save the size for reference later */
274 	nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
275 	    (supfing - SUPFING);
276 	/*
277 	 * make sure the file ends at a logical block boundary.  This is
278 	 * necessary for invinsert to correctly create extended blocks
279 	 */
280 	i = nextsupfing % BLOCKSIZE;
281 	/* write out junk to fill log blk */
282 	if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
283 	    fflush(outfile) == EOF) {
284 		/* rewind doesn't check for write failure */
285 		goto cannotwrite;
286 	}
287 	/* write the control area */
288 	rewind(outfile);
289 	param.version = VERSION;
290 	param.filestat = 0;
291 	param.sizeblk = BLOCKSIZE;
292 	param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
293 	param.supsize = nextsupfing;
294 	param.cntlsize = BUFSIZ;
295 	param.share = 0;
296 	if (fwrite((char *)&param, sizeof (param), 1, outfile) == 0) {
297 		goto cannotwrite;
298 	}
299 	for (i = 0; i < 10; i++)	/* for future use */
300 		if (fwrite((char *)&zerolong, sizeof (zerolong),
301 		    1, outfile) == 0) {
302 			goto cannotwrite;
303 		}
304 
305 	/* make first block loop backwards to last block */
306 	if (fflush(outfile) == EOF) {
307 		/* fseek doesn't check for write failure */
308 		goto cannotwrite;
309 	}
310 	/* get to second word first block */
311 	(void) fseek(outfile, (long)BUFSIZ + 8, 0);
312 	tlong = numlogblk - 1;
313 	if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
314 	    fclose(outfile) == EOF) {
315 cannotwrite:
316 		invcannotwrite(invname);
317 		return (0);
318 	}
319 	if (fclose(fpost) == EOF) {
320 		invcannotwrite(postingfile);
321 		return (0);
322 	}
323 	--totterm;	/* don't count null term */
324 #if STATS
325 	(void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
326 	    "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
327 	if (showzipf) {
328 		(void) printf(
329 		    "\n*************   ZIPF curve  ****************\n");
330 		for (j = ZIPFSIZE; j > 1; j--)
331 			if (zipf[j])
332 				break;
333 		for (i = 1; i < j; ++i) {
334 			(void) printf("%3d -%6d ", i, zipf[i]);
335 			if (i % 6 == 0) (void) putchar('\n');
336 		}
337 		(void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
338 	}
339 #endif
340 	/* free all malloc'd memory */
341 	free(POST);
342 	free(SUPFING);
343 	free(SUPINT);
344 	return (totterm);
345 }
346 
347 /* add a term to the data base */
348 
349 static int
invnewterm(void)350 invnewterm(void)
351 {
352 	int	backupflag, i, j, maxback, holditems, gooditems, howfar;
353 	int	len, numwilluse, wdlen;
354 	char	*tptr, *tptr2, *tptr3;
355 	union {
356 		unsigned long	packword[2];
357 		ENTRY		e;
358 	} iteminfo;
359 
360 	totterm++;
361 #if STATS
362 	/* keep zipfian info on the distribution */
363 	if (numpost <= ZIPFSIZE)
364 		zipf[numpost]++;
365 	else
366 		zipf[0]++;
367 #endif
368 	len = strlen(thisterm);
369 	wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
370 	numwilluse = (wdlen + 3) * sizeof (long);
371 	/* new block if at least 1 item in block */
372 	if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
373 		/* set up new block */
374 		if (supfing + 500 > SUPFING + supersize) {
375 			i = supfing - SUPFING;
376 			supersize += 20000;
377 			if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
378 				invcannotalloc(supersize);
379 				return (0);
380 			}
381 			supfing = i + SUPFING;
382 #if DEBUG
383 			(void) printf("reallocated superfinger space to %d, "
384 			    "totpost=%ld\n", supersize, totpost);
385 #endif
386 		}
387 		/* check that room for the offset as well */
388 		if ((numlogblk + 10) > supintsize) {
389 			i = supint - SUPINT;
390 			supintsize += SUPERINC;
391 			if ((SUPINT = realloc((char *)SUPINT,
392 			    supintsize * sizeof (long))) == NULL) {
393 				invcannotalloc(supintsize * sizeof (long));
394 				return (0);
395 			}
396 			supint = i + SUPINT;
397 #if DEBUG
398 			(void) printf("reallocated superfinger offset to %d, "
399 			    "totpost = %ld\n", supintsize * sizeof (long),
400 			    totpost);
401 #endif
402 		}
403 		/* See if backup is efficatious  */
404 		backupflag = 0;
405 		maxback = strlen(thisterm) / 10;
406 		holditems = numinvitems;
407 		if (maxback > numinvitems)
408 			maxback = numinvitems - 2;
409 		howfar = 0;
410 		while (--maxback > 0) {
411 			howfar++;
412 			iteminfo.packword[0] =
413 			    logicalblk.invblk[--holditems * 2 +
414 			    (sizeof (long) - 1)];
415 			if ((i = iteminfo.e.size / 10) < maxback) {
416 				maxback = i;
417 				backupflag = howfar;
418 				gooditems = holditems;
419 				tptr2 = logicalblk.chrblk + iteminfo.e.offset;
420 			}
421 		}
422 		/* see if backup will occur  */
423 		if (backupflag) {
424 			numinvitems = gooditems;
425 		}
426 		logicalblk.invblk[0] = numinvitems;
427 		/* set forward pointer pointing to next */
428 		logicalblk.invblk[1] = numlogblk + 1;
429 		/* set back pointer to last block */
430 		logicalblk.invblk[2] = numlogblk - 1;
431 		if (fwrite((char *)logicalblk.chrblk, 1,
432 		    BLOCKSIZE, outfile) == 0) {
433 			invcannotwrite(indexfile);
434 			return (0);
435 		}
436 		amtused = 16;
437 		numlogblk++;
438 		/* check if had to back up, if so do it */
439 		if (backupflag) {
440 			/* find out where the end of the new block is */
441 			iteminfo.packword[0] =
442 			    logicalblk.invblk[numinvitems * 2 + 1];
443 			tptr3 = logicalblk.chrblk + iteminfo.e.offset;
444 			/* move the index for this block */
445 			for (i = 3; i <= (backupflag * 2 + 2); i++) {
446 				logicalblk.invblk[i] =
447 				    logicalblk.invblk[numinvitems * 2+i];
448 			}
449 			/* move the word into the super index */
450 			iteminfo.packword[0] = logicalblk.invblk[3];
451 			iteminfo.packword[1] = logicalblk.invblk[4];
452 			tptr2 = logicalblk.chrblk + iteminfo.e.offset;
453 			(void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
454 			*(supfing + iteminfo.e.size) = '\0';
455 #if DEBUG
456 			(void) printf("backup %d at term=%s to term=%s\n",
457 			    backupflag, thisterm, supfing);
458 #endif
459 			*supint++ = nextsupfing;
460 			nextsupfing += strlen(supfing) + 1;
461 			supfing += strlen(supfing) + 1;
462 			/* now fix up the logical block */
463 			tptr = logicalblk.chrblk + lastinblk;
464 			lastinblk = BLOCKSIZE;
465 			tptr2 = logicalblk.chrblk + lastinblk;
466 			j = tptr3 - tptr;
467 			while (tptr3 > tptr)
468 				*--tptr2 = *--tptr3;
469 			lastinblk -= j;
470 			amtused += (8 * backupflag + j);
471 			for (i = 3; i < (backupflag * 2 + 2); i += 2) {
472 				iteminfo.packword[0] = logicalblk.invblk[i];
473 				iteminfo.e.offset += (tptr2 - tptr3);
474 				logicalblk.invblk[i] = iteminfo.packword[0];
475 			}
476 			numinvitems = backupflag;
477 		} else { /* no backup needed */
478 			numinvitems = 0;
479 			lastinblk = BLOCKSIZE;
480 			/* add new term to superindex */
481 			(void) strcpy(supfing, thisterm);
482 			supfing += strlen(thisterm) + 1;
483 			*supint++ = nextsupfing;
484 			nextsupfing += strlen(thisterm) + 1;
485 		}
486 	}
487 	lastinblk -= (numwilluse - 8);
488 	iteminfo.e.offset = lastinblk;
489 	iteminfo.e.size = (char)len;
490 	iteminfo.e.space = 0;
491 	iteminfo.e.post = numpost;
492 	(void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
493 	amtused += numwilluse;
494 	logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
495 	if ((i = postptr - POST) > 0) {
496 		if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
497 			invcannotwrite(postingfile);
498 			return (0);
499 		}
500 		nextpost += i * sizeof (POSTING);
501 	}
502 	logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
503 	logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
504 	return (1);
505 }
506 
507 static void
swap_ints(void * p,size_t sz)508 swap_ints(void *p, size_t sz)
509 {
510 	uint32_t *s;
511 	uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
512 
513 	for (s = p; s < e; s++)
514 		*s = BSWAP_32(*s);
515 }
516 
517 static void
write_param(INVCONTROL * invcntl)518 write_param(INVCONTROL *invcntl)
519 {
520 	if (invcntl->swap)
521 		swap_ints(&invcntl->param, sizeof (invcntl->param));
522 
523 	rewind(invcntl->invfile);
524 	(void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
525 	    invcntl->invfile);
526 
527 	if (invcntl->swap)
528 		swap_ints(&invcntl->param, sizeof (invcntl->param));
529 }
530 
531 static void
read_superfinger(INVCONTROL * invcntl)532 read_superfinger(INVCONTROL *invcntl)
533 {
534 	size_t count;
535 
536 	(void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
537 	(void) fread(invcntl->iindex, (int)invcntl->param.supsize,
538 	    1, invcntl->invfile);
539 
540 	if (invcntl->swap) {
541 		/*
542 		 * The superfinger consists of a count, followed by
543 		 * count offsets, followed by a string table (which
544 		 * the offsets reference).
545 		 *
546 		 * We need to swap the count and the offsets.
547 		 */
548 		count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
549 		swap_ints(invcntl->iindex, count * sizeof (unsigned long));
550 	}
551 }
552 
553 static void
read_logblock(INVCONTROL * invcntl,int block)554 read_logblock(INVCONTROL *invcntl, int block)
555 {
556 	/* note always fetch it if the file is busy */
557 	if ((block != invcntl->numblk) ||
558 	    (invcntl->param.filestat >= INVBUSY)) {
559 		(void) fseek(invcntl->invfile,
560 		    (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
561 		    SEEK_SET);
562 		invcntl->numblk = block;
563 		(void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
564 		    1, invcntl->invfile);
565 
566 		if (invcntl->swap) {
567 			size_t count;
568 			ENTRY *ecur, *eend;
569 			uint32_t *postptr;
570 
571 			/*
572 			 * A logblock consists of a count, a next block id,
573 			 * and a previous block id, followed by count
574 			 * ENTRYs, followed by alternating strings and
575 			 * offsets.
576 			 */
577 			swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
578 
579 			count = *(uint32_t *)invcntl->logblk;
580 
581 			ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
582 			eend = ecur + count;
583 
584 			for (; ecur < eend; ecur++) {
585 				ecur->offset = BSWAP_16(ecur->offset);
586 				ecur->post = BSWAP_32(ecur->post);
587 
588 				/*
589 				 * After the string is the posting offset.
590 				 */
591 				postptr = (uint32_t *)(invcntl->logblk +
592 				    ecur->offset +
593 				    P2ROUNDUP(ecur->size, sizeof (long)));
594 
595 				*postptr = BSWAP_32(*postptr);
596 			}
597 		}
598 	}
599 }
600 
601 void
read_next_posting(INVCONTROL * invcntl,POSTING * posting)602 read_next_posting(INVCONTROL *invcntl, POSTING *posting)
603 {
604 	(void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
605 	if (invcntl->swap) {
606 		posting->lineoffset = BSWAP_32(posting->lineoffset);
607 		posting->fcnoffset = BSWAP_32(posting->fcnoffset);
608 		/*
609 		 * fileindex is a 24-bit field, so shift it before swapping
610 		 */
611 		posting->fileindex = BSWAP_32(posting->fileindex << 8);
612 	}
613 }
614 
615 int
invopen(INVCONTROL * invcntl,char * invname,char * invpost,int stat)616 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
617 {
618 	int	read_index;
619 
620 	if ((invcntl->invfile = vpfopen(invname,
621 	    ((stat == 0) ? FREAD : FREADP))) == NULL) {
622 		invcannotopen(invname);
623 		return (-1);
624 	}
625 	if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
626 	    invcntl->invfile) == 0) {
627 		(void) fprintf(stderr, "%s: empty inverted file\n", argv0);
628 		goto closeinv;
629 	}
630 	if (invcntl->param.version != VERSION &&
631 	    BSWAP_32(invcntl->param.version) != VERSION) {
632 		(void) fprintf(stderr,
633 		    "%s: cannot read old index format; use -U option to "
634 		    "force database to rebuild\n", argv0);
635 		goto closeinv;
636 	}
637 	invcntl->swap = (invcntl->param.version != VERSION);
638 	if (invcntl->swap)
639 		swap_ints(&invcntl->param, sizeof (invcntl->param));
640 
641 	if (stat == 0 && invcntl->param.filestat == INVALONE) {
642 		(void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
643 		goto closeinv;
644 	}
645 	if ((invcntl->postfile = vpfopen(invpost,
646 	    ((stat == 0) ? FREAD : FREADP))) == NULL) {
647 		invcannotopen(invpost);
648 		goto closeinv;
649 	}
650 	/* allocate core for a logical block  */
651 	if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
652 		invcannotalloc((unsigned)invcntl->param.sizeblk);
653 		goto closeboth;
654 	}
655 	/* allocate for and read in superfinger  */
656 	read_index = 1;
657 	invcntl->iindex = NULL;
658 #if SHARE
659 	if (invcntl->param.share == 1) {
660 		key_t ftok(), shm_key;
661 		struct shmid_ds shm_buf;
662 		char	*shmat();
663 		int	shm_id;
664 
665 		/* see if the shared segment exists */
666 		shm_key = ftok(invname, 2);
667 		shm_id = shmget(shm_key, 0, 0);
668 		/*
669 		 * Failure simply means (hopefully) that segment doesn't
670 		 * exist
671 		 */
672 		if (shm_id == -1) {
673 			/*
674 			 * Have to give general write permission due to AMdahl
675 			 * not having protected segments
676 			 */
677 			shm_id = shmget(shm_key,
678 			    invcntl->param.supsize + sizeof (long),
679 			    IPC_CREAT | 0666);
680 			if (shm_id == -1)
681 				perror("Could not create shared "
682 				    "memory segment");
683 		} else
684 			read_index = 0;
685 
686 		if (shm_id != -1) {
687 			invcntl->iindex = shmat(shm_id, 0,
688 			    ((read_index) ? 0 : SHM_RDONLY));
689 			if (invcntl->iindex == (char *)ERR) {
690 				(void) fprintf(stderr,
691 				    "%s: shared memory link failed\n", argv0);
692 				invcntl->iindex = NULL;
693 				read_index = 1;
694 			}
695 		}
696 	}
697 #endif
698 	if (invcntl->iindex == NULL)
699 		invcntl->iindex = malloc(invcntl->param.supsize + 16);
700 	if (invcntl->iindex == NULL) {
701 		invcannotalloc((unsigned)invcntl->param.supsize);
702 		free(invcntl->logblk);
703 		goto closeboth;
704 	}
705 	if (read_index) {
706 		read_superfinger(invcntl);
707 	}
708 	invcntl->numblk = -1;
709 	if (boolready() == -1) {
710 	closeboth:
711 		(void) fclose(invcntl->postfile);
712 	closeinv:
713 		(void) fclose(invcntl->invfile);
714 		return (-1);
715 	}
716 	/* write back out the control block if anything changed */
717 	invcntl->param.filestat = stat;
718 	if (stat > invcntl->param.filestat)
719 		write_param(invcntl);
720 	return (1);
721 }
722 
723 /* invclose must be called to wrap things up and deallocate core */
724 void
invclose(INVCONTROL * invcntl)725 invclose(INVCONTROL *invcntl)
726 {
727 	/* write out the control block in case anything changed */
728 	if (invcntl->param.filestat > 0) {
729 		invcntl->param.filestat = 0;
730 		write_param(invcntl);
731 	}
732 	(void) fclose(invcntl->invfile);
733 	(void) fclose(invcntl->postfile);
734 #if SHARE
735 	if (invcntl->param.share > 0) {
736 		shmdt(invcntl->iindex);
737 		invcntl->iindex = NULL;
738 	}
739 #endif
740 	if (invcntl->iindex != NULL)
741 		free(invcntl->iindex);
742 	free(invcntl->logblk);
743 }
744 
745 /* invstep steps the inverted file forward one item */
746 void
invstep(INVCONTROL * invcntl)747 invstep(INVCONTROL *invcntl)
748 {
749 	if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
750 		invcntl->keypnt++;
751 		return;
752 	}
753 
754 	/* move forward a block else wrap */
755 	read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
756 
757 	invcntl->keypnt = 0;
758 }
759 
760 /* invforward moves forward one term in the inverted file  */
761 int
invforward(INVCONTROL * invcntl)762 invforward(INVCONTROL *invcntl)
763 {
764 	invstep(invcntl);
765 	/* skip things with 0 postings */
766 	while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
767 		invstep(invcntl);
768 	}
769 	/* Check for having wrapped - reached start of inverted file! */
770 	if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
771 		return (0);
772 	return (1);
773 }
774 
775 /*  invterm gets the present term from the present logical block  */
776 int
invterm(INVCONTROL * invcntl,char * term)777 invterm(INVCONTROL *invcntl, char *term)
778 {
779 	ENTRY * entryptr;
780 
781 	entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
782 	(void) strncpy(term, entryptr->offset + invcntl->logblk,
783 	    (int)entryptr->size);
784 	*(term + entryptr->size) = '\0';
785 	return (entryptr->post);
786 }
787 
788 /* invfind searches for an individual item in the inverted file */
789 long
invfind(INVCONTROL * invcntl,char * searchterm)790 invfind(INVCONTROL *invcntl, char *searchterm)
791 {
792 	int	imid, ilow, ihigh;
793 	long	num;
794 	int	i;
795 	unsigned long	*intptr, *intptr2;
796 	ENTRY *entryptr;
797 
798 	/* make sure it is initialized via invready  */
799 	if (invcntl->invfile == 0)
800 		return (-1L);
801 
802 	/* now search for the appropriate finger block */
803 	intptr = (unsigned long *)invcntl->iindex;
804 
805 	ilow = 0;
806 	ihigh = *intptr++ - 1;
807 	while (ilow <= ihigh) {
808 		imid = (ilow + ihigh) / 2;
809 		intptr2 = intptr + imid;
810 		i = strcmp(searchterm, (invcntl->iindex + *intptr2));
811 		if (i < 0)
812 			ihigh = imid - 1;
813 		else if (i > 0)
814 			ilow = ++imid;
815 		else {
816 			ilow = imid + 1;
817 			break;
818 		}
819 	}
820 	/* be careful about case where searchterm is after last in this block */
821 	imid = (ilow) ? ilow - 1 : 0;
822 
823 	/* fetch the appropriate logical block if not in core  */
824 	read_logblock(invcntl, imid);
825 
826 srch_ext:
827 	/* now find the term in this block. tricky this  */
828 	intptr = (unsigned long *)invcntl->logblk;
829 
830 	ilow = 0;
831 	ihigh = *intptr - 1;
832 	intptr += 3;
833 	num = 0;
834 	while (ilow <= ihigh) {
835 		imid = (ilow + ihigh) / 2;
836 		entryptr = (ENTRY *)intptr + imid;
837 		i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
838 		    (int)entryptr->size);
839 		if (i == 0)
840 			i = strlen(searchterm) - entryptr->size;
841 		if (i < 0)
842 			ihigh = imid - 1;
843 		else if (i > 0)
844 			ilow = ++imid;
845 		else {
846 			num = entryptr->post;
847 			break;
848 		}
849 	}
850 	/* be careful about case where searchterm is after last in this block */
851 	if (imid >= *(long *)invcntl->logblk) {
852 		invcntl->keypnt = *(long *)invcntl->logblk;
853 		invstep(invcntl);
854 		/* note if this happens the term could be in extended block */
855 		if (invcntl->param.startbyte <
856 		    invcntl->numblk * invcntl->param.sizeblk)
857 			goto srch_ext;
858 	} else
859 		invcntl->keypnt = imid;
860 	return (num);
861 }
862 
863 #if DEBUG
864 
865 /* invdump dumps the block the term parameter is in */
866 void
invdump(INVCONTROL * invcntl,char * term)867 invdump(INVCONTROL *invcntl, char *term)
868 {
869 	long	i, j, n, *longptr;
870 	ENTRY * entryptr;
871 	char	temp[512], *ptr;
872 
873 	/* dump superindex if term is "-"  */
874 	if (*term == '-') {
875 		j = atoi(term + 1);
876 		longptr = (long *)invcntl->iindex;
877 		n = *longptr++;
878 		(void) printf("Superindex dump, num blocks=%ld\n", n);
879 		longptr += j;
880 		while ((longptr <= ((long *)invcntl->iindex) + n) &&
881 		    invbreak == 0) {
882 			(void) printf("%2ld  %6ld %s\n", j++, *longptr,
883 			    invcntl->iindex + *longptr);
884 			longptr++;
885 		}
886 		return;
887 	} else if (*term == '#') {
888 		j = atoi(term + 1);
889 		/* fetch the appropriate logical block */
890 		read_logblock(invcntl, j);
891 	} else
892 		i = abs((int)invfind(invcntl, term));
893 	longptr = (long *)invcntl->logblk;
894 	n = *longptr++;
895 	(void) printf("Entry term to invdump=%s, postings=%ld, "
896 	    "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
897 	    *(longptr + 1));
898 	entryptr = (ENTRY *)(invcntl->logblk + 12);
899 	(void) printf("%ld terms in this block, block=%ld\n", n,
900 	    invcntl->numblk);
901 	(void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
902 	for (j = 0; j < n && invbreak == 0; j++) {
903 		ptr = invcntl->logblk + entryptr->offset;
904 		(void) strncpy(temp, ptr, (int)entryptr->size);
905 		temp[entryptr->size] = '\0';
906 		ptr += (sizeof (long) *
907 		    (long)((entryptr->size +
908 		    (sizeof (long) - 1)) / sizeof (long)));
909 		(void) printf("%2ld  %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
910 		    entryptr->post, entryptr->size, entryptr->offset,
911 		    entryptr->space, *(long *)ptr);
912 		entryptr++;
913 	}
914 }
915 #endif
916 
917 static int
boolready(void)918 boolready(void)
919 {
920 	numitems = 0;
921 	if (item1 != NULL)
922 		free(item1);
923 	setsize1 = SETINC;
924 	if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
925 		invcannotalloc(SETINC);
926 		return (-1);
927 	}
928 	if (item2 != NULL)
929 		free(item2);
930 	setsize2 = SETINC;
931 	if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
932 		invcannotalloc(SETINC);
933 		return (-1);
934 	}
935 	item = item1;
936 	enditem = item;
937 	return (0);
938 }
939 
940 void
boolclear(void)941 boolclear(void)
942 {
943 	numitems = 0;
944 	item = item1;
945 	enditem = item;
946 }
947 
948 POSTING *
boolfile(INVCONTROL * invcntl,long * num,int bool)949 boolfile(INVCONTROL *invcntl, long *num, int bool)
950 {
951 	ENTRY	*entryptr;
952 	FILE	*file;
953 	char	*ptr;
954 	unsigned long	*ptr2;
955 	POSTING	*newitem;
956 	POSTING	posting;
957 	unsigned u;
958 	POSTING *newsetp, *set1p;
959 	long	newsetc, set1c, set2c;
960 
961 	entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
962 	ptr = invcntl->logblk + entryptr->offset;
963 	ptr2 = ((unsigned long *)ptr) +
964 	    (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
965 	*num = entryptr->post;
966 	switch (bool) {
967 	case OR:
968 	case NOT:
969 		if (*num == 0) {
970 			*num = numitems;
971 			return (item);
972 		}
973 	}
974 	/* make room for the new set */
975 	u = 0;
976 	switch (bool) {
977 	case AND:
978 	case NOT:
979 		newsetp = set1p = item;
980 		break;
981 
982 	case OR:
983 		u = enditem - item;
984 		/* FALLTHROUGH */
985 	case REVERSENOT:
986 		u += *num;
987 		if (item == item2) {
988 			if (u > setsize1) {
989 				u += SETINC;
990 				if ((item1 = (POSTING *) realloc(item1,
991 				    u * sizeof (POSTING))) == NULL) {
992 					goto cannotalloc;
993 				}
994 				setsize1 = u;
995 			}
996 			newitem = item1;
997 		} else {
998 			if (u > setsize2) {
999 				u += SETINC;
1000 				if ((item2 = (POSTING *)realloc(item2,
1001 				    u * sizeof (POSTING))) == NULL) {
1002 				cannotalloc:
1003 					invcannotalloc(u * sizeof (POSTING));
1004 					(void) boolready();
1005 					*num = -1;
1006 					return (NULL);
1007 				}
1008 				setsize2 = u;
1009 			}
1010 			newitem = item2;
1011 		}
1012 		set1p = item;
1013 		newsetp = newitem;
1014 	}
1015 	file = invcntl->postfile;
1016 	(void) fseek(file, (long)*ptr2, SEEK_SET);
1017 	read_next_posting(invcntl, &posting);
1018 	newsetc = 0;
1019 	switch (bool) {
1020 	case OR:
1021 		/* while something in both sets */
1022 		set1p = item;
1023 		newsetp = newitem;
1024 		for (set1c = 0, set2c = 0;
1025 		    set1c < numitems && set2c < *num; newsetc++) {
1026 			if (set1p->lineoffset < posting.lineoffset) {
1027 				*newsetp++ = *set1p++;
1028 				set1c++;
1029 			} else if (set1p->lineoffset > posting.lineoffset) {
1030 				*newsetp++ = posting;
1031 				read_next_posting(invcntl, &posting);
1032 				set2c++;
1033 			} else if (set1p->type < posting.type) {
1034 				*newsetp++ = *set1p++;
1035 				set1c++;
1036 			} else if (set1p->type > posting.type) {
1037 				*newsetp++ = posting;
1038 				read_next_posting(invcntl, &posting);
1039 				set2c++;
1040 			} else {	/* identical postings */
1041 				*newsetp++ = *set1p++;
1042 				set1c++;
1043 				read_next_posting(invcntl, &posting);
1044 				set2c++;
1045 			}
1046 		}
1047 		/* find out what ran out and move the rest in */
1048 		if (set1c < numitems) {
1049 			newsetc += numitems - set1c;
1050 			while (set1c++ < numitems) {
1051 				*newsetp++ = *set1p++;
1052 			}
1053 		} else {
1054 			while (set2c++ < *num) {
1055 				*newsetp++ = posting;
1056 				newsetc++;
1057 				read_next_posting(invcntl, &posting);
1058 			}
1059 		}
1060 		item = newitem;
1061 		break; /* end of OR */
1062 #if 0
1063 	case AND:
1064 		set1c = 0;
1065 		set2c = 0;
1066 		while (set1c < numitems && set2c < *num) {
1067 			if (set1p->lineoffset < posting.lineoffset) {
1068 				set1p++;
1069 				set1c++;
1070 			} else if (set1p->lineoffset > posting.lineoffset) {
1071 				read_next_posting(invcntl, &posting);
1072 				set2c++;
1073 			} else if (set1p->type < posting.type)  {
1074 				*set1p++;
1075 				set1c++;
1076 			} else if (set1p->type > posting.type) {
1077 				read_next_posting(invcntl, &posting);
1078 				set2c++;
1079 			} else {	/* identical postings */
1080 				*newsetp++ = *set1p++;
1081 				newsetc++;
1082 				set1c++;
1083 				read_next_posting(invcntl, &posting);
1084 				set2c++;
1085 			}
1086 		}
1087 		break; /* end of AND */
1088 
1089 	case NOT:
1090 		set1c = 0;
1091 		set2c = 0;
1092 		while (set1c < numitems && set2c < *num) {
1093 			if (set1p->lineoffset < posting.lineoffset) {
1094 				*newsetp++ = *set1p++;
1095 				newsetc++;
1096 				set1c++;
1097 			} else if (set1p->lineoffset > posting.lineoffset) {
1098 				read_next_posting(invcntl, &posting);
1099 				set2c++;
1100 			} else if (set1p->type < posting.type) {
1101 				*newsetp++ = *set1p++;
1102 				newsetc++;
1103 				set1c++;
1104 			} else if (set1p->type > posting.type) {
1105 				read_next_posting(invcntl, &posting);
1106 				set2c++;
1107 			} else {	/* identical postings */
1108 				set1c++;
1109 				set1p++;
1110 				read_next_posting(invcntl, &posting);
1111 				set2c++;
1112 			}
1113 		}
1114 		newsetc += numitems - set1c;
1115 		while (set1c++ < numitems) {
1116 			*newsetp++ = *set1p++;
1117 		}
1118 		break; /* end of NOT */
1119 
1120 	case REVERSENOT:  /* core NOT incoming set */
1121 		set1c = 0;
1122 		set2c = 0;
1123 		while (set1c < numitems && set2c < *num) {
1124 			if (set1p->lineoffset < posting.lineoffset) {
1125 				set1p++;
1126 				set1c++;
1127 			} else if (set1p->lineoffset > posting.lineoffset) {
1128 				*newsetp++ = posting;
1129 				read_next_posting(invcntl, &posting);
1130 				set2c++;
1131 			} else if (set1p->type < posting.type) {
1132 				set1p++;
1133 				set1c++;
1134 			} else if (set1p->type > posting.type) {
1135 				*newsetp++ = posting;
1136 				read_next_posting(invcntl, &posting);
1137 				set2c++;
1138 			} else {	/* identical postings */
1139 				set1c++;
1140 				set1p++;
1141 				read_next_posting(invcntl, &posting);
1142 				set2c++;
1143 			}
1144 		}
1145 		while (set2c++ < *num) {
1146 			*newsetp++ = posting;
1147 			newsetc++;
1148 			read_next_posting(invcntl, &posting);
1149 		}
1150 		item = newitem;
1151 		break; /* end of REVERSENOT  */
1152 #endif
1153 	}
1154 	numitems = newsetc;
1155 	*num = newsetc;
1156 	enditem = (POSTING *)newsetp;
1157 	return ((POSTING *)item);
1158 }
1159 
1160 #if 0
1161 POSTING *
1162 boolsave(int clear)
1163 {
1164 	int	i;
1165 	POSTING	*ptr;
1166 	POSTING	*oldstuff, *newstuff;
1167 
1168 	if (numitems == 0) {
1169 		if (clear)
1170 			boolclear();
1171 		return (NULL);
1172 	}
1173 	/*
1174 	 * if clear then give them what we have and use (void)
1175 	 * boolready to realloc
1176 	 */
1177 	if (clear) {
1178 		ptr = item;
1179 		/* free up the space we didn't give them */
1180 		if (item == item1)
1181 			item1 = NULL;
1182 		else
1183 			item2 = NULL;
1184 		(void) boolready();
1185 		return (ptr);
1186 	}
1187 	i = (enditem - item) * sizeof (POSTING) + 100;
1188 	if ((ptr = (POSTING *)malloc(i))r == NULL) {
1189 		invcannotalloc(i);
1190 		return (ptr);
1191 	}
1192 	/* move present set into place  */
1193 	oldstuff = item;
1194 	newstuff = ptr;
1195 	while (oldstuff < enditem)
1196 		*newstuff++ = *oldstuff++;
1197 	return (ptr);
1198 }
1199 #endif
1200 
1201 static void
invcannotalloc(size_t n)1202 invcannotalloc(size_t n)
1203 {
1204 	(void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1205 }
1206 
1207 static void
invcannotopen(char * file)1208 invcannotopen(char *file)
1209 {
1210 	(void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1211 }
1212 
1213 static void
invcannotwrite(char * file)1214 invcannotwrite(char *file)
1215 {
1216 	(void) perror(argv0);	/* must be first to preserve errno */
1217 	(void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1218 }
1219