/* $Id: read.c,v 1.220 2021/06/27 17:57:54 schwarze Exp $ */
/*
 * Copyright (c) 2010-2020 Ingo Schwarze <schwarze@openbsd.org>
 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Top-level functions of the mandoc(3) parser:
 * Parser and input encoding selection, decompression,
 * handling of input bytes, characters, lines, and files,
 * handling of roff(7) loops and file inclusion,
 * and steering of the various parsers.
 */
2595c635efSGarrett D'Amore #include "config.h"
2695c635efSGarrett D'Amore
27260e9a87SYuri Pankov #include <sys/types.h>
28260e9a87SYuri Pankov #include <sys/mman.h>
29260e9a87SYuri Pankov #include <sys/stat.h>
3095c635efSGarrett D'Amore
3195c635efSGarrett D'Amore #include <assert.h>
3295c635efSGarrett D'Amore #include <ctype.h>
33260e9a87SYuri Pankov #include <errno.h>
3495c635efSGarrett D'Amore #include <fcntl.h>
3595c635efSGarrett D'Amore #include <stdarg.h>
3695c635efSGarrett D'Amore #include <stdio.h>
3795c635efSGarrett D'Amore #include <stdlib.h>
3895c635efSGarrett D'Amore #include <string.h>
3995c635efSGarrett D'Amore #include <unistd.h>
40371584c2SYuri Pankov #include <zlib.h>
4195c635efSGarrett D'Amore
42260e9a87SYuri Pankov #include "mandoc_aux.h"
43371584c2SYuri Pankov #include "mandoc.h"
44371584c2SYuri Pankov #include "roff.h"
4595c635efSGarrett D'Amore #include "mdoc.h"
4695c635efSGarrett D'Amore #include "man.h"
47cec8643bSMichal Nowak #include "mandoc_parse.h"
48371584c2SYuri Pankov #include "libmandoc.h"
49cec8643bSMichal Nowak #include "roff_int.h"
50*4d131170SRobert Mustacchi #include "tag.h"
5195c635efSGarrett D'Amore
/*
 * Upper bound on the number of ROFF_REPARSE requests that are
 * honoured while processing one top-level input line.
 */
#define	REPARSE_LIMIT	1000

/*
 * State of one parser instance, shared by all the
 * functions in this file.
 */
struct	mparse {
	struct roff	 *roff;		/* roff parser (!NULL) */
	struct roff_man	 *man;		/* man parser */
	struct buf	 *primary;	/* buffer currently being parsed */
	struct buf	 *secondary;	/* copy of top level input */
	struct buf	 *loop;		/* open .while request line */
	const char	 *os_s;		/* default operating system */
	int		  options;	/* parser options */
	int		  gzip;		/* current input file is gzipped */
	int		  filenc;	/* encoding of the current file */
	int		  reparse_count; /* finite interp. stack */
	int		  line;		/* line number in the file */
};
6795c635efSGarrett D'Amore
/* Forward declarations of the file-local helpers. */
static	void	  choose_parser(struct mparse *curp);
static	void	  free_buf_list(struct buf *buf);
static	void	  resize_buf(struct buf *buf, size_t initial);
static	int	  mparse_buf_r(struct mparse *curp, struct buf blk,
			size_t i, int start);
static	int	  read_whole_file(struct mparse *curp, int fd,
			struct buf *fb, int *with_mmap);
static	void	  mparse_end(struct mparse *curp);
7495c635efSGarrett D'Amore
75260e9a87SYuri Pankov
7695c635efSGarrett D'Amore static void
resize_buf(struct buf * buf,size_t initial)7795c635efSGarrett D'Amore resize_buf(struct buf *buf, size_t initial)
7895c635efSGarrett D'Amore {
7995c635efSGarrett D'Amore
8095c635efSGarrett D'Amore buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
8195c635efSGarrett D'Amore buf->buf = mandoc_realloc(buf->buf, buf->sz);
8295c635efSGarrett D'Amore }
8395c635efSGarrett D'Amore
84cec8643bSMichal Nowak static void
free_buf_list(struct buf * buf)85cec8643bSMichal Nowak free_buf_list(struct buf *buf)
86cec8643bSMichal Nowak {
87cec8643bSMichal Nowak struct buf *tmp;
88cec8643bSMichal Nowak
89cec8643bSMichal Nowak while (buf != NULL) {
90cec8643bSMichal Nowak tmp = buf;
91cec8643bSMichal Nowak buf = tmp->next;
92cec8643bSMichal Nowak free(tmp->buf);
93cec8643bSMichal Nowak free(tmp);
94cec8643bSMichal Nowak }
95cec8643bSMichal Nowak }
96cec8643bSMichal Nowak
9795c635efSGarrett D'Amore static void
choose_parser(struct mparse * curp)98260e9a87SYuri Pankov choose_parser(struct mparse *curp)
9995c635efSGarrett D'Amore {
100260e9a87SYuri Pankov char *cp, *ep;
101260e9a87SYuri Pankov int format;
10295c635efSGarrett D'Amore
10395c635efSGarrett D'Amore /*
104260e9a87SYuri Pankov * If neither command line arguments -mdoc or -man select
105260e9a87SYuri Pankov * a parser nor the roff parser found a .Dd or .TH macro
106260e9a87SYuri Pankov * yet, look ahead in the main input buffer.
10795c635efSGarrett D'Amore */
10895c635efSGarrett D'Amore
109260e9a87SYuri Pankov if ((format = roff_getformat(curp->roff)) == 0) {
110260e9a87SYuri Pankov cp = curp->primary->buf;
111260e9a87SYuri Pankov ep = cp + curp->primary->sz;
112260e9a87SYuri Pankov while (cp < ep) {
113260e9a87SYuri Pankov if (*cp == '.' || *cp == '\'') {
114260e9a87SYuri Pankov cp++;
115260e9a87SYuri Pankov if (cp[0] == 'D' && cp[1] == 'd') {
116260e9a87SYuri Pankov format = MPARSE_MDOC;
117260e9a87SYuri Pankov break;
118260e9a87SYuri Pankov }
119260e9a87SYuri Pankov if (cp[0] == 'T' && cp[1] == 'H') {
120260e9a87SYuri Pankov format = MPARSE_MAN;
121260e9a87SYuri Pankov break;
122260e9a87SYuri Pankov }
123260e9a87SYuri Pankov }
124260e9a87SYuri Pankov cp = memchr(cp, '\n', ep - cp);
125260e9a87SYuri Pankov if (cp == NULL)
12695c635efSGarrett D'Amore break;
127260e9a87SYuri Pankov cp++;
128260e9a87SYuri Pankov }
12995c635efSGarrett D'Amore }
13095c635efSGarrett D'Amore
131371584c2SYuri Pankov if (format == MPARSE_MDOC) {
132cec8643bSMichal Nowak curp->man->meta.macroset = MACROSET_MDOC;
133c66b8046SYuri Pankov if (curp->man->mdocmac == NULL)
134c66b8046SYuri Pankov curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
135371584c2SYuri Pankov } else {
136cec8643bSMichal Nowak curp->man->meta.macroset = MACROSET_MAN;
137c66b8046SYuri Pankov if (curp->man->manmac == NULL)
138c66b8046SYuri Pankov curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
139371584c2SYuri Pankov }
140cec8643bSMichal Nowak curp->man->meta.first->tok = TOKEN_NONE;
14195c635efSGarrett D'Amore }
14295c635efSGarrett D'Amore
14395c635efSGarrett D'Amore /*
144260e9a87SYuri Pankov * Main parse routine for a buffer.
145260e9a87SYuri Pankov * It assumes encoding and line numbering are already set up.
146260e9a87SYuri Pankov * It can recurse directly (for invocations of user-defined
147260e9a87SYuri Pankov * macros, inline equations, and input line traps)
148260e9a87SYuri Pankov * and indirectly (for .so file inclusion).
14995c635efSGarrett D'Amore */
150c66b8046SYuri Pankov static int
mparse_buf_r(struct mparse * curp,struct buf blk,size_t i,int start)151260e9a87SYuri Pankov mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
15295c635efSGarrett D'Amore {
15395c635efSGarrett D'Amore struct buf ln;
154cec8643bSMichal Nowak struct buf *firstln, *lastln, *thisln, *loop;
155260e9a87SYuri Pankov char *cp;
156260e9a87SYuri Pankov size_t pos; /* byte number in the ln buffer */
157*4d131170SRobert Mustacchi size_t spos; /* at the start of the current line parse */
158cec8643bSMichal Nowak int line_result, result;
159260e9a87SYuri Pankov int of;
16095c635efSGarrett D'Amore int lnn; /* line number in the real file */
161260e9a87SYuri Pankov int fd;
162cec8643bSMichal Nowak int inloop; /* Saw .while on this level. */
16395c635efSGarrett D'Amore unsigned char c;
16495c635efSGarrett D'Amore
165cec8643bSMichal Nowak ln.sz = 256;
166cec8643bSMichal Nowak ln.buf = mandoc_malloc(ln.sz);
167cec8643bSMichal Nowak ln.next = NULL;
168cec8643bSMichal Nowak firstln = lastln = loop = NULL;
169260e9a87SYuri Pankov lnn = curp->line;
170260e9a87SYuri Pankov pos = 0;
171cec8643bSMichal Nowak inloop = 0;
172cec8643bSMichal Nowak result = ROFF_CONT;
17395c635efSGarrett D'Amore
174cec8643bSMichal Nowak while (i < blk.sz && (blk.buf[i] != '\0' || pos != 0)) {
17595c635efSGarrett D'Amore if (start) {
17695c635efSGarrett D'Amore curp->line = lnn;
17795c635efSGarrett D'Amore curp->reparse_count = 0;
178260e9a87SYuri Pankov
179260e9a87SYuri Pankov if (lnn < 3 &&
180260e9a87SYuri Pankov curp->filenc & MPARSE_UTF8 &&
181260e9a87SYuri Pankov curp->filenc & MPARSE_LATIN1)
182260e9a87SYuri Pankov curp->filenc = preconv_cue(&blk, i);
18395c635efSGarrett D'Amore }
184*4d131170SRobert Mustacchi spos = pos;
18595c635efSGarrett D'Amore
186260e9a87SYuri Pankov while (i < blk.sz && (start || blk.buf[i] != '\0')) {
18795c635efSGarrett D'Amore
18895c635efSGarrett D'Amore /*
18995c635efSGarrett D'Amore * When finding an unescaped newline character,
19095c635efSGarrett D'Amore * leave the character loop to process the line.
19195c635efSGarrett D'Amore * Skip a preceding carriage return, if any.
19295c635efSGarrett D'Amore */
19395c635efSGarrett D'Amore
194260e9a87SYuri Pankov if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
19595c635efSGarrett D'Amore '\n' == blk.buf[i + 1])
19695c635efSGarrett D'Amore ++i;
19795c635efSGarrett D'Amore if ('\n' == blk.buf[i]) {
19895c635efSGarrett D'Amore ++i;
19995c635efSGarrett D'Amore ++lnn;
20095c635efSGarrett D'Amore break;
20195c635efSGarrett D'Amore }
20295c635efSGarrett D'Amore
203698f87a4SGarrett D'Amore /*
204260e9a87SYuri Pankov * Make sure we have space for the worst
205cec8643bSMichal Nowak * case of 12 bytes: "\\[u10ffff]\n\0"
206698f87a4SGarrett D'Amore */
207698f87a4SGarrett D'Amore
208cec8643bSMichal Nowak if (pos + 12 > ln.sz)
209698f87a4SGarrett D'Amore resize_buf(&ln, 256);
210698f87a4SGarrett D'Amore
211260e9a87SYuri Pankov /*
212260e9a87SYuri Pankov * Encode 8-bit input.
21395c635efSGarrett D'Amore */
21495c635efSGarrett D'Amore
215260e9a87SYuri Pankov c = blk.buf[i];
216260e9a87SYuri Pankov if (c & 0x80) {
217260e9a87SYuri Pankov if ( ! (curp->filenc && preconv_encode(
218260e9a87SYuri Pankov &blk, &i, &ln, &pos, &curp->filenc))) {
219cec8643bSMichal Nowak mandoc_msg(MANDOCERR_CHAR_BAD,
220260e9a87SYuri Pankov curp->line, pos, "0x%x", c);
221260e9a87SYuri Pankov ln.buf[pos++] = '?';
222260e9a87SYuri Pankov i++;
223260e9a87SYuri Pankov }
224260e9a87SYuri Pankov continue;
225260e9a87SYuri Pankov }
22695c635efSGarrett D'Amore
227260e9a87SYuri Pankov /*
228260e9a87SYuri Pankov * Exclude control characters.
229260e9a87SYuri Pankov */
230260e9a87SYuri Pankov
231260e9a87SYuri Pankov if (c == 0x7f || (c < 0x20 && c != 0x09)) {
232cec8643bSMichal Nowak mandoc_msg(c == 0x00 || c == 0x04 ||
233260e9a87SYuri Pankov c > 0x0a ? MANDOCERR_CHAR_BAD :
234260e9a87SYuri Pankov MANDOCERR_CHAR_UNSUPP,
235cec8643bSMichal Nowak curp->line, pos, "0x%x", c);
23695c635efSGarrett D'Amore i++;
237260e9a87SYuri Pankov if (c != '\r')
238260e9a87SYuri Pankov ln.buf[pos++] = '?';
23995c635efSGarrett D'Amore continue;
24095c635efSGarrett D'Amore }
24195c635efSGarrett D'Amore
24295c635efSGarrett D'Amore ln.buf[pos++] = blk.buf[i++];
24395c635efSGarrett D'Amore }
244cec8643bSMichal Nowak ln.buf[pos] = '\0';
24595c635efSGarrett D'Amore
246cec8643bSMichal Nowak /*
247cec8643bSMichal Nowak * Maintain a lookaside buffer of all lines.
248cec8643bSMichal Nowak * parsed from this input source.
249cec8643bSMichal Nowak */
250cec8643bSMichal Nowak
251cec8643bSMichal Nowak thisln = mandoc_malloc(sizeof(*thisln));
252cec8643bSMichal Nowak thisln->buf = mandoc_strdup(ln.buf);
253cec8643bSMichal Nowak thisln->sz = strlen(ln.buf) + 1;
254cec8643bSMichal Nowak thisln->next = NULL;
255cec8643bSMichal Nowak if (firstln == NULL) {
256cec8643bSMichal Nowak firstln = lastln = thisln;
257cec8643bSMichal Nowak if (curp->secondary == NULL)
258cec8643bSMichal Nowak curp->secondary = firstln;
259cec8643bSMichal Nowak } else {
260cec8643bSMichal Nowak lastln->next = thisln;
261cec8643bSMichal Nowak lastln = thisln;
262cec8643bSMichal Nowak }
26395c635efSGarrett D'Amore
264cec8643bSMichal Nowak /* XXX Ugly hack to mark the end of the input. */
265cec8643bSMichal Nowak
266cec8643bSMichal Nowak if (i == blk.sz || blk.buf[i] == '\0') {
267*4d131170SRobert Mustacchi if (pos + 2 > ln.sz)
268*4d131170SRobert Mustacchi resize_buf(&ln, 256);
269c66b8046SYuri Pankov ln.buf[pos++] = '\n';
270cec8643bSMichal Nowak ln.buf[pos] = '\0';
271cec8643bSMichal Nowak }
27295c635efSGarrett D'Amore
27395c635efSGarrett D'Amore /*
27495c635efSGarrett D'Amore * A significant amount of complexity is contained by
27595c635efSGarrett D'Amore * the roff preprocessor. It's line-oriented but can be
27695c635efSGarrett D'Amore * expressed on one line, so we need at times to
27795c635efSGarrett D'Amore * readjust our starting point and re-run it. The roff
27895c635efSGarrett D'Amore * preprocessor can also readjust the buffers with new
27995c635efSGarrett D'Amore * data, so we pass them in wholesale.
28095c635efSGarrett D'Amore */
28195c635efSGarrett D'Amore
28295c635efSGarrett D'Amore of = 0;
283cec8643bSMichal Nowak rerun:
284*4d131170SRobert Mustacchi line_result = roff_parseln(curp->roff, curp->line,
285*4d131170SRobert Mustacchi &ln, &of, start && spos == 0 ? pos : 0);
28695c635efSGarrett D'Amore
287cec8643bSMichal Nowak /* Process options. */
288cec8643bSMichal Nowak
289cec8643bSMichal Nowak if (line_result & ROFF_APPEND)
290cec8643bSMichal Nowak assert(line_result == (ROFF_IGN | ROFF_APPEND));
29195c635efSGarrett D'Amore
292cec8643bSMichal Nowak if (line_result & ROFF_USERCALL)
293cec8643bSMichal Nowak assert((line_result & ROFF_MASK) == ROFF_REPARSE);
294cec8643bSMichal Nowak
295cec8643bSMichal Nowak if (line_result & ROFF_USERRET) {
296cec8643bSMichal Nowak assert(line_result == (ROFF_IGN | ROFF_USERRET));
297cec8643bSMichal Nowak if (start == 0) {
298cec8643bSMichal Nowak /* Return from the current macro. */
299cec8643bSMichal Nowak result = ROFF_USERRET;
300cec8643bSMichal Nowak goto out;
301cec8643bSMichal Nowak }
30295c635efSGarrett D'Amore }
30395c635efSGarrett D'Amore
304cec8643bSMichal Nowak switch (line_result & ROFF_LOOPMASK) {
305cec8643bSMichal Nowak case ROFF_IGN:
306cec8643bSMichal Nowak break;
307cec8643bSMichal Nowak case ROFF_WHILE:
308cec8643bSMichal Nowak if (curp->loop != NULL) {
309cec8643bSMichal Nowak if (loop == curp->loop)
310cec8643bSMichal Nowak break;
311cec8643bSMichal Nowak mandoc_msg(MANDOCERR_WHILE_NEST,
312260e9a87SYuri Pankov curp->line, pos, NULL);
313c66b8046SYuri Pankov }
314cec8643bSMichal Nowak curp->loop = thisln;
315cec8643bSMichal Nowak loop = NULL;
316cec8643bSMichal Nowak inloop = 1;
317cec8643bSMichal Nowak break;
318cec8643bSMichal Nowak case ROFF_LOOPCONT:
319cec8643bSMichal Nowak case ROFF_LOOPEXIT:
320cec8643bSMichal Nowak if (curp->loop == NULL) {
321cec8643bSMichal Nowak mandoc_msg(MANDOCERR_WHILE_FAIL,
322cec8643bSMichal Nowak curp->line, pos, NULL);
323cec8643bSMichal Nowak break;
324cec8643bSMichal Nowak }
325cec8643bSMichal Nowak if (inloop == 0) {
326cec8643bSMichal Nowak mandoc_msg(MANDOCERR_WHILE_INTO,
327cec8643bSMichal Nowak curp->line, pos, NULL);
328cec8643bSMichal Nowak curp->loop = loop = NULL;
329cec8643bSMichal Nowak break;
330cec8643bSMichal Nowak }
331cec8643bSMichal Nowak if (line_result & ROFF_LOOPCONT)
332cec8643bSMichal Nowak loop = curp->loop;
333cec8643bSMichal Nowak else {
334cec8643bSMichal Nowak curp->loop = loop = NULL;
335cec8643bSMichal Nowak inloop = 0;
336cec8643bSMichal Nowak }
337cec8643bSMichal Nowak break;
338cec8643bSMichal Nowak default:
339cec8643bSMichal Nowak abort();
340cec8643bSMichal Nowak }
341cec8643bSMichal Nowak
342cec8643bSMichal Nowak /* Process the main instruction from the roff parser. */
343cec8643bSMichal Nowak
344cec8643bSMichal Nowak switch (line_result & ROFF_MASK) {
345cec8643bSMichal Nowak case ROFF_IGN:
346cec8643bSMichal Nowak break;
347cec8643bSMichal Nowak case ROFF_CONT:
348cec8643bSMichal Nowak if (curp->man->meta.macroset == MACROSET_NONE)
349cec8643bSMichal Nowak choose_parser(curp);
350cec8643bSMichal Nowak if ((curp->man->meta.macroset == MACROSET_MDOC ?
351cec8643bSMichal Nowak mdoc_parseln(curp->man, curp->line, ln.buf, of) :
352cec8643bSMichal Nowak man_parseln(curp->man, curp->line, ln.buf, of)
353cec8643bSMichal Nowak ) == 2)
354cec8643bSMichal Nowak goto out;
355cec8643bSMichal Nowak break;
356260e9a87SYuri Pankov case ROFF_RERUN:
35795c635efSGarrett D'Amore goto rerun;
358cec8643bSMichal Nowak case ROFF_REPARSE:
359cec8643bSMichal Nowak if (++curp->reparse_count > REPARSE_LIMIT) {
360cec8643bSMichal Nowak /* Abort and return to the top level. */
361cec8643bSMichal Nowak result = ROFF_IGN;
362cec8643bSMichal Nowak mandoc_msg(MANDOCERR_ROFFLOOP,
363cec8643bSMichal Nowak curp->line, pos, NULL);
364cec8643bSMichal Nowak goto out;
365cec8643bSMichal Nowak }
366cec8643bSMichal Nowak result = mparse_buf_r(curp, ln, of, 0);
367cec8643bSMichal Nowak if (line_result & ROFF_USERCALL) {
368cec8643bSMichal Nowak roff_userret(curp->roff);
369cec8643bSMichal Nowak /* Continue normally. */
370cec8643bSMichal Nowak if (result & ROFF_USERRET)
371cec8643bSMichal Nowak result = ROFF_CONT;
372cec8643bSMichal Nowak }
373cec8643bSMichal Nowak if (start == 0 && result != ROFF_CONT)
374cec8643bSMichal Nowak goto out;
375cec8643bSMichal Nowak break;
376260e9a87SYuri Pankov case ROFF_SO:
377260e9a87SYuri Pankov if ( ! (curp->options & MPARSE_SO) &&
378260e9a87SYuri Pankov (i >= blk.sz || blk.buf[i] == '\0')) {
379cec8643bSMichal Nowak curp->man->meta.sodest =
380cec8643bSMichal Nowak mandoc_strdup(ln.buf + of);
381cec8643bSMichal Nowak goto out;
382260e9a87SYuri Pankov }
383371584c2SYuri Pankov if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
384260e9a87SYuri Pankov mparse_readfd(curp, fd, ln.buf + of);
385371584c2SYuri Pankov close(fd);
386260e9a87SYuri Pankov } else {
387cec8643bSMichal Nowak mandoc_msg(MANDOCERR_SO_FAIL,
388cec8643bSMichal Nowak curp->line, of, ".so %s: %s",
389cec8643bSMichal Nowak ln.buf + of, strerror(errno));
390260e9a87SYuri Pankov ln.sz = mandoc_asprintf(&cp,
391260e9a87SYuri Pankov ".sp\nSee the file %s.\n.sp",
392260e9a87SYuri Pankov ln.buf + of);
393260e9a87SYuri Pankov free(ln.buf);
394260e9a87SYuri Pankov ln.buf = cp;
395260e9a87SYuri Pankov of = 0;
396260e9a87SYuri Pankov mparse_buf_r(curp, ln, of, 0);
397260e9a87SYuri Pankov }
39895c635efSGarrett D'Amore break;
399cec8643bSMichal Nowak default:
400cec8643bSMichal Nowak abort();
40195c635efSGarrett D'Amore }
40295c635efSGarrett D'Amore
40395c635efSGarrett D'Amore /* Start the next input line. */
40495c635efSGarrett D'Amore
405cec8643bSMichal Nowak if (loop != NULL &&
406cec8643bSMichal Nowak (line_result & ROFF_LOOPMASK) == ROFF_IGN)
407cec8643bSMichal Nowak loop = loop->next;
408cec8643bSMichal Nowak
409cec8643bSMichal Nowak if (loop != NULL) {
410cec8643bSMichal Nowak if ((line_result & ROFF_APPEND) == 0)
411cec8643bSMichal Nowak *ln.buf = '\0';
412cec8643bSMichal Nowak if (ln.sz < loop->sz)
413cec8643bSMichal Nowak resize_buf(&ln, loop->sz);
414cec8643bSMichal Nowak (void)strlcat(ln.buf, loop->buf, ln.sz);
415cec8643bSMichal Nowak of = 0;
416cec8643bSMichal Nowak goto rerun;
417cec8643bSMichal Nowak }
41895c635efSGarrett D'Amore
419cec8643bSMichal Nowak pos = (line_result & ROFF_APPEND) ? strlen(ln.buf) : 0;
420cec8643bSMichal Nowak }
421cec8643bSMichal Nowak out:
422cec8643bSMichal Nowak if (inloop) {
423cec8643bSMichal Nowak if (result != ROFF_USERRET)
424cec8643bSMichal Nowak mandoc_msg(MANDOCERR_WHILE_OUTOF,
425cec8643bSMichal Nowak curp->line, pos, NULL);
426cec8643bSMichal Nowak curp->loop = NULL;
427cec8643bSMichal Nowak }
42895c635efSGarrett D'Amore free(ln.buf);
429cec8643bSMichal Nowak if (firstln != curp->secondary)
430cec8643bSMichal Nowak free_buf_list(firstln);
431cec8643bSMichal Nowak return result;
43295c635efSGarrett D'Amore }
43395c635efSGarrett D'Amore
43495c635efSGarrett D'Amore static int
read_whole_file(struct mparse * curp,int fd,struct buf * fb,int * with_mmap)435cec8643bSMichal Nowak read_whole_file(struct mparse *curp, int fd, struct buf *fb, int *with_mmap)
43695c635efSGarrett D'Amore {
437a40ea1a7SYuri Pankov struct stat st;
438371584c2SYuri Pankov gzFile gz;
43995c635efSGarrett D'Amore size_t off;
44095c635efSGarrett D'Amore ssize_t ssz;
4416640c13bSYuri Pankov int gzerrnum, retval;
44295c635efSGarrett D'Amore
443c66b8046SYuri Pankov if (fstat(fd, &st) == -1) {
444*4d131170SRobert Mustacchi mandoc_msg(MANDOCERR_FSTAT, 0, 0, "%s", strerror(errno));
445*4d131170SRobert Mustacchi return -1;
446c66b8046SYuri Pankov }
44795c635efSGarrett D'Amore
44895c635efSGarrett D'Amore /*
44995c635efSGarrett D'Amore * If we're a regular file, try just reading in the whole entry
45095c635efSGarrett D'Amore * via mmap(). This is faster than reading it into blocks, and
45195c635efSGarrett D'Amore * since each file is only a few bytes to begin with, I'm not
45295c635efSGarrett D'Amore * concerned that this is going to tank any machines.
45395c635efSGarrett D'Amore */
45495c635efSGarrett D'Amore
455371584c2SYuri Pankov if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
456260e9a87SYuri Pankov if (st.st_size > 0x7fffffff) {
457cec8643bSMichal Nowak mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
458*4d131170SRobert Mustacchi return -1;
45995c635efSGarrett D'Amore }
46095c635efSGarrett D'Amore *with_mmap = 1;
46195c635efSGarrett D'Amore fb->sz = (size_t)st.st_size;
462698f87a4SGarrett D'Amore fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
46395c635efSGarrett D'Amore if (fb->buf != MAP_FAILED)
464*4d131170SRobert Mustacchi return 0;
46595c635efSGarrett D'Amore }
46695c635efSGarrett D'Amore
467371584c2SYuri Pankov if (curp->gzip) {
4686640c13bSYuri Pankov /*
4696640c13bSYuri Pankov * Duplicating the file descriptor is required
4706640c13bSYuri Pankov * because we will have to call gzclose(3)
4716640c13bSYuri Pankov * to free memory used internally by zlib,
4726640c13bSYuri Pankov * but that will also close the file descriptor,
4736640c13bSYuri Pankov * which this function must not do.
4746640c13bSYuri Pankov */
4756640c13bSYuri Pankov if ((fd = dup(fd)) == -1) {
476*4d131170SRobert Mustacchi mandoc_msg(MANDOCERR_DUP, 0, 0,
477*4d131170SRobert Mustacchi "%s", strerror(errno));
478*4d131170SRobert Mustacchi return -1;
4796640c13bSYuri Pankov }
480c66b8046SYuri Pankov if ((gz = gzdopen(fd, "rb")) == NULL) {
481*4d131170SRobert Mustacchi mandoc_msg(MANDOCERR_GZDOPEN, 0, 0,
482*4d131170SRobert Mustacchi "%s", strerror(errno));
4836640c13bSYuri Pankov close(fd);
484*4d131170SRobert Mustacchi return -1;
485c66b8046SYuri Pankov }
486371584c2SYuri Pankov } else
487371584c2SYuri Pankov gz = NULL;
488371584c2SYuri Pankov
48995c635efSGarrett D'Amore /*
49095c635efSGarrett D'Amore * If this isn't a regular file (like, say, stdin), then we must
49195c635efSGarrett D'Amore * go the old way and just read things in bit by bit.
49295c635efSGarrett D'Amore */
49395c635efSGarrett D'Amore
49495c635efSGarrett D'Amore *with_mmap = 0;
49595c635efSGarrett D'Amore off = 0;
496*4d131170SRobert Mustacchi retval = -1;
49795c635efSGarrett D'Amore fb->sz = 0;
49895c635efSGarrett D'Amore fb->buf = NULL;
49995c635efSGarrett D'Amore for (;;) {
50095c635efSGarrett D'Amore if (off == fb->sz) {
50195c635efSGarrett D'Amore if (fb->sz == (1U << 31)) {
502cec8643bSMichal Nowak mandoc_msg(MANDOCERR_TOOLARGE, 0, 0, NULL);
50395c635efSGarrett D'Amore break;
50495c635efSGarrett D'Amore }
50595c635efSGarrett D'Amore resize_buf(fb, 65536);
50695c635efSGarrett D'Amore }
507371584c2SYuri Pankov ssz = curp->gzip ?
508371584c2SYuri Pankov gzread(gz, fb->buf + (int)off, fb->sz - off) :
509371584c2SYuri Pankov read(fd, fb->buf + (int)off, fb->sz - off);
51095c635efSGarrett D'Amore if (ssz == 0) {
51195c635efSGarrett D'Amore fb->sz = off;
512*4d131170SRobert Mustacchi retval = 0;
5136640c13bSYuri Pankov break;
51495c635efSGarrett D'Amore }
515c66b8046SYuri Pankov if (ssz == -1) {
5166640c13bSYuri Pankov if (curp->gzip)
5176640c13bSYuri Pankov (void)gzerror(gz, &gzerrnum);
518*4d131170SRobert Mustacchi mandoc_msg(MANDOCERR_READ, 0, 0, "%s",
5196640c13bSYuri Pankov curp->gzip && gzerrnum != Z_ERRNO ?
5206640c13bSYuri Pankov zError(gzerrnum) : strerror(errno));
521c66b8046SYuri Pankov break;
522c66b8046SYuri Pankov }
52395c635efSGarrett D'Amore off += (size_t)ssz;
52495c635efSGarrett D'Amore }
52595c635efSGarrett D'Amore
5266640c13bSYuri Pankov if (curp->gzip && (gzerrnum = gzclose(gz)) != Z_OK)
527*4d131170SRobert Mustacchi mandoc_msg(MANDOCERR_GZCLOSE, 0, 0, "%s",
5286640c13bSYuri Pankov gzerrnum == Z_ERRNO ? strerror(errno) :
5296640c13bSYuri Pankov zError(gzerrnum));
530*4d131170SRobert Mustacchi if (retval == -1) {
5316640c13bSYuri Pankov free(fb->buf);
5326640c13bSYuri Pankov fb->buf = NULL;
5336640c13bSYuri Pankov }
5346640c13bSYuri Pankov return retval;
53595c635efSGarrett D'Amore }
53695c635efSGarrett D'Amore
53795c635efSGarrett D'Amore static void
mparse_end(struct mparse * curp)53895c635efSGarrett D'Amore mparse_end(struct mparse *curp)
53995c635efSGarrett D'Amore {
540cec8643bSMichal Nowak if (curp->man->meta.macroset == MACROSET_NONE)
541cec8643bSMichal Nowak curp->man->meta.macroset = MACROSET_MAN;
542cec8643bSMichal Nowak if (curp->man->meta.macroset == MACROSET_MDOC)
543371584c2SYuri Pankov mdoc_endparse(curp->man);
544371584c2SYuri Pankov else
545260e9a87SYuri Pankov man_endparse(curp->man);
54695c635efSGarrett D'Amore roff_endparse(curp->roff);
54795c635efSGarrett D'Amore }
54895c635efSGarrett D'Amore
549cec8643bSMichal Nowak /*
550cec8643bSMichal Nowak * Read the whole file into memory and call the parsers.
551cec8643bSMichal Nowak * Called recursively when an .so request is encountered.
552cec8643bSMichal Nowak */
553cec8643bSMichal Nowak void
mparse_readfd(struct mparse * curp,int fd,const char * filename)554cec8643bSMichal Nowak mparse_readfd(struct mparse *curp, int fd, const char *filename)
55595c635efSGarrett D'Amore {
556698f87a4SGarrett D'Amore static int recursion_depth;
557698f87a4SGarrett D'Amore
558cec8643bSMichal Nowak struct buf blk;
559cec8643bSMichal Nowak struct buf *save_primary;
560*4d131170SRobert Mustacchi const char *save_filename, *cp;
561cec8643bSMichal Nowak size_t offset;
562cec8643bSMichal Nowak int save_filenc, save_lineno;
563cec8643bSMichal Nowak int with_mmap;
564cec8643bSMichal Nowak
565cec8643bSMichal Nowak if (recursion_depth > 64) {
566cec8643bSMichal Nowak mandoc_msg(MANDOCERR_ROFFLOOP, curp->line, 0, NULL);
567698f87a4SGarrett D'Amore return;
568*4d131170SRobert Mustacchi } else if (recursion_depth == 0 &&
569*4d131170SRobert Mustacchi (cp = strrchr(filename, '.')) != NULL &&
570*4d131170SRobert Mustacchi cp[1] >= '1' && cp[1] <= '9')
571*4d131170SRobert Mustacchi curp->man->filesec = cp[1];
572*4d131170SRobert Mustacchi else
573*4d131170SRobert Mustacchi curp->man->filesec = '\0';
574*4d131170SRobert Mustacchi
575*4d131170SRobert Mustacchi if (read_whole_file(curp, fd, &blk, &with_mmap) == -1)
576cec8643bSMichal Nowak return;
577cec8643bSMichal Nowak
578cec8643bSMichal Nowak /*
579cec8643bSMichal Nowak * Save some properties of the parent file.
580cec8643bSMichal Nowak */
581cec8643bSMichal Nowak
582cec8643bSMichal Nowak save_primary = curp->primary;
583cec8643bSMichal Nowak save_filenc = curp->filenc;
584cec8643bSMichal Nowak save_lineno = curp->line;
585cec8643bSMichal Nowak save_filename = mandoc_msg_getinfilename();
58695c635efSGarrett D'Amore
587260e9a87SYuri Pankov curp->primary = &blk;
588cec8643bSMichal Nowak curp->filenc = curp->options & (MPARSE_UTF8 | MPARSE_LATIN1);
58995c635efSGarrett D'Amore curp->line = 1;
590cec8643bSMichal Nowak mandoc_msg_setinfilename(filename);
59195c635efSGarrett D'Amore
592260e9a87SYuri Pankov /* Skip an UTF-8 byte order mark. */
593260e9a87SYuri Pankov if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
594260e9a87SYuri Pankov (unsigned char)blk.buf[0] == 0xef &&
595260e9a87SYuri Pankov (unsigned char)blk.buf[1] == 0xbb &&
596260e9a87SYuri Pankov (unsigned char)blk.buf[2] == 0xbf) {
597260e9a87SYuri Pankov offset = 3;
598260e9a87SYuri Pankov curp->filenc &= ~MPARSE_LATIN1;
599260e9a87SYuri Pankov } else
600260e9a87SYuri Pankov offset = 0;
60195c635efSGarrett D'Amore
602cec8643bSMichal Nowak recursion_depth++;
603260e9a87SYuri Pankov mparse_buf_r(curp, blk, offset, 1);
604260e9a87SYuri Pankov if (--recursion_depth == 0)
60595c635efSGarrett D'Amore mparse_end(curp);
60695c635efSGarrett D'Amore
607cec8643bSMichal Nowak /*
608cec8643bSMichal Nowak * Clean up and restore saved parent properties.
609cec8643bSMichal Nowak */
61095c635efSGarrett D'Amore
611cec8643bSMichal Nowak if (with_mmap)
612cec8643bSMichal Nowak munmap(blk.buf, blk.sz);
613cec8643bSMichal Nowak else
614cec8643bSMichal Nowak free(blk.buf);
61595c635efSGarrett D'Amore
616cec8643bSMichal Nowak curp->primary = save_primary;
617cec8643bSMichal Nowak curp->filenc = save_filenc;
618cec8643bSMichal Nowak curp->line = save_lineno;
619cec8643bSMichal Nowak if (save_filename != NULL)
620cec8643bSMichal Nowak mandoc_msg_setinfilename(save_filename);
621260e9a87SYuri Pankov }
622260e9a87SYuri Pankov
623371584c2SYuri Pankov int
mparse_open(struct mparse * curp,const char * file)624371584c2SYuri Pankov mparse_open(struct mparse *curp, const char *file)
625260e9a87SYuri Pankov {
626260e9a87SYuri Pankov char *cp;
627cec8643bSMichal Nowak int fd, save_errno;
628260e9a87SYuri Pankov
629371584c2SYuri Pankov cp = strrchr(file, '.');
630371584c2SYuri Pankov curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
631260e9a87SYuri Pankov
632371584c2SYuri Pankov /* First try to use the filename as it is. */
633260e9a87SYuri Pankov
634371584c2SYuri Pankov if ((fd = open(file, O_RDONLY)) != -1)
635371584c2SYuri Pankov return fd;
636260e9a87SYuri Pankov
637371584c2SYuri Pankov /*
638371584c2SYuri Pankov * If that doesn't work and the filename doesn't
639371584c2SYuri Pankov * already end in .gz, try appending .gz.
640371584c2SYuri Pankov */
641260e9a87SYuri Pankov
642371584c2SYuri Pankov if ( ! curp->gzip) {
643cec8643bSMichal Nowak save_errno = errno;
644260e9a87SYuri Pankov mandoc_asprintf(&cp, "%s.gz", file);
645371584c2SYuri Pankov fd = open(cp, O_RDONLY);
646260e9a87SYuri Pankov free(cp);
647cec8643bSMichal Nowak errno = save_errno;
648371584c2SYuri Pankov if (fd != -1) {
649371584c2SYuri Pankov curp->gzip = 1;
650371584c2SYuri Pankov return fd;
651260e9a87SYuri Pankov }
652260e9a87SYuri Pankov }
653260e9a87SYuri Pankov
654371584c2SYuri Pankov /* Neither worked, give up. */
655260e9a87SYuri Pankov
656371584c2SYuri Pankov return -1;
65795c635efSGarrett D'Amore }
65895c635efSGarrett D'Amore
65995c635efSGarrett D'Amore struct mparse *
mparse_alloc(int options,enum mandoc_os os_e,const char * os_s)660cec8643bSMichal Nowak mparse_alloc(int options, enum mandoc_os os_e, const char *os_s)
66195c635efSGarrett D'Amore {
66295c635efSGarrett D'Amore struct mparse *curp;
66395c635efSGarrett D'Amore
66495c635efSGarrett D'Amore curp = mandoc_calloc(1, sizeof(struct mparse));
66595c635efSGarrett D'Amore
666260e9a87SYuri Pankov curp->options = options;
667c66b8046SYuri Pankov curp->os_s = os_s;
66895c635efSGarrett D'Amore
669cec8643bSMichal Nowak curp->roff = roff_alloc(options);
670cec8643bSMichal Nowak curp->man = roff_man_alloc(curp->roff, curp->os_s,
671371584c2SYuri Pankov curp->options & MPARSE_QUICK ? 1 : 0);
672371584c2SYuri Pankov if (curp->options & MPARSE_MDOC) {
673cec8643bSMichal Nowak curp->man->meta.macroset = MACROSET_MDOC;
674c66b8046SYuri Pankov if (curp->man->mdocmac == NULL)
675c66b8046SYuri Pankov curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
676371584c2SYuri Pankov } else if (curp->options & MPARSE_MAN) {
677cec8643bSMichal Nowak curp->man->meta.macroset = MACROSET_MAN;
678c66b8046SYuri Pankov if (curp->man->manmac == NULL)
679c66b8046SYuri Pankov curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
680371584c2SYuri Pankov }
681cec8643bSMichal Nowak curp->man->meta.first->tok = TOKEN_NONE;
682c66b8046SYuri Pankov curp->man->meta.os_e = os_e;
683*4d131170SRobert Mustacchi tag_alloc();
684371584c2SYuri Pankov return curp;
68595c635efSGarrett D'Amore }
68695c635efSGarrett D'Amore
68795c635efSGarrett D'Amore void
mparse_reset(struct mparse * curp)68895c635efSGarrett D'Amore mparse_reset(struct mparse *curp)
68995c635efSGarrett D'Amore {
690*4d131170SRobert Mustacchi tag_free();
69195c635efSGarrett D'Amore roff_reset(curp->roff);
692a5934736SYuri Pankov roff_man_reset(curp->man);
693cec8643bSMichal Nowak free_buf_list(curp->secondary);
694cec8643bSMichal Nowak curp->secondary = NULL;
695a40ea1a7SYuri Pankov curp->gzip = 0;
696*4d131170SRobert Mustacchi tag_alloc();
69795c635efSGarrett D'Amore }
69895c635efSGarrett D'Amore
69995c635efSGarrett D'Amore void
mparse_free(struct mparse * curp)70095c635efSGarrett D'Amore mparse_free(struct mparse *curp)
70195c635efSGarrett D'Amore {
702*4d131170SRobert Mustacchi tag_free();
703c66b8046SYuri Pankov roffhash_free(curp->man->mdocmac);
704c66b8046SYuri Pankov roffhash_free(curp->man->manmac);
705371584c2SYuri Pankov roff_man_free(curp->man);
706a40ea1a7SYuri Pankov roff_free(curp->roff);
707cec8643bSMichal Nowak free_buf_list(curp->secondary);
70895c635efSGarrett D'Amore free(curp);
70995c635efSGarrett D'Amore }
71095c635efSGarrett D'Amore
711cec8643bSMichal Nowak struct roff_meta *
mparse_result(struct mparse * curp)712cec8643bSMichal Nowak mparse_result(struct mparse *curp)
71395c635efSGarrett D'Amore {
714cec8643bSMichal Nowak roff_state_reset(curp->man);
715cec8643bSMichal Nowak if (curp->options & MPARSE_VALIDATE) {
716cec8643bSMichal Nowak if (curp->man->meta.macroset == MACROSET_MDOC)
717cec8643bSMichal Nowak mdoc_validate(curp->man);
718cec8643bSMichal Nowak else
719cec8643bSMichal Nowak man_validate(curp->man);
720*4d131170SRobert Mustacchi tag_postprocess(curp->man, curp->man->meta.first);
721260e9a87SYuri Pankov }
722cec8643bSMichal Nowak return &curp->man->meta;
72395c635efSGarrett D'Amore }
72495c635efSGarrett D'Amore
725a5934736SYuri Pankov void
mparse_copy(const struct mparse * p)726cec8643bSMichal Nowak mparse_copy(const struct mparse *p)
72795c635efSGarrett D'Amore {
728cec8643bSMichal Nowak struct buf *buf;
72995c635efSGarrett D'Amore
730cec8643bSMichal Nowak for (buf = p->secondary; buf != NULL; buf = buf->next)
731cec8643bSMichal Nowak puts(buf->buf);
73295c635efSGarrett D'Amore }
733