15354776peter/* dfa.h - declarations for GNU deterministic regexp compiler
2fa89658obrien   Copyright (C) 1988, 1998 Free Software Foundation, Inc.
45354776peter   This program is free software; you can redistribute it and/or modify
55354776peter   it under the terms of the GNU General Public License as published by
65354776peter   the Free Software Foundation; either version 2, or (at your option)
75354776peter   any later version.
95354776peter   This program is distributed in the hope that it will be useful,
105354776peter   but WITHOUT ANY WARRANTY; without even the implied warranty of
125354776peter   GNU General Public License for more details.
145354776peter   You should have received a copy of the GNU General Public License
155354776peter   along with this program; if not, write to the Free Software
16fa89658obrien   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA */
185354776peter/* Written June, 1988 by Mike Haertel */
20fa89658obrien/* $FreeBSD$ */
225354776peter/* FIXME:
235354776peter   2.  We should not export so much of the DFA internals.
245354776peter   In addition to clobbering modularity, we eat up valuable
255354776peter   name space. */
2782d1ca3tjr#ifdef __STDC__
28fa89658obrien# ifndef _PTR_T
29fa89658obrien# define _PTR_T
30fa89658obrien  typedef void * ptr_t;
31fa89658obrien# endif
33fa89658obrien# ifndef _PTR_T
34fa89658obrien# define _PTR_T
35fa89658obrien  typedef char * ptr_t;
36fa89658obrien# endif
3982d1ca3tjr#ifdef PARAMS
4082d1ca3tjr# undef PARAMS
4282d1ca3tjr#if PROTOTYPES
4382d1ca3tjr# define PARAMS(x) x
45fa89658obrien# define PARAMS(x) ()
485354776peter/* Number of bits in an unsigned char. */
49fa89658obrien#ifndef CHARBITS
505354776peter#define CHARBITS 8
535354776peter/* First integer value that is greater than any character code. */
545354776peter#define NOTCHAR (1 << CHARBITS)
565354776peter/* INTBITS need not be exact, just a lower bound. */
57fa89658obrien#ifndef INTBITS
585354776peter#define INTBITS (CHARBITS * sizeof (int))
615354776peter/* Number of ints required to hold a bit for every character. */
625354776peter#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
645354776peter/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
655354776petertypedef int charclass[CHARCLASS_INTS];
675354776peter/* The regexp is parsed into an array of tokens in postfix form.  Some tokens
685354776peter   are operators and others are terminal symbols.  Most (but not all) of these
695354776peter   codes are returned by the lexical analyzer. */
715354776petertypedef enum
735354776peter  END = -1,			/* END is a terminal symbol that matches the
745354776peter				   end of input; any value of END or less in
755354776peter				   the parse tree is such a symbol.  Accepting
765354776peter				   states of the DFA are those that would have
775354776peter				   a transition on END. */
795354776peter  /* Ordinary character values are terminal symbols that match themselves. */
815354776peter  EMPTY = NOTCHAR,		/* EMPTY is a terminal symbol that matches
825354776peter				   the empty string. */
845354776peter  BACKREF,			/* BACKREF is generated by \<digit>; it
855354776peter				   it not completely handled.  If the scanner
865354776peter				   detects a transition on backref, it returns
875354776peter				   a kind of "semi-success" indicating that
885354776peter				   the match will have to be verified with
895354776peter				   a backtracking matcher. */
915354776peter  BEGLINE,			/* BEGLINE is a terminal symbol that matches
925354776peter				   the empty string if it is at the beginning
935354776peter				   of a line. */
955354776peter  ENDLINE,			/* ENDLINE is a terminal symbol that matches
965354776peter				   the empty string if it is at the end of
975354776peter				   a line. */
995354776peter  BEGWORD,			/* BEGWORD is a terminal symbol that matches
1005354776peter				   the empty string if it is at the beginning
1015354776peter				   of a word. */
1035354776peter  ENDWORD,			/* ENDWORD is a terminal symbol that matches
1045354776peter				   the empty string if it is at the end of
1055354776peter				   a word. */
1075354776peter  LIMWORD,			/* LIMWORD is a terminal symbol that matches
1085354776peter				   the empty string if it is at the beginning
1095354776peter				   or the end of a word. */
1115354776peter  NOTLIMWORD,			/* NOTLIMWORD is a terminal symbol that
1125354776peter				   matches the empty string if it is not at
1135354776peter				   the beginning or end of a word. */
1155354776peter  QMARK,			/* QMARK is an operator of one argument that
1165354776peter				   matches zero or one occurences of its
1175354776peter				   argument. */
1195354776peter  STAR,				/* STAR is an operator of one argument that
1205354776peter				   matches the Kleene closure (zero or more
1215354776peter				   occurrences) of its argument. */
1235354776peter  PLUS,				/* PLUS is an operator of one argument that
1245354776peter				   matches the positive closure (one or more
1255354776peter				   occurrences) of its argument. */
1275354776peter  REPMN,			/* REPMN is a lexical token corresponding
1285354776peter				   to the {m,n} construct.  REPMN never
1295354776peter				   appears in the compiled token vector. */
1315354776peter  CAT,				/* CAT is an operator of two arguments that
1325354776peter				   matches the concatenation of its
1335354776peter				   arguments.  CAT is never returned by the
1345354776peter				   lexical analyzer. */
1365354776peter  OR,				/* OR is an operator of two arguments that
1375354776peter				   matches either of its arguments. */
1395354776peter  ORTOP,			/* OR at the toplevel in the parse tree.
1405354776peter				   This is used for a boyer-moore heuristic. */
1425354776peter  LPAREN,			/* LPAREN never appears in the parse tree,
1435354776peter				   it is only a lexeme. */
1455354776peter  RPAREN,			/* RPAREN never appears in the parse tree. */
14782d1ca3tjr  CRANGE,			/* CRANGE never appears in the parse tree.
14882d1ca3tjr				   It stands for a character range that can
14982d1ca3tjr				   match a string of one or more characters.
15082d1ca3tjr				   For example, [a-z] can match "ch" in
15182d1ca3tjr				   a Spanish locale.  */
15382d1ca3tjr#ifdef MBS_SUPPORT
15482d1ca3tjr  ANYCHAR,                     /* ANYCHAR is a terminal symbol that matches
15582d1ca3tjr                                  any multibyte(or singlebyte) characters.
15682d1ca3tjr			          It is used only if MB_CUR_MAX > 1.  */
15882d1ca3tjr  MBCSET,			/* MBCSET is similar to CSET, but for
15982d1ca3tjr				   multibyte characters.  */
16082d1ca3tjr#endif /* MBS_SUPPORT */
1625354776peter  CSET				/* CSET and (and any value greater) is a
1635354776peter				   terminal symbol that matches any of a
1645354776peter				   class of characters. */
1655354776peter} token;
1675354776peter/* Sets are stored in an array in the compiled dfa; the index of the
1685354776peter   array corresponding to a given set token is given by SET_INDEX(t). */
1695354776peter#define SET_INDEX(t) ((t) - CSET)
1715354776peter/* Sometimes characters can only be matched depending on the surrounding
1725354776peter   context.  Such context decisions depend on what the previous character
1735354776peter   was, and the value of the current (lookahead) character.  Context
1745354776peter   dependent constraints are encoded as 8 bit integers.  Each bit that
1755354776peter   is set indicates that the constraint succeeds in the corresponding
1765354776peter   context.
1785354776peter   bit 7 - previous and current are newlines
1795354776peter   bit 6 - previous was newline, current isn't
1805354776peter   bit 5 - previous wasn't newline, current is
1815354776peter   bit 4 - neither previous nor current is a newline
1825354776peter   bit 3 - previous and current are word-constituents
1835354776peter   bit 2 - previous was word-constituent, current isn't
1845354776peter   bit 1 - previous wasn't word-constituent, current is
1855354776peter   bit 0 - neither previous nor current is word-constituent
1875354776peter   Word-constituent characters are those that satisfy isalnum().
1895354776peter   The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint
1905354776peter   succeeds in a particular context.  Prevn is true if the previous character
1915354776peter   was a newline, currn is true if the lookahead character is a newline.
1925354776peter   Prevl and currl similarly depend upon whether the previous and current
1935354776peter   characters are word-constituent letters. */
1945354776peter#define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
1955354776peter  ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
1965354776peter#define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
1975354776peter  ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
1985354776peter#define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
1995354776peter  (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn)		     \
2005354776peter   && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
2025354776peter/* The following macros give information about what a constraint depends on. */
2035354776peter#define PREV_NEWLINE_DEPENDENT(constraint) \
2045354776peter  (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
2055354776peter#define PREV_LETTER_DEPENDENT(constraint) \
2065354776peter  (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
2085354776peter/* Tokens that match the empty string subject to some constraint actually
2095354776peter   work by applying that constraint to determine what may follow them,
2105354776peter   taking into account what has gone before.  The following values are
2115354776peter   the constraints corresponding to the special tokens previously defined. */
2125354776peter#define NO_CONSTRAINT 0xff
2135354776peter#define BEGLINE_CONSTRAINT 0xcf
2145354776peter#define ENDLINE_CONSTRAINT 0xaf
2155354776peter#define BEGWORD_CONSTRAINT 0xf2
2165354776peter#define ENDWORD_CONSTRAINT 0xf4
2175354776peter#define LIMWORD_CONSTRAINT 0xf6
2185354776peter#define NOTLIMWORD_CONSTRAINT 0xf9
2205354776peter/* States of the recognizer correspond to sets of positions in the parse
2215354776peter   tree, together with the constraints under which they may be matched.
2225354776peter   So a position is encoded as an index into the parse tree together with
2235354776peter   a constraint. */
2245354776petertypedef struct
2265354776peter  unsigned index;		/* Index into the parse array. */
2275354776peter  unsigned constraint;		/* Constraint for matching this position. */
2285354776peter} position;
2305354776peter/* Sets of positions are stored as arrays. */
2315354776petertypedef struct
2335354776peter  position *elems;		/* Elements of this position set. */
2345354776peter  int nelem;			/* Number of elements in this set. */
2355354776peter} position_set;
2375354776peter/* A state of the dfa consists of a set of positions, some flags,
2385354776peter   and the token value of the lowest-numbered position of the state that
2395354776peter   contains an END token. */
2405354776petertypedef struct
2425354776peter  int hash;			/* Hash of the positions of this state. */
2435354776peter  position_set elems;		/* Positions this state could match. */
2445354776peter  char newline;			/* True if previous state matched newline. */
2455354776peter  char letter;			/* True if previous state matched a letter. */
2465354776peter  char backref;			/* True if this state matches a \<digit>. */
2475354776peter  unsigned char constraint;	/* Constraint for this state to accept. */
2485354776peter  int first_end;		/* Token value of the first END in elems. */
24982d1ca3tjr#ifdef MBS_SUPPORT
25082d1ca3tjr  position_set mbps;           /* Positions which can match multibyte
25182d1ca3tjr                                  characters.  e.g. period.
25282d1ca3tjr				  These staff are used only if
25382d1ca3tjr				  MB_CUR_MAX > 1.  */
2555354776peter} dfa_state;
2575354776peter/* Element of a list of strings, at least one of which is known to
2585354776peter   appear in any R.E. matching the DFA. */
2595354776peterstruct dfamust
2615354776peter  int exact;
2625354776peter  char *must;
2635354776peter  struct dfamust *next;
26682d1ca3tjr#ifdef MBS_SUPPORT
26782d1ca3tjr/* A bracket operator.
26882d1ca3tjr   e.g. [a-c], [[:alpha:]], etc.  */
26982d1ca3tjrstruct mb_char_classes
27182d1ca3tjr  int invert;
27282d1ca3tjr  wchar_t *chars;		/* Normal characters.  */
27382d1ca3tjr  int nchars;
27482d1ca3tjr  wctype_t *ch_classes;		/* Character classes.  */
27582d1ca3tjr  int nch_classes;
27682d1ca3tjr  wchar_t *range_sts;		/* Range characters (start of the range).  */
27782d1ca3tjr  wchar_t *range_ends;		/* Range characters (end of the range).  */
27882d1ca3tjr  int nranges;
27982d1ca3tjr  char **equivs;		/* Equivalent classes.  */
28082d1ca3tjr  int nequivs;
28182d1ca3tjr  char **coll_elems;
28282d1ca3tjr  int ncoll_elems;		/* Collating elements.  */
2865354776peter/* A compiled regular expression. */
2875354776peterstruct dfa
2895354776peter  /* Stuff built by the scanner. */
2905354776peter  charclass *charclasses;	/* Array of character sets for CSET tokens. */
2915354776peter  int cindex;			/* Index for adding new charclasses. */
2925354776peter  int calloc;			/* Number of charclasses currently allocated. */
2945354776peter  /* Stuff built by the parser. */
2955354776peter  token *tokens;		/* Postfix parse array. */
2965354776peter  int tindex;			/* Index for adding new tokens. */
2975354776peter  int talloc;			/* Number of tokens currently allocated. */
2985354776peter  int depth;			/* Depth required of an evaluation stack
2995354776peter				   used for depth-first traversal of the
3005354776peter				   parse tree. */
3015354776peter  int nleaves;			/* Number of leaves on the parse tree. */
3025354776peter  int nregexps;			/* Count of parallel regexps being built
3035354776peter				   with dfaparse(). */
30482d1ca3tjr#ifdef MBS_SUPPORT
30582d1ca3tjr  /* These stuff are used only if MB_CUR_MAX > 1 or multibyte environments.  */
30682d1ca3tjr  int nmultibyte_prop;
30782d1ca3tjr  int *multibyte_prop;
30882d1ca3tjr  /* The value of multibyte_prop[i] is defined by following rule.
30982d1ca3tjr       if tokens[i] < NOTCHAR
31082d1ca3tjr         bit 1 : tokens[i] is a singlebyte character, or the last-byte of
31182d1ca3tjr	         a multibyte character.
31282d1ca3tjr	 bit 0 : tokens[i] is a singlebyte character, or the 1st-byte of
31382d1ca3tjr	         a multibyte character.
31482d1ca3tjr       if tokens[i] = MBCSET
31582d1ca3tjr         ("the index of mbcsets correspnd to this operator" << 2) + 3
31782d1ca3tjr     e.g.
31882d1ca3tjr     tokens
31982d1ca3tjr        = 'single_byte_a', 'multi_byte_A', single_byte_b'
32082d1ca3tjr        = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
32182d1ca3tjr     multibyte_prop
32282d1ca3tjr        = 3     , 1               ,  0              ,  2              , 3
32382d1ca3tjr  */
32582d1ca3tjr  /* Array of the bracket expressoin in the DFA.  */
32682d1ca3tjr  struct mb_char_classes *mbcsets;
32782d1ca3tjr  int nmbcsets;
32882d1ca3tjr  int mbcsets_alloc;
3315354776peter  /* Stuff owned by the state builder. */
3325354776peter  dfa_state *states;		/* States of the dfa. */
3335354776peter  int sindex;			/* Index for adding new states. */
3345354776peter  int salloc;			/* Number of states currently allocated. */
3365354776peter  /* Stuff built by the structure analyzer. */
3375354776peter  position_set *follows;	/* Array of follow sets, indexed by position
3385354776peter				   index.  The follow of a position is the set
3395354776peter				   of positions containing characters that
3405354776peter				   could conceivably follow a character
3415354776peter				   matching the given position in a string
3425354776peter				   matching the regexp.  Allocated to the
3435354776peter				   maximum possible position index. */
3445354776peter  int searchflag;		/* True if we are supposed to build a searching
3455354776peter				   as opposed to an exact matcher.  A searching
3465354776peter				   matcher finds the first and shortest string
3475354776peter				   matching a regexp anywhere in the buffer,
3485354776peter				   whereas an exact matcher finds the longest
3495354776peter				   string matching, but anchored to the
3505354776peter				   beginning of the buffer. */
3525354776peter  /* Stuff owned by the executor. */
3535354776peter  int tralloc;			/* Number of transition tables that have
3545354776peter				   slots so far. */
3555354776peter  int trcount;			/* Number of transition tables that have
3565354776peter				   actually been built. */
3575354776peter  int **trans;			/* Transition tables for states that can
3585354776peter				   never accept.  If the transitions for a
3595354776peter				   state have not yet been computed, or the
3605354776peter				   state could possibly accept, its entry in
3615354776peter				   this table is NULL. */
3625354776peter  int **realtrans;		/* Trans always points to realtrans + 1; this
3635354776peter				   is so trans[-1] can contain NULL. */
3645354776peter  int **fails;			/* Transition tables after failing to accept
3655354776peter				   on a state that potentially could do so. */
3665354776peter  int *success;			/* Table of acceptance conditions used in
3675354776peter				   dfaexec and computed in build_state. */
3685354776peter  struct dfamust *musts;	/* List of strings, at least one of which
3695354776peter				   is known to appear in any r.e. matching
3705354776peter				   the dfa. */
3735354776peter/* Some macros for user access to dfa internals. */
3755354776peter/* ACCEPTING returns true if s could possibly be an accepting state of r. */
3765354776peter#define ACCEPTING(s, r) ((r).states[s].constraint)
3785354776peter/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
3795354776peter   specified context. */
3805354776peter#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
3815354776peter  SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint,		   \
3825354776peter		       prevn, currn, prevl, currl)
3845354776peter/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
3855354776peter   regexps that a given state could accept.  Parallel regexps are numbered
3865354776peter   starting at 1. */
3875354776peter#define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end)
3895354776peter/* Entry points. */
3916a9796dobrien/* dfasyntax() takes three arguments; the first sets the syntax bits described
3926a9796dobrien   earlier in this file, the second sets the case-folding flag, and the
3936a9796dobrien   third specifies the line terminator. */
39482d1ca3tjrextern void dfasyntax PARAMS ((reg_syntax_t, int, unsigned char));
3965354776peter/* Compile the given string of the given length into the given struct dfa.
3975354776peter   Final argument is a flag specifying whether to build a searching or an
3985354776peter   exact matcher. */
39982d1ca3tjrextern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
4015354776peter/* Execute the given struct dfa on the buffer of characters.  The
40282d1ca3tjr   last byte of the buffer must equal the end-of-line byte.
40382d1ca3tjr   The final argument points to a flag that will
4045354776peter   be set if further examination by a backtracking matcher is needed in
4055354776peter   order to verify backreferencing; otherwise the flag will be cleared.
40682d1ca3tjr   Returns (size_t) -1 if no match is found, or the offset of the first
4075354776peter   character after the first & shortest matching string in the buffer. */
408e4b49c8tjrextern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *));
4105354776peter/* Free the storage held by the components of a struct dfa. */
411fa89658obrienextern void dfafree PARAMS ((struct dfa *));
4135354776peter/* Entry points for people who know what they're doing. */
4155354776peter/* Initialize the components of a struct dfa. */
416fa89658obrienextern void dfainit PARAMS ((struct dfa *));
4185354776peter/* Incrementally parse a string of given length into a struct dfa. */
41982d1ca3tjrextern void dfaparse PARAMS ((char const *, size_t, struct dfa *));
4215354776peter/* Analyze a parsed regexp; second argument tells whether to build a searching
4225354776peter   or an exact matcher. */
423fa89658obrienextern void dfaanalyze PARAMS ((struct dfa *, int));
4255354776peter/* Compute, for each possible character, the transitions out of a given
4265354776peter   state, storing them in an array of integers. */
427fa89658obrienextern void dfastate PARAMS ((int, struct dfa *, int []));
4295354776peter/* Error handling. */
4315354776peter/* dfaerror() is called by the regexp routines whenever an error occurs.  It
4325354776peter   takes a single argument, a NUL-terminated string describing the error.
43382d1ca3tjr   The user must supply a dfaerror.  */
434fa89658obrienextern void dfaerror PARAMS ((const char *));