1 /* 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright 2012 Milan Jurik. All rights reserved. 4 * Copyright (c) 2016 by Delphix. All rights reserved. 5 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 6 * Copyright (c) 1992, 1993, 1994 7 * The Regents of the University of California. All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Henry Spencer. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 /* 38 * The matching engine and friends. This file is #included by regexec.c 39 * after suitable #defines of a variety of macros used herein, so that 40 * different state representations can be used without duplicating masses 41 * of code. 42 */ 43 44 #ifdef SNAMES 45 #define matcher smatcher 46 #define fast sfast 47 #define slow sslow 48 #define dissect sdissect 49 #define backref sbackref 50 #define step sstep 51 #define print sprint 52 #define at sat 53 #define match smat 54 #endif 55 #ifdef LNAMES 56 #define matcher lmatcher 57 #define fast lfast 58 #define slow lslow 59 #define dissect ldissect 60 #define backref lbackref 61 #define step lstep 62 #define print lprint 63 #define at lat 64 #define match lmat 65 #endif 66 #ifdef MNAMES 67 #define matcher mmatcher 68 #define fast mfast 69 #define slow mslow 70 #define dissect mdissect 71 #define backref mbackref 72 #define step mstep 73 #define print mprint 74 #define at mat 75 #define match mmat 76 #endif 77 78 /* another structure passed up and down to avoid zillions of parameters */ 79 struct match { 80 struct re_guts *g; 81 int eflags; 82 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 83 const char *offp; /* offsets work from here */ 84 const char *beginp; /* start of string -- virtual NUL precedes */ 85 const char *endp; /* end of string -- virtual NUL here */ 86 const char *coldp; /* can be no match starting before here */ 87 const char **lastpos; /* [nplus+1] */ 88 STATEVARS; 89 states st; /* current states */ 90 states fresh; /* states for a fresh start */ 91 states tmp; /* temporary */ 92 states empty; /* empty set of states */ 93 mbstate_t mbs; /* multibyte conversion state */ 94 }; 95 96 /* ========= begin header generated by ./mkh ========= */ 97 #ifdef __cplusplus 98 extern "C" { 99 #endif 100 101 /* === engine.c === */ 102 static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int); 103 static const char *dissect(struct match *, const char *, const char *, 104 sopno, sopno); 105 static const char *backref(struct match *, const char *, const char *, sopno, 106 sopno, sopno, int); 107 static const char *fast(struct match *, const char *, const char *, sopno, 108 sopno); 109 static const char *slow(struct match *, const char *, const char *, sopno, 110 sopno); 111 static states step(struct re_guts *, sopno, sopno, states, wint_t, states); 112 #define MAX_RECURSION 100 113 #define BOL (OUT-1) 114 #define EOL (BOL-1) 115 #define BOLEOL (BOL-2) 116 #define NOTHING (BOL-3) 117 #define BOW (BOL-4) 118 #define EOW (BOL-5) 119 #define BADCHAR (BOL-6) 120 #define NONCHAR(c) ((c) <= OUT) 121 #ifdef REDEBUG 122 static void print(struct match *, const char *, states, int, FILE *); 123 #endif 124 #ifdef REDEBUG 125 static void at(struct match *, const char *, const char *, const char *, 126 sopno, sopno); 127 #endif 128 #ifdef REDEBUG 129 static const char *pchar(int ch); 130 #endif 131 132 #ifdef __cplusplus 133 } 134 #endif 135 /* ========= end header generated by ./mkh ========= */ 136 137 #ifdef REDEBUG 138 #define SP(t, s, c) print(m, t, s, c, stdout) 139 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 140 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 141 #else 142 #define SP(t, s, c) /* nothing */ 143 #define AT(t, p1, p2, s1, s2) /* nothing */ 144 #define NOTE(s) /* nothing */ 145 #endif 146 147 /* 148 * matcher - the actual matching engine 149 */ 150 static int /* 0 success, REG_NOMATCH failure */ 151 matcher(struct re_guts *g, const char *string, size_t nmatch, 152 regmatch_t pmatch[], int eflags) 153 { 154 const char *endp; 155 size_t i; 156 struct match mv; 157 struct match *m = &mv; 158 const char *dp = NULL; 159 const sopno gf = g->firststate+1; /* +1 for OEND */ 160 const sopno gl = g->laststate; 161 const char *start; 162 const char *stop; 163 /* Boyer-Moore algorithms variables */ 164 const char *pp; 165 int cj, mj; 166 const char *mustfirst; 167 const char *mustlast; 168 int *matchjump; 169 int *charjump; 170 171 /* simplify the situation where possible */ 172 if (g->cflags®_NOSUB) 173 nmatch = 0; 174 175 if (eflags®_STARTEND) { 176 start = string + pmatch[0].rm_so; 177 stop = string + pmatch[0].rm_eo; 178 } else { 179 start = string; 180 stop = start + strlen(start); 181 } 182 183 if (stop < start) 184 return (REG_EFATAL); 185 186 /* prescreening; this does wonders for this rather slow code */ 187 if (g->must != NULL) { 188 if (g->charjump != NULL && g->matchjump != NULL) { 189 mustfirst = g->must; 190 mustlast = g->must + g->mlen - 1; 191 charjump = g->charjump; 192 matchjump = g->matchjump; 193 pp = mustlast; 194 for (dp = start+g->mlen-1; dp < stop; ) { 195 /* Fast skip non-matches */ 196 while (dp < stop && charjump[(int)*dp]) 197 dp += charjump[(int)*dp]; 198 199 if (dp >= stop) 200 break; 201 202 /* Greedy matcher */ 203 /* 204 * We depend on not being used for 205 * for strings of length 1 206 */ 207 while (*--dp == *--pp && pp != mustfirst) 208 ; 209 210 if (*dp == *pp) 211 break; 212 213 /* Jump to next possible match */ 214 mj = matchjump[pp - mustfirst]; 215 cj = charjump[(int)*dp]; 216 dp += (cj < mj ? mj : cj); 217 pp = mustlast; 218 } 219 if (pp != mustfirst) 220 return (REG_NOMATCH); 221 } else { 222 for (dp = start; dp < stop; dp++) 223 if (*dp == g->must[0] && 224 stop - dp >= g->mlen && 225 memcmp(dp, g->must, (size_t)g->mlen) == 0) 226 break; 227 if (dp == stop) /* we didn't find g->must */ 228 return (REG_NOMATCH); 229 } 230 } 231 232 /* match struct setup */ 233 m->g = g; 234 m->eflags = eflags; 235 m->pmatch = NULL; 236 m->lastpos = NULL; 237 m->offp = string; 238 m->beginp = start; 239 m->endp = stop; 240 STATESETUP(m, 4); 241 SETUP(m->st); 242 SETUP(m->fresh); 243 SETUP(m->tmp); 244 SETUP(m->empty); 245 CLEAR(m->empty); 246 ZAPSTATE(&m->mbs); 247 248 /* Adjust start according to moffset, to speed things up */ 249 if (dp != NULL && g->moffset > -1) 250 start = ((dp - g->moffset) < start) ? start : dp - g->moffset; 251 252 SP("mloop", m->st, *start); 253 254 /* this loop does only one repetition except for backrefs */ 255 for (;;) { 256 endp = fast(m, start, stop, gf, gl); 257 if (endp == NULL) { /* a miss */ 258 if (m->pmatch != NULL) 259 free((char *)m->pmatch); 260 if (m->lastpos != NULL) 261 free((char *)m->lastpos); 262 STATETEARDOWN(m); 263 return (REG_NOMATCH); 264 } 265 if (nmatch == 0 && !g->backrefs) 266 break; /* no further info needed */ 267 268 /* where? */ 269 assert(m->coldp != NULL); 270 for (;;) { 271 NOTE("finding start"); 272 endp = slow(m, m->coldp, stop, gf, gl); 273 if (endp != NULL) 274 break; 275 assert(m->coldp < m->endp); 276 m->coldp += XMBRTOWC(NULL, m->coldp, 277 m->endp - m->coldp, &m->mbs, 0); 278 } 279 if (nmatch == 1 && !g->backrefs) 280 break; /* no further info needed */ 281 282 /* oh my, it wants the subexpressions... */ 283 if (m->pmatch == NULL) 284 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 285 sizeof (regmatch_t)); 286 if (m->pmatch == NULL) { 287 STATETEARDOWN(m); 288 return (REG_ESPACE); 289 } 290 for (i = 1; i <= m->g->nsub; i++) 291 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 292 /* NB: FreeBSD has REG_BACKR, we do not */ 293 if (!g->backrefs /* && !(m->eflags®_BACKR) */) { 294 NOTE("dissecting"); 295 dp = dissect(m, m->coldp, endp, gf, gl); 296 } else { 297 if (g->nplus > 0 && m->lastpos == NULL) 298 m->lastpos = malloc((g->nplus+1) * 299 sizeof (const char *)); 300 if (g->nplus > 0 && m->lastpos == NULL) { 301 free(m->pmatch); 302 STATETEARDOWN(m); 303 return (REG_ESPACE); 304 } 305 NOTE("backref dissect"); 306 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 307 } 308 if (dp != NULL) 309 break; 310 311 /* uh-oh... we couldn't find a subexpression-level match */ 312 assert(g->backrefs); /* must be back references doing it */ 313 assert(g->nplus == 0 || m->lastpos != NULL); 314 for (;;) { 315 if (dp != NULL || endp <= m->coldp) 316 break; /* defeat */ 317 NOTE("backoff"); 318 endp = slow(m, m->coldp, endp-1, gf, gl); 319 if (endp == NULL) 320 break; /* defeat */ 321 /* try it on a shorter possibility */ 322 #ifndef NDEBUG 323 for (i = 1; i <= m->g->nsub; i++) { 324 assert(m->pmatch[i].rm_so == -1); 325 assert(m->pmatch[i].rm_eo == -1); 326 } 327 #endif 328 NOTE("backoff dissect"); 329 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 330 } 331 assert(dp == NULL || dp == endp); 332 if (dp != NULL) /* found a shorter one */ 333 break; 334 335 /* despite initial appearances, there is no match here */ 336 NOTE("false alarm"); 337 /* recycle starting later */ 338 start = m->coldp + XMBRTOWC(NULL, m->coldp, 339 stop - m->coldp, &m->mbs, 0); 340 assert(start <= stop); 341 } 342 343 /* fill in the details if requested */ 344 if (nmatch > 0) { 345 pmatch[0].rm_so = m->coldp - m->offp; 346 pmatch[0].rm_eo = endp - m->offp; 347 } 348 if (nmatch > 1) { 349 assert(m->pmatch != NULL); 350 for (i = 1; i < nmatch; i++) 351 if (i <= m->g->nsub) 352 pmatch[i] = m->pmatch[i]; 353 else { 354 pmatch[i].rm_so = -1; 355 pmatch[i].rm_eo = -1; 356 } 357 } 358 359 if (m->pmatch != NULL) 360 free((char *)m->pmatch); 361 if (m->lastpos != NULL) 362 free((char *)m->lastpos); 363 STATETEARDOWN(m); 364 return (0); 365 } 366 367 /* 368 * dissect - figure out what matched what, no back references 369 */ 370 static const char * 371 dissect(struct match *m, const char *start, const char *stop, sopno startst, 372 sopno stopst) 373 { 374 int i; 375 sopno ss; /* start sop of current subRE */ 376 sopno es; /* end sop of current subRE */ 377 const char *sp; /* start of string matched by it */ 378 const char *stp; /* string matched by it cannot pass here */ 379 const char *rest; /* start of rest of string */ 380 const char *tail; /* string unmatched by rest of RE */ 381 sopno ssub; /* start sop of subsubRE */ 382 sopno esub; /* end sop of subsubRE */ 383 const char *ssp; /* start of string matched by subsubRE */ 384 const char *sep; /* end of string matched by subsubRE */ 385 const char *oldssp; /* previous ssp */ 386 const char *dp; 387 388 AT("diss", start, stop, startst, stopst); 389 sp = start; 390 for (ss = startst; ss < stopst; ss = es) { 391 /* identify end of subRE */ 392 es = ss; 393 switch (OP(m->g->strip[es])) { 394 case OPLUS_: 395 case OQUEST_: 396 es += OPND(m->g->strip[es]); 397 break; 398 case OCH_: 399 while (OP(m->g->strip[es]) != O_CH) 400 es += OPND(m->g->strip[es]); 401 break; 402 } 403 es++; 404 405 /* figure out what it matched */ 406 switch (OP(m->g->strip[ss])) { 407 case OEND: 408 assert(0); 409 break; 410 case OCHAR: 411 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 412 break; 413 case OBOL: 414 case OEOL: 415 case OBOW: 416 case OEOW: 417 break; 418 case OANY: 419 case OANYOF: 420 sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); 421 break; 422 case OBACK_: 423 case O_BACK: 424 assert(0); 425 break; 426 /* cases where length of match is hard to find */ 427 case OQUEST_: 428 stp = stop; 429 for (;;) { 430 /* how long could this one be? */ 431 rest = slow(m, sp, stp, ss, es); 432 assert(rest != NULL); /* it did match */ 433 /* could the rest match the rest? */ 434 tail = slow(m, rest, stop, es, stopst); 435 if (tail == stop) 436 break; /* yes! */ 437 /* no -- try a shorter match for this one */ 438 stp = rest - 1; 439 assert(stp >= sp); /* it did work */ 440 } 441 ssub = ss + 1; 442 esub = es - 1; 443 /* did innards match? */ 444 if (slow(m, sp, rest, ssub, esub) != NULL) { 445 dp = dissect(m, sp, rest, ssub, esub); 446 assert(dp == rest); 447 #if defined(__lint) 448 (void) dp; 449 #endif 450 } else /* no */ 451 assert(sp == rest); 452 sp = rest; 453 break; 454 case OPLUS_: 455 stp = stop; 456 for (;;) { 457 /* how long could this one be? */ 458 rest = slow(m, sp, stp, ss, es); 459 assert(rest != NULL); /* it did match */ 460 /* could the rest match the rest? */ 461 tail = slow(m, rest, stop, es, stopst); 462 if (tail == stop) 463 break; /* yes! */ 464 /* no -- try a shorter match for this one */ 465 stp = rest - 1; 466 assert(stp >= sp); /* it did work */ 467 } 468 ssub = ss + 1; 469 esub = es - 1; 470 ssp = sp; 471 oldssp = ssp; 472 for (;;) { /* find last match of innards */ 473 sep = slow(m, ssp, rest, ssub, esub); 474 if (sep == NULL || sep == ssp) 475 break; /* failed or matched null */ 476 oldssp = ssp; /* on to next try */ 477 ssp = sep; 478 } 479 if (sep == NULL) { 480 /* last successful match */ 481 sep = ssp; 482 ssp = oldssp; 483 } 484 assert(sep == rest); /* must exhaust substring */ 485 assert(slow(m, ssp, sep, ssub, esub) == rest); 486 dp = dissect(m, ssp, sep, ssub, esub); 487 assert(dp == sep); 488 sp = rest; 489 break; 490 case OCH_: 491 stp = stop; 492 for (;;) { 493 /* how long could this one be? */ 494 rest = slow(m, sp, stp, ss, es); 495 assert(rest != NULL); /* it did match */ 496 /* could the rest match the rest? */ 497 tail = slow(m, rest, stop, es, stopst); 498 if (tail == stop) 499 break; /* yes! */ 500 /* no -- try a shorter match for this one */ 501 stp = rest - 1; 502 assert(stp >= sp); /* it did work */ 503 } 504 ssub = ss + 1; 505 esub = ss + OPND(m->g->strip[ss]) - 1; 506 assert(OP(m->g->strip[esub]) == OOR1); 507 for (;;) { /* find first matching branch */ 508 if (slow(m, sp, rest, ssub, esub) == rest) 509 break; /* it matched all of it */ 510 /* that one missed, try next one */ 511 assert(OP(m->g->strip[esub]) == OOR1); 512 esub++; 513 assert(OP(m->g->strip[esub]) == OOR2); 514 ssub = esub + 1; 515 esub += OPND(m->g->strip[esub]); 516 if (OP(m->g->strip[esub]) == OOR2) 517 esub--; 518 else 519 assert(OP(m->g->strip[esub]) == O_CH); 520 } 521 dp = dissect(m, sp, rest, ssub, esub); 522 assert(dp == rest); 523 sp = rest; 524 break; 525 case O_PLUS: 526 case O_QUEST: 527 case OOR1: 528 case OOR2: 529 case O_CH: 530 assert(0); 531 break; 532 case OLPAREN: 533 i = OPND(m->g->strip[ss]); 534 assert(0 < i && i <= m->g->nsub); 535 m->pmatch[i].rm_so = sp - m->offp; 536 break; 537 case ORPAREN: 538 i = OPND(m->g->strip[ss]); 539 assert(0 < i && i <= m->g->nsub); 540 m->pmatch[i].rm_eo = sp - m->offp; 541 break; 542 default: /* uh oh */ 543 assert(0); 544 break; 545 } 546 } 547 548 assert(sp == stop); 549 return (sp); 550 } 551 552 /* 553 * backref - figure out what matched what, figuring in back references 554 */ 555 static const char * 556 backref(struct match *m, const char *start, const char *stop, sopno startst, 557 sopno stopst, sopno lev, /* PLUS nesting level */ 558 int rec) 559 { 560 int i; 561 sopno ss; /* start sop of current subRE */ 562 const char *sp; /* start of string matched by it */ 563 sopno ssub; /* start sop of subsubRE */ 564 sopno esub; /* end sop of subsubRE */ 565 const char *ssp; /* start of string matched by subsubRE */ 566 const char *dp; 567 size_t len; 568 int hard; 569 sop s; 570 regoff_t offsave; 571 cset *cs; 572 wint_t wc; 573 574 AT("back", start, stop, startst, stopst); 575 sp = start; 576 577 /* get as far as we can with easy stuff */ 578 hard = 0; 579 for (ss = startst; !hard && ss < stopst; ss++) 580 switch (OP(s = m->g->strip[ss])) { 581 case OCHAR: 582 if (sp == stop) 583 return (NULL); 584 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 585 if (wc != OPND(s)) 586 return (NULL); 587 break; 588 case OANY: 589 if (sp == stop) 590 return (NULL); 591 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 592 if (wc == BADCHAR) 593 return (NULL); 594 break; 595 case OANYOF: 596 if (sp == stop) 597 return (NULL); 598 cs = &m->g->sets[OPND(s)]; 599 sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); 600 if (wc == BADCHAR || !CHIN(cs, wc)) 601 return (NULL); 602 break; 603 case OBOL: 604 if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 605 (sp > m->offp && sp < m->endp && 606 *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) { 607 break; 608 } 609 return (NULL); 610 case OEOL: 611 if ((sp == m->endp && !(m->eflags®_NOTEOL)) || 612 (sp < m->endp && *sp == '\n' && 613 (m->g->cflags®_NEWLINE))) { 614 break; 615 } 616 return (NULL); 617 case OBOW: 618 if (sp < m->endp && ISWORD(*sp) && 619 ((sp == m->beginp && !(m->eflags®_NOTBOL)) || 620 (sp > m->offp && !ISWORD(*(sp-1))))) { 621 break; 622 } 623 return (NULL); 624 case OEOW: 625 if (((sp == m->endp && !(m->eflags®_NOTEOL)) || 626 (sp < m->endp && *sp == '\n' && 627 (m->g->cflags®_NEWLINE)) || 628 (sp < m->endp && !ISWORD(*sp))) && 629 (sp > m->beginp && ISWORD(*(sp-1)))) { 630 break; 631 } 632 return (NULL); 633 case O_QUEST: 634 break; 635 case OOR1: /* matches null but needs to skip */ 636 ss++; 637 s = m->g->strip[ss]; 638 do { 639 assert(OP(s) == OOR2); 640 ss += OPND(s); 641 } while (OP(s = m->g->strip[ss]) != O_CH); 642 /* note that the ss++ gets us past the O_CH */ 643 break; 644 default: /* have to make a choice */ 645 hard = 1; 646 break; 647 } 648 if (!hard) { /* that was it! */ 649 if (sp != stop) 650 return (NULL); 651 return (sp); 652 } 653 ss--; /* adjust for the for's final increment */ 654 655 /* the hard stuff */ 656 AT("hard", sp, stop, ss, stopst); 657 s = m->g->strip[ss]; 658 switch (OP(s)) { 659 case OBACK_: /* the vilest depths */ 660 i = OPND(s); 661 assert(0 < i && i <= m->g->nsub); 662 if (m->pmatch[i].rm_eo == -1) 663 return (NULL); 664 assert(m->pmatch[i].rm_so != -1); 665 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 666 if (len == 0 && rec++ > MAX_RECURSION) 667 return (NULL); 668 assert(stop - m->beginp >= len); 669 if (sp > stop - len) 670 return (NULL); /* not enough left to match */ 671 ssp = m->offp + m->pmatch[i].rm_so; 672 if (memcmp(sp, ssp, len) != 0) 673 return (NULL); 674 while (m->g->strip[ss] != SOP(O_BACK, i)) 675 ss++; 676 return (backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 677 case OQUEST_: /* to null or not */ 678 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 679 if (dp != NULL) 680 return (dp); /* not */ 681 return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 682 case OPLUS_: 683 assert(m->lastpos != NULL); 684 assert(lev+1 <= m->g->nplus); 685 m->lastpos[lev+1] = sp; 686 return (backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 687 case O_PLUS: 688 if (sp == m->lastpos[lev]) /* last pass matched null */ 689 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 690 /* try another pass */ 691 m->lastpos[lev] = sp; 692 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 693 if (dp == NULL) 694 return (backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 695 return (dp); 696 case OCH_: /* find the right one, if any */ 697 ssub = ss + 1; 698 esub = ss + OPND(s) - 1; 699 assert(OP(m->g->strip[esub]) == OOR1); 700 for (;;) { /* find first matching branch */ 701 dp = backref(m, sp, stop, ssub, esub, lev, rec); 702 if (dp != NULL) 703 return (dp); 704 /* that one missed, try next one */ 705 if (OP(m->g->strip[esub]) == O_CH) 706 return (NULL); /* there is none */ 707 esub++; 708 assert(OP(m->g->strip[esub]) == OOR2); 709 ssub = esub + 1; 710 esub += OPND(m->g->strip[esub]); 711 if (OP(m->g->strip[esub]) == OOR2) 712 esub--; 713 else 714 assert(OP(m->g->strip[esub]) == O_CH); 715 } 716 /* NOTREACHED */ 717 break; 718 case OLPAREN: /* must undo assignment if rest fails */ 719 i = OPND(s); 720 assert(0 < i && i <= m->g->nsub); 721 offsave = m->pmatch[i].rm_so; 722 m->pmatch[i].rm_so = sp - m->offp; 723 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 724 if (dp != NULL) 725 return (dp); 726 m->pmatch[i].rm_so = offsave; 727 return (NULL); 728 case ORPAREN: /* must undo assignment if rest fails */ 729 i = OPND(s); 730 assert(0 < i && i <= m->g->nsub); 731 offsave = m->pmatch[i].rm_eo; 732 m->pmatch[i].rm_eo = sp - m->offp; 733 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 734 if (dp != NULL) 735 return (dp); 736 m->pmatch[i].rm_eo = offsave; 737 return (NULL); 738 default: /* uh oh */ 739 assert(0); 740 break; 741 } 742 743 /* "can't happen" */ 744 assert(0); 745 return (NULL); 746 } 747 748 /* 749 * fast - step through the string at top speed 750 */ 751 static const char * 752 fast(struct match *m, const char *start, const char *stop, sopno startst, 753 sopno stopst) 754 { 755 states st = m->st; 756 states fresh = m->fresh; 757 states tmp = m->tmp; 758 const char *p = start; 759 wint_t c; 760 wint_t lastc; /* previous c */ 761 wint_t flagch; 762 int i; 763 const char *coldp; /* last p after which no match was underway */ 764 size_t clen; 765 766 CLEAR(st); 767 SET1(st, startst); 768 SP("fast", st, *p); 769 st = step(m->g, startst, stopst, st, NOTHING, st); 770 ASSIGN(fresh, st); 771 SP("start", st, *p); 772 coldp = NULL; 773 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 774 c = OUT; 775 else { 776 /* 777 * XXX Wrong if the previous character was multi-byte. 778 * Newline never is (in encodings supported by FreeBSD), 779 * so this only breaks the ISWORD tests below. 780 */ 781 c = (uch)*(start - 1); 782 } 783 for (;;) { 784 /* next character */ 785 lastc = c; 786 if (p == m->endp) { 787 clen = 0; 788 c = OUT; 789 } else 790 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 791 if (EQ(st, fresh)) 792 coldp = p; 793 794 /* is there an EOL and/or BOL between lastc and c? */ 795 flagch = '\0'; 796 i = 0; 797 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 798 (lastc == OUT && !(m->eflags®_NOTBOL))) { 799 flagch = BOL; 800 i = m->g->nbol; 801 } 802 if ((c == '\n' && m->g->cflags®_NEWLINE) || 803 (c == OUT && !(m->eflags®_NOTEOL))) { 804 flagch = (flagch == BOL) ? BOLEOL : EOL; 805 i += m->g->neol; 806 } 807 if (i != 0) { 808 for (; i > 0; i--) 809 st = step(m->g, startst, stopst, st, 810 flagch, st); 811 SP("boleol", st, c); 812 } 813 814 /* how about a word boundary? */ 815 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 816 (c != OUT && ISWORD(c))) { 817 flagch = BOW; 818 } 819 if ((lastc != OUT && ISWORD(lastc)) && 820 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 821 flagch = EOW; 822 } 823 if (flagch == BOW || flagch == EOW) { 824 st = step(m->g, startst, stopst, st, flagch, st); 825 SP("boweow", st, c); 826 } 827 828 /* are we done? */ 829 if (ISSET(st, stopst) || p == stop || clen > stop - p) 830 break; /* NOTE BREAK OUT */ 831 832 /* no, we must deal with this character */ 833 ASSIGN(tmp, st); 834 ASSIGN(st, fresh); 835 assert(c != OUT); 836 st = step(m->g, startst, stopst, tmp, c, st); 837 SP("aft", st, c); 838 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 839 p += clen; 840 } 841 842 assert(coldp != NULL); 843 m->coldp = coldp; 844 if (ISSET(st, stopst)) 845 return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); 846 else 847 return (NULL); 848 } 849 850 /* 851 * slow - step through the string more deliberately 852 */ 853 static const char * 854 slow(struct match *m, const char *start, const char *stop, sopno startst, 855 sopno stopst) 856 { 857 states st = m->st; 858 states empty = m->empty; 859 states tmp = m->tmp; 860 const char *p = start; 861 wint_t c; 862 wint_t lastc; /* previous c */ 863 wint_t flagch; 864 int i; 865 const char *matchp; /* last p at which a match ended */ 866 size_t clen; 867 868 AT("slow", start, stop, startst, stopst); 869 CLEAR(st); 870 SET1(st, startst); 871 SP("sstart", st, *p); 872 st = step(m->g, startst, stopst, st, NOTHING, st); 873 matchp = NULL; 874 if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) 875 c = OUT; 876 else { 877 /* 878 * XXX Wrong if the previous character was multi-byte. 879 * Newline never is (in encodings supported by FreeBSD), 880 * so this only breaks the ISWORD tests below. 881 */ 882 c = (uch)*(start - 1); 883 } 884 for (;;) { 885 /* next character */ 886 lastc = c; 887 if (p == m->endp) { 888 c = OUT; 889 clen = 0; 890 } else 891 clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); 892 893 /* is there an EOL and/or BOL between lastc and c? */ 894 flagch = '\0'; 895 i = 0; 896 if ((lastc == '\n' && m->g->cflags®_NEWLINE) || 897 (lastc == OUT && !(m->eflags®_NOTBOL))) { 898 flagch = BOL; 899 i = m->g->nbol; 900 } 901 if ((c == '\n' && m->g->cflags®_NEWLINE) || 902 (c == OUT && !(m->eflags®_NOTEOL))) { 903 flagch = (flagch == BOL) ? BOLEOL : EOL; 904 i += m->g->neol; 905 } 906 if (i != 0) { 907 for (; i > 0; i--) 908 st = step(m->g, startst, stopst, st, 909 flagch, st); 910 SP("sboleol", st, c); 911 } 912 913 /* how about a word boundary? */ 914 if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 915 (c != OUT && ISWORD(c))) { 916 flagch = BOW; 917 } 918 if ((lastc != OUT && ISWORD(lastc)) && 919 (flagch == EOL || (c != OUT && !ISWORD(c)))) { 920 flagch = EOW; 921 } 922 if (flagch == BOW || flagch == EOW) { 923 st = step(m->g, startst, stopst, st, flagch, st); 924 SP("sboweow", st, c); 925 } 926 927 /* are we done? */ 928 if (ISSET(st, stopst)) 929 matchp = p; 930 if (EQ(st, empty) || p == stop || clen > stop - p) 931 break; /* NOTE BREAK OUT */ 932 933 /* no, we must deal with this character */ 934 ASSIGN(tmp, st); 935 ASSIGN(st, empty); 936 assert(c != OUT); 937 st = step(m->g, startst, stopst, tmp, c, st); 938 SP("saft", st, c); 939 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 940 p += clen; 941 } 942 943 return (matchp); 944 } 945 946 947 /* 948 * step - map set of states reachable before char to set reachable after 949 */ 950 static states 951 step(struct re_guts *g, 952 sopno start, /* start state within strip */ 953 sopno stop, /* state after stop state within strip */ 954 states bef, /* states reachable before */ 955 wint_t ch, /* character or NONCHAR code */ 956 states aft) /* states already known reachable after */ 957 { 958 cset *cs; 959 sop s; 960 sopno pc; 961 onestate here; /* note, macros know this name */ 962 sopno look; 963 int i; 964 965 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 966 s = g->strip[pc]; 967 switch (OP(s)) { 968 case OEND: 969 assert(pc == stop-1); 970 break; 971 case OCHAR: 972 /* only characters can match */ 973 assert(!NONCHAR(ch) || ch != OPND(s)); 974 if (ch == OPND(s)) 975 FWD(aft, bef, 1); 976 break; 977 case OBOL: 978 if (ch == BOL || ch == BOLEOL) 979 FWD(aft, bef, 1); 980 break; 981 case OEOL: 982 if (ch == EOL || ch == BOLEOL) 983 FWD(aft, bef, 1); 984 break; 985 case OBOW: 986 if (ch == BOW) 987 FWD(aft, bef, 1); 988 break; 989 case OEOW: 990 if (ch == EOW) 991 FWD(aft, bef, 1); 992 break; 993 case OANY: 994 if (!NONCHAR(ch)) 995 FWD(aft, bef, 1); 996 break; 997 case OANYOF: 998 cs = &g->sets[OPND(s)]; 999 if (!NONCHAR(ch) && CHIN(cs, ch)) 1000 FWD(aft, bef, 1); 1001 break; 1002 case OBACK_: /* ignored here */ 1003 case O_BACK: 1004 FWD(aft, aft, 1); 1005 break; 1006 case OPLUS_: /* forward, this is just an empty */ 1007 FWD(aft, aft, 1); 1008 break; 1009 case O_PLUS: /* both forward and back */ 1010 FWD(aft, aft, 1); 1011 i = ISSETBACK(aft, OPND(s)); 1012 BACK(aft, aft, OPND(s)); 1013 if (!i && ISSETBACK(aft, OPND(s))) { 1014 /* oho, must reconsider loop body */ 1015 pc -= OPND(s) + 1; 1016 INIT(here, pc); 1017 } 1018 break; 1019 case OQUEST_: /* two branches, both forward */ 1020 FWD(aft, aft, 1); 1021 FWD(aft, aft, OPND(s)); 1022 break; 1023 case O_QUEST: /* just an empty */ 1024 FWD(aft, aft, 1); 1025 break; 1026 case OLPAREN: /* not significant here */ 1027 case ORPAREN: 1028 FWD(aft, aft, 1); 1029 break; 1030 case OCH_: /* mark the first two branches */ 1031 FWD(aft, aft, 1); 1032 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1033 FWD(aft, aft, OPND(s)); 1034 break; 1035 case OOR1: /* done a branch, find the O_CH */ 1036 if (ISSTATEIN(aft, here)) { 1037 for (look = 1; 1038 OP(s = g->strip[pc+look]) != O_CH; 1039 look += OPND(s)) 1040 assert(OP(s) == OOR2); 1041 FWD(aft, aft, look + 1); 1042 } 1043 break; 1044 case OOR2: /* propagate OCH_'s marking */ 1045 FWD(aft, aft, 1); 1046 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1047 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1048 FWD(aft, aft, OPND(s)); 1049 } 1050 break; 1051 case O_CH: /* just empty */ 1052 FWD(aft, aft, 1); 1053 break; 1054 default: /* ooooops... */ 1055 assert(0); 1056 break; 1057 } 1058 } 1059 1060 return (aft); 1061 } 1062 1063 #ifdef REDEBUG 1064 /* 1065 * print - print a set of states 1066 */ 1067 static void 1068 print(struct match *m, const char *caption, states st, int ch, FILE *d) 1069 { 1070 struct re_guts *g = m->g; 1071 sopno i; 1072 int first = 1; 1073 1074 if (!(m->eflags®_TRACE)) 1075 return; 1076 1077 (void) fprintf(d, "%s", caption); 1078 if (ch != '\0') 1079 (void) fprintf(d, " %s", pchar(ch)); 1080 for (i = 0; i < g->nstates; i++) 1081 if (ISSET(st, i)) { 1082 (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1083 first = 0; 1084 } 1085 (void) fprintf(d, "\n"); 1086 } 1087 1088 /* 1089 * at - print current situation 1090 */ 1091 static void 1092 at(struct match *m, const char *title, const char *start, const char *stop, 1093 sopno startst, sopno stopst) 1094 { 1095 if (!(m->eflags®_TRACE)) 1096 return; 1097 1098 (void) printf("%s %s-", title, pchar(*start)); 1099 (void) printf("%s ", pchar(*stop)); 1100 (void) printf("%ld-%ld\n", (long)startst, (long)stopst); 1101 } 1102 1103 #ifndef PCHARDONE 1104 #define PCHARDONE /* never again */ 1105 /* 1106 * pchar - make a character printable 1107 * 1108 * Is this identical to regchar() over in debug.c? Well, yes. But a 1109 * duplicate here avoids having a debugging-capable regexec.o tied to 1110 * a matching debug.o, and this is convenient. It all disappears in 1111 * the non-debug compilation anyway, so it doesn't matter much. 1112 */ 1113 static const char * 1114 pchar(int ch) 1115 { 1116 static char pbuf[10]; 1117 1118 if (isprint((uch)ch) || ch == ' ') 1119 (void) sprintf(pbuf, "%c", ch); 1120 else 1121 (void) sprintf(pbuf, "\\%o", ch); 1122 return (pbuf); 1123 } 1124 #endif 1125 #endif 1126 1127 #undef matcher 1128 #undef fast 1129 #undef slow 1130 #undef dissect 1131 #undef backref 1132 #undef step 1133 #undef print 1134 #undef at 1135 #undef match 1136