LLVM  mainline
regcomp.c
Go to the documentation of this file.
00001 /*-
00002  * This code is derived from OpenBSD's libc/regex, original license follows:
00003  *
00004  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
00005  * Copyright (c) 1992, 1993, 1994
00006  *  The Regents of the University of California.  All rights reserved.
00007  *
00008  * This code is derived from software contributed to Berkeley by
00009  * Henry Spencer.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted provided that the following conditions
00013  * are met:
00014  * 1. Redistributions of source code must retain the above copyright
00015  *    notice, this list of conditions and the following disclaimer.
00016  * 2. Redistributions in binary form must reproduce the above copyright
00017  *    notice, this list of conditions and the following disclaimer in the
00018  *    documentation and/or other materials provided with the distribution.
00019  * 3. Neither the name of the University nor the names of its contributors
00020  *    may be used to endorse or promote products derived from this software
00021  *    without specific prior written permission.
00022  *
00023  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00024  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00025  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00026  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00027  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00028  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00029  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00030  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00032  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00033  * SUCH DAMAGE.
00034  *
00035  *  @(#)regcomp.c 8.5 (Berkeley) 3/20/94
00036  */
00037 
00038 #include <sys/types.h>
00039 #include <stdio.h>
00040 #include <string.h>
00041 #include <ctype.h>
00042 #include <limits.h>
00043 #include <stdlib.h>
00044 #include "regex_impl.h"
00045 
00046 #include "regutils.h"
00047 #include "regex2.h"
00048 
00049 #include "regcclass.h"
00050 #include "regcname.h"
00051 
00052 #include "llvm/Config/config.h"
00053 #if HAVE_STDINT_H
00054 #include <stdint.h>
00055 #else
00056 /* Pessimistically bound memory use */
00057 #define SIZE_MAX UINT_MAX
00058 #endif
00059 
00060 /*
00061  * parse structure, passed up and down to avoid global variables and
00062  * other clumsinesses
00063  */
00064 struct parse {
00065   char *next;   /* next character in RE */
00066   char *end;    /* end of string (-> NUL normally) */
00067   int error;    /* has an error been seen? */
00068   sop *strip;   /* malloced strip */
00069   sopno ssize;    /* malloced strip size (allocated) */
00070   sopno slen;   /* malloced strip length (used) */
00071   int ncsalloc;   /* number of csets allocated */
00072   struct re_guts *g;
00073 # define  NPAREN  10  /* we need to remember () 1-9 for back refs */
00074   sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
00075   sopno pend[NPAREN]; /* -> ) ([0] unused) */
00076 };
00077 
00078 static void p_ere(struct parse *, int);
00079 static void p_ere_exp(struct parse *);
00080 static void p_str(struct parse *);
00081 static void p_bre(struct parse *, int, int);
00082 static int p_simp_re(struct parse *, int);
00083 static int p_count(struct parse *);
00084 static void p_bracket(struct parse *);
00085 static void p_b_term(struct parse *, cset *);
00086 static void p_b_cclass(struct parse *, cset *);
00087 static void p_b_eclass(struct parse *, cset *);
00088 static char p_b_symbol(struct parse *);
00089 static char p_b_coll_elem(struct parse *, int);
00090 static char othercase(int);
00091 static void bothcases(struct parse *, int);
00092 static void ordinary(struct parse *, int);
00093 static void nonnewline(struct parse *);
00094 static void repeat(struct parse *, sopno, int, int);
00095 static int seterr(struct parse *, int);
00096 static cset *allocset(struct parse *);
00097 static void freeset(struct parse *, cset *);
00098 static int freezeset(struct parse *, cset *);
00099 static int firstch(struct parse *, cset *);
00100 static int nch(struct parse *, cset *);
00101 static void mcadd(struct parse *, cset *, const char *);
00102 static void mcinvert(struct parse *, cset *);
00103 static void mccase(struct parse *, cset *);
00104 static int isinsets(struct re_guts *, int);
00105 static int samesets(struct re_guts *, int, int);
00106 static void categorize(struct parse *, struct re_guts *);
00107 static sopno dupl(struct parse *, sopno, sopno);
00108 static void doemit(struct parse *, sop, size_t);
00109 static void doinsert(struct parse *, sop, size_t, sopno);
00110 static void dofwd(struct parse *, sopno, sop);
00111 static void enlarge(struct parse *, sopno);
00112 static void stripsnug(struct parse *, struct re_guts *);
00113 static void findmust(struct parse *, struct re_guts *);
00114 static sopno pluscount(struct parse *, struct re_guts *);
00115 
00116 static char nuls[10];   /* place to point scanner in event of error */
00117 
00118 /*
00119  * macros for use with parse structure
00120  * BEWARE:  these know that the parse structure is named `p' !!!
00121  */
00122 #define PEEK()  (*p->next)
00123 #define PEEK2() (*(p->next+1))
00124 #define MORE()  (p->next < p->end)
00125 #define MORE2() (p->next+1 < p->end)
00126 #define SEE(c)  (MORE() && PEEK() == (c))
00127 #define SEETWO(a, b)  (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
00128 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
00129 #define EATTWO(a, b)  ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
00130 #define NEXT()  (p->next++)
00131 #define NEXT2() (p->next += 2)
00132 #define NEXTn(n)  (p->next += (n))
00133 #define GETNEXT() (*p->next++)
00134 #define SETERROR(e) seterr(p, (e))
00135 #define REQUIRE(co, e)  (void)((co) || SETERROR(e))
00136 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
00137 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
00138 #define MUSTNOTSEE(c, e)  (REQUIRE(!MORE() || PEEK() != (c), e))
00139 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
00140 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
00141 #define AHEAD(pos)    dofwd(p, pos, HERE()-(pos))
00142 #define ASTERN(sop, pos)  EMIT(sop, HERE()-pos)
00143 #define HERE()    (p->slen)
00144 #define THERE()   (p->slen - 1)
00145 #define THERETHERE()  (p->slen - 2)
00146 #define DROP(n) (p->slen -= (n))
00147 
00148 #ifdef  _POSIX2_RE_DUP_MAX
00149 #define DUPMAX  _POSIX2_RE_DUP_MAX
00150 #else
00151 #define DUPMAX  255
00152 #endif
00153 #define INFINITY  (DUPMAX + 1)
00154 
00155 #ifndef NDEBUG
00156 static int never = 0;   /* for use in asserts; shuts lint up */
00157 #else
00158 #define never 0   /* some <assert.h>s have bugs too */
00159 #endif
00160 
00161 /*
00162  - llvm_regcomp - interface for parser and compilation
00163  */
00164 int       /* 0 success, otherwise REG_something */
00165 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
00166 {
00167   struct parse pa;
00168   struct re_guts *g;
00169   struct parse *p = &pa;
00170   int i;
00171   size_t len;
00172 #ifdef REDEBUG
00173 # define  GOODFLAGS(f)  (f)
00174 #else
00175 # define  GOODFLAGS(f)  ((f)&~REG_DUMP)
00176 #endif
00177 
00178   cflags = GOODFLAGS(cflags);
00179   if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
00180     return(REG_INVARG);
00181 
00182   if (cflags&REG_PEND) {
00183     if (preg->re_endp < pattern)
00184       return(REG_INVARG);
00185     len = preg->re_endp - pattern;
00186   } else
00187     len = strlen((const char *)pattern);
00188 
00189   /* do the mallocs early so failure handling is easy */
00190   g = (struct re_guts *)malloc(sizeof(struct re_guts) +
00191               (NC-1)*sizeof(cat_t));
00192   if (g == NULL)
00193     return(REG_ESPACE);
00194   p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
00195   p->strip = (sop *)calloc(p->ssize, sizeof(sop));
00196   p->slen = 0;
00197   if (p->strip == NULL) {
00198     free((char *)g);
00199     return(REG_ESPACE);
00200   }
00201 
00202   /* set things up */
00203   p->g = g;
00204   p->next = (char *)pattern;  /* convenience; we do not modify it */
00205   p->end = p->next + len;
00206   p->error = 0;
00207   p->ncsalloc = 0;
00208   for (i = 0; i < NPAREN; i++) {
00209     p->pbegin[i] = 0;
00210     p->pend[i] = 0;
00211   }
00212   g->csetsize = NC;
00213   g->sets = NULL;
00214   g->setbits = NULL;
00215   g->ncsets = 0;
00216   g->cflags = cflags;
00217   g->iflags = 0;
00218   g->nbol = 0;
00219   g->neol = 0;
00220   g->must = NULL;
00221   g->mlen = 0;
00222   g->nsub = 0;
00223   g->ncategories = 1; /* category 0 is "everything else" */
00224   g->categories = &g->catspace[-(CHAR_MIN)];
00225   (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
00226   g->backrefs = 0;
00227 
00228   /* do it */
00229   EMIT(OEND, 0);
00230   g->firststate = THERE();
00231   if (cflags&REG_EXTENDED)
00232     p_ere(p, OUT);
00233   else if (cflags&REG_NOSPEC)
00234     p_str(p);
00235   else
00236     p_bre(p, OUT, OUT);
00237   EMIT(OEND, 0);
00238   g->laststate = THERE();
00239 
00240   /* tidy up loose ends and fill things in */
00241   categorize(p, g);
00242   stripsnug(p, g);
00243   findmust(p, g);
00244   g->nplus = pluscount(p, g);
00245   g->magic = MAGIC2;
00246   preg->re_nsub = g->nsub;
00247   preg->re_g = g;
00248   preg->re_magic = MAGIC1;
00249 #ifndef REDEBUG
00250   /* not debugging, so can't rely on the assert() in llvm_regexec() */
00251   if (g->iflags&REGEX_BAD)
00252     SETERROR(REG_ASSERT);
00253 #endif
00254 
00255   /* win or lose, we're done */
00256   if (p->error != 0)  /* lose */
00257     llvm_regfree(preg);
00258   return(p->error);
00259 }
00260 
00261 /*
00262  - p_ere - ERE parser top level, concatenation and alternation
00263  */
00264 static void
00265 p_ere(struct parse *p, int stop)  /* character this ERE should end at */
00266 {
00267   char c;
00268   sopno prevback = 0;
00269   sopno prevfwd = 0;
00270   sopno conc;
00271   int first = 1;    /* is this the first alternative? */
00272 
00273   for (;;) {
00274     /* do a bunch of concatenated expressions */
00275     conc = HERE();
00276     while (MORE() && (c = PEEK()) != '|' && c != stop)
00277       p_ere_exp(p);
00278     REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
00279 
00280     if (!EAT('|'))
00281       break;    /* NOTE BREAK OUT */
00282 
00283     if (first) {
00284       INSERT(OCH_, conc); /* offset is wrong */
00285       prevfwd = conc;
00286       prevback = conc;
00287       first = 0;
00288     }
00289     ASTERN(OOR1, prevback);
00290     prevback = THERE();
00291     AHEAD(prevfwd);     /* fix previous offset */
00292     prevfwd = HERE();
00293     EMIT(OOR2, 0);      /* offset is very wrong */
00294   }
00295 
00296   if (!first) {   /* tail-end fixups */
00297     AHEAD(prevfwd);
00298     ASTERN(O_CH, prevback);
00299   }
00300 
00301   assert(!MORE() || SEE(stop));
00302 }
00303 
00304 /*
00305  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
00306  */
00307 static void
00308 p_ere_exp(struct parse *p)
00309 {
00310   char c;
00311   sopno pos;
00312   int count;
00313   int count2;
00314   int backrefnum;
00315   sopno subno;
00316   int wascaret = 0;
00317 
00318   assert(MORE());   /* caller should have ensured this */
00319   c = GETNEXT();
00320 
00321   pos = HERE();
00322   switch (c) {
00323   case '(':
00324     REQUIRE(MORE(), REG_EPAREN);
00325     p->g->nsub++;
00326     subno = p->g->nsub;
00327     if (subno < NPAREN)
00328       p->pbegin[subno] = HERE();
00329     EMIT(OLPAREN, subno);
00330     if (!SEE(')'))
00331       p_ere(p, ')');
00332     if (subno < NPAREN) {
00333       p->pend[subno] = HERE();
00334       assert(p->pend[subno] != 0);
00335     }
00336     EMIT(ORPAREN, subno);
00337     MUSTEAT(')', REG_EPAREN);
00338     break;
00339 #ifndef POSIX_MISTAKE
00340   case ')':   /* happens only if no current unmatched ( */
00341     /*
00342      * You may ask, why the ifndef?  Because I didn't notice
00343      * this until slightly too late for 1003.2, and none of the
00344      * other 1003.2 regular-expression reviewers noticed it at
00345      * all.  So an unmatched ) is legal POSIX, at least until
00346      * we can get it fixed.
00347      */
00348     SETERROR(REG_EPAREN);
00349     break;
00350 #endif
00351   case '^':
00352     EMIT(OBOL, 0);
00353     p->g->iflags |= USEBOL;
00354     p->g->nbol++;
00355     wascaret = 1;
00356     break;
00357   case '$':
00358     EMIT(OEOL, 0);
00359     p->g->iflags |= USEEOL;
00360     p->g->neol++;
00361     break;
00362   case '|':
00363     SETERROR(REG_EMPTY);
00364     break;
00365   case '*':
00366   case '+':
00367   case '?':
00368     SETERROR(REG_BADRPT);
00369     break;
00370   case '.':
00371     if (p->g->cflags&REG_NEWLINE)
00372       nonnewline(p);
00373     else
00374       EMIT(OANY, 0);
00375     break;
00376   case '[':
00377     p_bracket(p);
00378     break;
00379   case '\\':
00380     REQUIRE(MORE(), REG_EESCAPE);
00381     c = GETNEXT();
00382     if (c >= '1' && c <= '9') {
00383       /* \[0-9] is taken to be a back-reference to a previously specified
00384        * matching group. backrefnum will hold the number. The matching
00385        * group must exist (i.e. if \4 is found there must have been at
00386        * least 4 matching groups specified in the pattern previously).
00387        */
00388       backrefnum = c - '0';
00389       if (p->pend[backrefnum] == 0) {
00390         SETERROR(REG_ESUBREG);
00391         break;
00392       }
00393 
00394       /* Make sure everything checks out and emit the sequence
00395        * that marks a back-reference to the parse structure.
00396        */
00397       assert(backrefnum <= p->g->nsub);
00398       EMIT(OBACK_, backrefnum);
00399       assert(p->pbegin[backrefnum] != 0);
00400       assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
00401       assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
00402       (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
00403       EMIT(O_BACK, backrefnum);
00404       p->g->backrefs = 1;
00405     } else {
00406       /* Other chars are simply themselves when escaped with a backslash.
00407        */
00408       ordinary(p, c);
00409     }
00410     break;
00411   case '{':   /* okay as ordinary except if digit follows */
00412     REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
00413     /* FALLTHROUGH */
00414   default:
00415     ordinary(p, c);
00416     break;
00417   }
00418 
00419   if (!MORE())
00420     return;
00421   c = PEEK();
00422   /* we call { a repetition if followed by a digit */
00423   if (!( c == '*' || c == '+' || c == '?' ||
00424         (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
00425     return;   /* no repetition, we're done */
00426   NEXT();
00427 
00428   REQUIRE(!wascaret, REG_BADRPT);
00429   switch (c) {
00430   case '*': /* implemented as +? */
00431     /* this case does not require the (y|) trick, noKLUDGE */
00432     INSERT(OPLUS_, pos);
00433     ASTERN(O_PLUS, pos);
00434     INSERT(OQUEST_, pos);
00435     ASTERN(O_QUEST, pos);
00436     break;
00437   case '+':
00438     INSERT(OPLUS_, pos);
00439     ASTERN(O_PLUS, pos);
00440     break;
00441   case '?':
00442     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
00443     INSERT(OCH_, pos);    /* offset slightly wrong */
00444     ASTERN(OOR1, pos);    /* this one's right */
00445     AHEAD(pos);     /* fix the OCH_ */
00446     EMIT(OOR2, 0);      /* offset very wrong... */
00447     AHEAD(THERE());     /* ...so fix it */
00448     ASTERN(O_CH, THERETHERE());
00449     break;
00450   case '{':
00451     count = p_count(p);
00452     if (EAT(',')) {
00453       if (isdigit((uch)PEEK())) {
00454         count2 = p_count(p);
00455         REQUIRE(count <= count2, REG_BADBR);
00456       } else    /* single number with comma */
00457         count2 = INFINITY;
00458     } else    /* just a single number */
00459       count2 = count;
00460     repeat(p, pos, count, count2);
00461     if (!EAT('}')) {  /* error heuristics */
00462       while (MORE() && PEEK() != '}')
00463         NEXT();
00464       REQUIRE(MORE(), REG_EBRACE);
00465       SETERROR(REG_BADBR);
00466     }
00467     break;
00468   }
00469 
00470   if (!MORE())
00471     return;
00472   c = PEEK();
00473   if (!( c == '*' || c == '+' || c == '?' ||
00474         (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
00475     return;
00476   SETERROR(REG_BADRPT);
00477 }
00478 
00479 /*
00480  - p_str - string (no metacharacters) "parser"
00481  */
00482 static void
00483 p_str(struct parse *p)
00484 {
00485   REQUIRE(MORE(), REG_EMPTY);
00486   while (MORE())
00487     ordinary(p, GETNEXT());
00488 }
00489 
00490 /*
00491  - p_bre - BRE parser top level, anchoring and concatenation
00492  * Giving end1 as OUT essentially eliminates the end1/end2 check.
00493  *
00494  * This implementation is a bit of a kludge, in that a trailing $ is first
00495  * taken as an ordinary character and then revised to be an anchor.  The
00496  * only undesirable side effect is that '$' gets included as a character
00497  * category in such cases.  This is fairly harmless; not worth fixing.
00498  * The amount of lookahead needed to avoid this kludge is excessive.
00499  */
00500 static void
00501 p_bre(struct parse *p,
00502     int end1,   /* first terminating character */
00503     int end2)   /* second terminating character */
00504 {
00505   sopno start = HERE();
00506   int first = 1;      /* first subexpression? */
00507   int wasdollar = 0;
00508 
00509   if (EAT('^')) {
00510     EMIT(OBOL, 0);
00511     p->g->iflags |= USEBOL;
00512     p->g->nbol++;
00513   }
00514   while (MORE() && !SEETWO(end1, end2)) {
00515     wasdollar = p_simp_re(p, first);
00516     first = 0;
00517   }
00518   if (wasdollar) {  /* oops, that was a trailing anchor */
00519     DROP(1);
00520     EMIT(OEOL, 0);
00521     p->g->iflags |= USEEOL;
00522     p->g->neol++;
00523   }
00524 
00525   REQUIRE(HERE() != start, REG_EMPTY);  /* require nonempty */
00526 }
00527 
00528 /*
00529  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
00530  */
00531 static int      /* was the simple RE an unbackslashed $? */
00532 p_simp_re(struct parse *p,
00533     int starordinary)   /* is a leading * an ordinary character? */
00534 {
00535   int c;
00536   int count;
00537   int count2;
00538   sopno pos;
00539   int i;
00540   sopno subno;
00541 # define  BACKSL  (1<<CHAR_BIT)
00542 
00543         pos = HERE(); /* repetition op, if any, covers from here */
00544 
00545         assert(MORE()); /* caller should have ensured this */
00546         c = GETNEXT();
00547   if (c == '\\') {
00548     REQUIRE(MORE(), REG_EESCAPE);
00549     c = BACKSL | GETNEXT();
00550   }
00551   switch (c) {
00552   case '.':
00553     if (p->g->cflags&REG_NEWLINE)
00554       nonnewline(p);
00555     else
00556       EMIT(OANY, 0);
00557     break;
00558   case '[':
00559     p_bracket(p);
00560     break;
00561   case BACKSL|'{':
00562     SETERROR(REG_BADRPT);
00563     break;
00564   case BACKSL|'(':
00565     p->g->nsub++;
00566     subno = p->g->nsub;
00567     if (subno < NPAREN)
00568       p->pbegin[subno] = HERE();
00569     EMIT(OLPAREN, subno);
00570     /* the MORE here is an error heuristic */
00571     if (MORE() && !SEETWO('\\', ')'))
00572       p_bre(p, '\\', ')');
00573     if (subno < NPAREN) {
00574       p->pend[subno] = HERE();
00575       assert(p->pend[subno] != 0);
00576     }
00577     EMIT(ORPAREN, subno);
00578     REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
00579     break;
00580   case BACKSL|')':  /* should not get here -- must be user */
00581   case BACKSL|'}':
00582     SETERROR(REG_EPAREN);
00583     break;
00584   case BACKSL|'1':
00585   case BACKSL|'2':
00586   case BACKSL|'3':
00587   case BACKSL|'4':
00588   case BACKSL|'5':
00589   case BACKSL|'6':
00590   case BACKSL|'7':
00591   case BACKSL|'8':
00592   case BACKSL|'9':
00593     i = (c&~BACKSL) - '0';
00594     assert(i < NPAREN);
00595     if (p->pend[i] != 0) {
00596       assert(i <= p->g->nsub);
00597       EMIT(OBACK_, i);
00598       assert(p->pbegin[i] != 0);
00599       assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
00600       assert(OP(p->strip[p->pend[i]]) == ORPAREN);
00601       (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
00602       EMIT(O_BACK, i);
00603     } else
00604       SETERROR(REG_ESUBREG);
00605     p->g->backrefs = 1;
00606     break;
00607   case '*':
00608     REQUIRE(starordinary, REG_BADRPT);
00609     /* FALLTHROUGH */
00610   default:
00611     ordinary(p, (char)c);
00612     break;
00613   }
00614 
00615   if (EAT('*')) {   /* implemented as +? */
00616     /* this case does not require the (y|) trick, noKLUDGE */
00617     INSERT(OPLUS_, pos);
00618     ASTERN(O_PLUS, pos);
00619     INSERT(OQUEST_, pos);
00620     ASTERN(O_QUEST, pos);
00621   } else if (EATTWO('\\', '{')) {
00622     count = p_count(p);
00623     if (EAT(',')) {
00624       if (MORE() && isdigit((uch)PEEK())) {
00625         count2 = p_count(p);
00626         REQUIRE(count <= count2, REG_BADBR);
00627       } else    /* single number with comma */
00628         count2 = INFINITY;
00629     } else    /* just a single number */
00630       count2 = count;
00631     repeat(p, pos, count, count2);
00632     if (!EATTWO('\\', '}')) { /* error heuristics */
00633       while (MORE() && !SEETWO('\\', '}'))
00634         NEXT();
00635       REQUIRE(MORE(), REG_EBRACE);
00636       SETERROR(REG_BADBR);
00637     }
00638   } else if (c == '$')  /* $ (but not \$) ends it */
00639     return(1);
00640 
00641   return(0);
00642 }
00643 
00644 /*
00645  - p_count - parse a repetition count
00646  */
00647 static int      /* the value */
00648 p_count(struct parse *p)
00649 {
00650   int count = 0;
00651   int ndigits = 0;
00652 
00653   while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
00654     count = count*10 + (GETNEXT() - '0');
00655     ndigits++;
00656   }
00657 
00658   REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
00659   return(count);
00660 }
00661 
00662 /*
00663  - p_bracket - parse a bracketed character list
00664  *
00665  * Note a significant property of this code:  if the allocset() did SETERROR,
00666  * no set operations are done.
00667  */
00668 static void
00669 p_bracket(struct parse *p)
00670 {
00671   cset *cs;
00672   int invert = 0;
00673 
00674   /* Dept of Truly Sickening Special-Case Kludges */
00675   if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
00676     EMIT(OBOW, 0);
00677     NEXTn(6);
00678     return;
00679   }
00680   if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
00681     EMIT(OEOW, 0);
00682     NEXTn(6);
00683     return;
00684   }
00685 
00686   if ((cs = allocset(p)) == NULL) {
00687     /* allocset did set error status in p */
00688     return;
00689   }
00690 
00691   if (EAT('^'))
00692     invert++; /* make note to invert set at end */
00693   if (EAT(']'))
00694     CHadd(cs, ']');
00695   else if (EAT('-'))
00696     CHadd(cs, '-');
00697   while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
00698     p_b_term(p, cs);
00699   if (EAT('-'))
00700     CHadd(cs, '-');
00701   MUSTEAT(']', REG_EBRACK);
00702 
00703   if (p->error != 0) {  /* don't mess things up further */
00704     freeset(p, cs);
00705     return;
00706   }
00707 
00708   if (p->g->cflags&REG_ICASE) {
00709     int i;
00710     int ci;
00711 
00712     for (i = p->g->csetsize - 1; i >= 0; i--)
00713       if (CHIN(cs, i) && isalpha(i)) {
00714         ci = othercase(i);
00715         if (ci != i)
00716           CHadd(cs, ci);
00717       }
00718     if (cs->multis != NULL)
00719       mccase(p, cs);
00720   }
00721   if (invert) {
00722     int i;
00723 
00724     for (i = p->g->csetsize - 1; i >= 0; i--)
00725       if (CHIN(cs, i))
00726         CHsub(cs, i);
00727       else
00728         CHadd(cs, i);
00729     if (p->g->cflags&REG_NEWLINE)
00730       CHsub(cs, '\n');
00731     if (cs->multis != NULL)
00732       mcinvert(p, cs);
00733   }
00734 
00735   assert(cs->multis == NULL);   /* xxx */
00736 
00737   if (nch(p, cs) == 1) {    /* optimize singleton sets */
00738     ordinary(p, firstch(p, cs));
00739     freeset(p, cs);
00740   } else
00741     EMIT(OANYOF, freezeset(p, cs));
00742 }
00743 
00744 /*
00745  - p_b_term - parse one term of a bracketed character list
00746  */
00747 static void
00748 p_b_term(struct parse *p, cset *cs)
00749 {
00750   char c;
00751   char start, finish;
00752   int i;
00753 
00754   /* classify what we've got */
00755   switch ((MORE()) ? PEEK() : '\0') {
00756   case '[':
00757     c = (MORE2()) ? PEEK2() : '\0';
00758     break;
00759   case '-':
00760     SETERROR(REG_ERANGE);
00761     return;     /* NOTE RETURN */
00762     break;
00763   default:
00764     c = '\0';
00765     break;
00766   }
00767 
00768   switch (c) {
00769   case ':':   /* character class */
00770     NEXT2();
00771     REQUIRE(MORE(), REG_EBRACK);
00772     c = PEEK();
00773     REQUIRE(c != '-' && c != ']', REG_ECTYPE);
00774     p_b_cclass(p, cs);
00775     REQUIRE(MORE(), REG_EBRACK);
00776     REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
00777     break;
00778   case '=':   /* equivalence class */
00779     NEXT2();
00780     REQUIRE(MORE(), REG_EBRACK);
00781     c = PEEK();
00782     REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
00783     p_b_eclass(p, cs);
00784     REQUIRE(MORE(), REG_EBRACK);
00785     REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
00786     break;
00787   default:    /* symbol, ordinary character, or range */
00788 /* xxx revision needed for multichar stuff */
00789     start = p_b_symbol(p);
00790     if (SEE('-') && MORE2() && PEEK2() != ']') {
00791       /* range */
00792       NEXT();
00793       if (EAT('-'))
00794         finish = '-';
00795       else
00796         finish = p_b_symbol(p);
00797     } else
00798       finish = start;
00799 /* xxx what about signed chars here... */
00800     REQUIRE(start <= finish, REG_ERANGE);
00801     for (i = start; i <= finish; i++)
00802       CHadd(cs, i);
00803     break;
00804   }
00805 }
00806 
00807 /*
00808  - p_b_cclass - parse a character-class name and deal with it
00809  */
00810 static void
00811 p_b_cclass(struct parse *p, cset *cs)
00812 {
00813   char *sp = p->next;
00814   struct cclass *cp;
00815   size_t len;
00816   const char *u;
00817   char c;
00818 
00819   while (MORE() && isalpha((uch)PEEK()))
00820     NEXT();
00821   len = p->next - sp;
00822   for (cp = cclasses; cp->name != NULL; cp++)
00823     if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00824       break;
00825   if (cp->name == NULL) {
00826     /* oops, didn't find it */
00827     SETERROR(REG_ECTYPE);
00828     return;
00829   }
00830 
00831   u = cp->chars;
00832   while ((c = *u++) != '\0')
00833     CHadd(cs, c);
00834   for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
00835     MCadd(p, cs, u);
00836 }
00837 
00838 /*
00839  - p_b_eclass - parse an equivalence-class name and deal with it
00840  *
00841  * This implementation is incomplete. xxx
00842  */
00843 static void
00844 p_b_eclass(struct parse *p, cset *cs)
00845 {
00846   char c;
00847 
00848   c = p_b_coll_elem(p, '=');
00849   CHadd(cs, c);
00850 }
00851 
00852 /*
00853  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
00854  */
00855 static char     /* value of symbol */
00856 p_b_symbol(struct parse *p)
00857 {
00858   char value;
00859 
00860   REQUIRE(MORE(), REG_EBRACK);
00861   if (!EATTWO('[', '.'))
00862     return(GETNEXT());
00863 
00864   /* collating symbol */
00865   value = p_b_coll_elem(p, '.');
00866   REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
00867   return(value);
00868 }
00869 
00870 /*
00871  - p_b_coll_elem - parse a collating-element name and look it up
00872  */
00873 static char     /* value of collating element */
00874 p_b_coll_elem(struct parse *p,
00875     int endc)     /* name ended by endc,']' */
00876 {
00877   char *sp = p->next;
00878   struct cname *cp;
00879   int len;
00880 
00881   while (MORE() && !SEETWO(endc, ']'))
00882     NEXT();
00883   if (!MORE()) {
00884     SETERROR(REG_EBRACK);
00885     return(0);
00886   }
00887   len = p->next - sp;
00888   for (cp = cnames; cp->name != NULL; cp++)
00889     if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00890       return(cp->code); /* known name */
00891   if (len == 1)
00892     return(*sp);  /* single character */
00893   SETERROR(REG_ECOLLATE);     /* neither */
00894   return(0);
00895 }
00896 
00897 /*
00898  - othercase - return the case counterpart of an alphabetic
00899  */
00900 static char     /* if no counterpart, return ch */
00901 othercase(int ch)
00902 {
00903   ch = (uch)ch;
00904   assert(isalpha(ch));
00905   if (isupper(ch))
00906     return ((uch)tolower(ch));
00907   else if (islower(ch))
00908     return ((uch)toupper(ch));
00909   else      /* peculiar, but could happen */
00910     return(ch);
00911 }
00912 
00913 /*
00914  - bothcases - emit a dualcase version of a two-case character
00915  *
00916  * Boy, is this implementation ever a kludge...
00917  */
00918 static void
00919 bothcases(struct parse *p, int ch)
00920 {
00921   char *oldnext = p->next;
00922   char *oldend = p->end;
00923   char bracket[3];
00924 
00925   ch = (uch)ch;
00926   assert(othercase(ch) != ch);  /* p_bracket() would recurse */
00927   p->next = bracket;
00928   p->end = bracket+2;
00929   bracket[0] = ch;
00930   bracket[1] = ']';
00931   bracket[2] = '\0';
00932   p_bracket(p);
00933   assert(p->next == bracket+2);
00934   p->next = oldnext;
00935   p->end = oldend;
00936 }
00937 
00938 /*
00939  - ordinary - emit an ordinary character
00940  */
00941 static void
00942 ordinary(struct parse *p, int ch)
00943 {
00944   cat_t *cap = p->g->categories;
00945 
00946   if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
00947     bothcases(p, ch);
00948   else {
00949     EMIT(OCHAR, (uch)ch);
00950     if (cap[ch] == 0)
00951       cap[ch] = p->g->ncategories++;
00952   }
00953 }
00954 
00955 /*
00956  - nonnewline - emit REG_NEWLINE version of OANY
00957  *
00958  * Boy, is this implementation ever a kludge...
00959  */
00960 static void
00961 nonnewline(struct parse *p)
00962 {
00963   char *oldnext = p->next;
00964   char *oldend = p->end;
00965   char bracket[4];
00966 
00967   p->next = bracket;
00968   p->end = bracket+3;
00969   bracket[0] = '^';
00970   bracket[1] = '\n';
00971   bracket[2] = ']';
00972   bracket[3] = '\0';
00973   p_bracket(p);
00974   assert(p->next == bracket+3);
00975   p->next = oldnext;
00976   p->end = oldend;
00977 }
00978 
00979 /*
00980  - repeat - generate code for a bounded repetition, recursively if needed
00981  */
00982 static void
00983 repeat(struct parse *p,
00984     sopno start,    /* operand from here to end of strip */
00985     int from,     /* repeated from this number */
00986     int to)     /* to this number of times (maybe INFINITY) */
00987 {
00988   sopno finish = HERE();
00989 # define  N 2
00990 # define  INF 3
00991 # define  REP(f, t) ((f)*8 + (t))
00992 # define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
00993   sopno copy;
00994 
00995   if (p->error != 0)  /* head off possible runaway recursion */
00996     return;
00997 
00998   assert(from <= to);
00999 
01000   switch (REP(MAP(from), MAP(to))) {
01001   case REP(0, 0):     /* must be user doing this */
01002     DROP(finish-start); /* drop the operand */
01003     break;
01004   case REP(0, 1):     /* as x{1,1}? */
01005   case REP(0, N):     /* as x{1,n}? */
01006   case REP(0, INF):   /* as x{1,}? */
01007     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
01008     INSERT(OCH_, start);    /* offset is wrong... */
01009     repeat(p, start+1, 1, to);
01010     ASTERN(OOR1, start);
01011     AHEAD(start);     /* ... fix it */
01012     EMIT(OOR2, 0);
01013     AHEAD(THERE());
01014     ASTERN(O_CH, THERETHERE());
01015     break;
01016   case REP(1, 1):     /* trivial case */
01017     /* done */
01018     break;
01019   case REP(1, N):     /* as x?x{1,n-1} */
01020     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
01021     INSERT(OCH_, start);
01022     ASTERN(OOR1, start);
01023     AHEAD(start);
01024     EMIT(OOR2, 0);      /* offset very wrong... */
01025     AHEAD(THERE());     /* ...so fix it */
01026     ASTERN(O_CH, THERETHERE());
01027     copy = dupl(p, start+1, finish+1);
01028     assert(copy == finish+4);
01029     repeat(p, copy, 1, to-1);
01030     break;
01031   case REP(1, INF):   /* as x+ */
01032     INSERT(OPLUS_, start);
01033     ASTERN(O_PLUS, start);
01034     break;
01035   case REP(N, N):     /* as xx{m-1,n-1} */
01036     copy = dupl(p, start, finish);
01037     repeat(p, copy, from-1, to-1);
01038     break;
01039   case REP(N, INF):   /* as xx{n-1,INF} */
01040     copy = dupl(p, start, finish);
01041     repeat(p, copy, from-1, to);
01042     break;
01043   default:      /* "can't happen" */
01044     SETERROR(REG_ASSERT); /* just in case */
01045     break;
01046   }
01047 }
01048 
01049 /*
01050  - seterr - set an error condition
01051  */
01052 static int      /* useless but makes type checking happy */
01053 seterr(struct parse *p, int e)
01054 {
01055   if (p->error == 0)  /* keep earliest error condition */
01056     p->error = e;
01057   p->next = nuls;   /* try to bring things to a halt */
01058   p->end = nuls;
01059   return(0);    /* make the return value well-defined */
01060 }
01061 
01062 /*
01063  - allocset - allocate a set of characters for []
01064  */
01065 static cset *
01066 allocset(struct parse *p)
01067 {
01068   int no = p->g->ncsets++;
01069   size_t nc;
01070   size_t nbytes;
01071   cset *cs;
01072   size_t css = (size_t)p->g->csetsize;
01073   int i;
01074 
01075   if (no >= p->ncsalloc) {  /* need another column of space */
01076     void *ptr;
01077 
01078     p->ncsalloc += CHAR_BIT;
01079     nc = p->ncsalloc;
01080     if (nc > SIZE_MAX / sizeof(cset))
01081       goto nomem;
01082     assert(nc % CHAR_BIT == 0);
01083     nbytes = nc / CHAR_BIT * css;
01084 
01085     ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
01086     if (ptr == NULL)
01087       goto nomem;
01088     p->g->sets = ptr;
01089 
01090     ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
01091     if (ptr == NULL)
01092       goto nomem;
01093     p->g->setbits = ptr;
01094 
01095     for (i = 0; i < no; i++)
01096       p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
01097 
01098     (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
01099   }
01100   /* XXX should not happen */
01101   if (p->g->sets == NULL || p->g->setbits == NULL)
01102     goto nomem;
01103 
01104   cs = &p->g->sets[no];
01105   cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
01106   cs->mask = 1 << ((no) % CHAR_BIT);
01107   cs->hash = 0;
01108   cs->smultis = 0;
01109   cs->multis = NULL;
01110 
01111   return(cs);
01112 nomem:
01113   free(p->g->sets);
01114   p->g->sets = NULL;
01115   free(p->g->setbits);
01116   p->g->setbits = NULL;
01117 
01118   SETERROR(REG_ESPACE);
01119   /* caller's responsibility not to do set ops */
01120   return(NULL);
01121 }
01122 
01123 /*
01124  - freeset - free a now-unused set
01125  */
01126 static void
01127 freeset(struct parse *p, cset *cs)
01128 {
01129   size_t i;
01130   cset *top = &p->g->sets[p->g->ncsets];
01131   size_t css = (size_t)p->g->csetsize;
01132 
01133   for (i = 0; i < css; i++)
01134     CHsub(cs, i);
01135   if (cs == top-1)  /* recover only the easy case */
01136     p->g->ncsets--;
01137 }
01138 
01139 /*
01140  - freezeset - final processing on a set of characters
01141  *
01142  * The main task here is merging identical sets.  This is usually a waste
01143  * of time (although the hash code minimizes the overhead), but can win
01144  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
01145  * is done using addition rather than xor -- all ASCII [aA] sets xor to
01146  * the same value!
01147  */
01148 static int      /* set number */
01149 freezeset(struct parse *p, cset *cs)
01150 {
01151   uch h = cs->hash;
01152   size_t i;
01153   cset *top = &p->g->sets[p->g->ncsets];
01154   cset *cs2;
01155   size_t css = (size_t)p->g->csetsize;
01156 
01157   /* look for an earlier one which is the same */
01158   for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
01159     if (cs2->hash == h && cs2 != cs) {
01160       /* maybe */
01161       for (i = 0; i < css; i++)
01162         if (!!CHIN(cs2, i) != !!CHIN(cs, i))
01163           break;    /* no */
01164       if (i == css)
01165         break;      /* yes */
01166     }
01167 
01168   if (cs2 < top) {  /* found one */
01169     freeset(p, cs);
01170     cs = cs2;
01171   }
01172 
01173   return((int)(cs - p->g->sets));
01174 }
01175 
01176 /*
01177  - firstch - return first character in a set (which must have at least one)
01178  */
01179 static int      /* character; there is no "none" value */
01180 firstch(struct parse *p, cset *cs)
01181 {
01182   size_t i;
01183   size_t css = (size_t)p->g->csetsize;
01184 
01185   for (i = 0; i < css; i++)
01186     if (CHIN(cs, i))
01187       return((char)i);
01188   assert(never);
01189   return(0);    /* arbitrary */
01190 }
01191 
01192 /*
01193  - nch - number of characters in a set
01194  */
01195 static int
01196 nch(struct parse *p, cset *cs)
01197 {
01198   size_t i;
01199   size_t css = (size_t)p->g->csetsize;
01200   int n = 0;
01201 
01202   for (i = 0; i < css; i++)
01203     if (CHIN(cs, i))
01204       n++;
01205   return(n);
01206 }
01207 
01208 /*
01209  - mcadd - add a collating element to a cset
01210  */
01211 static void
01212 mcadd( struct parse *p, cset *cs, const char *cp)
01213 {
01214   size_t oldend = cs->smultis;
01215   void *np;
01216 
01217   cs->smultis += strlen(cp) + 1;
01218   np = realloc(cs->multis, cs->smultis);
01219   if (np == NULL) {
01220     if (cs->multis)
01221       free(cs->multis);
01222     cs->multis = NULL;
01223     SETERROR(REG_ESPACE);
01224     return;
01225   }
01226   cs->multis = np;
01227 
01228   llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
01229 }
01230 
01231 /*
01232  - mcinvert - invert the list of collating elements in a cset
01233  *
01234  * This would have to know the set of possibilities.  Implementation
01235  * is deferred.
01236  */
01237 /* ARGSUSED */
01238 static void
01239 mcinvert(struct parse *p, cset *cs)
01240 {
01241   assert(cs->multis == NULL); /* xxx */
01242 }
01243 
01244 /*
01245  - mccase - add case counterparts of the list of collating elements in a cset
01246  *
01247  * This would have to know the set of possibilities.  Implementation
01248  * is deferred.
01249  */
01250 /* ARGSUSED */
01251 static void
01252 mccase(struct parse *p, cset *cs)
01253 {
01254   assert(cs->multis == NULL); /* xxx */
01255 }
01256 
01257 /*
01258  - isinsets - is this character in any sets?
01259  */
01260 static int      /* predicate */
01261 isinsets(struct re_guts *g, int c)
01262 {
01263   uch *col;
01264   int i;
01265   int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01266   unsigned uc = (uch)c;
01267 
01268   for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01269     if (col[uc] != 0)
01270       return(1);
01271   return(0);
01272 }
01273 
01274 /*
01275  - samesets - are these two characters in exactly the same sets?
01276  */
01277 static int      /* predicate */
01278 samesets(struct re_guts *g, int c1, int c2)
01279 {
01280   uch *col;
01281   int i;
01282   int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01283   unsigned uc1 = (uch)c1;
01284   unsigned uc2 = (uch)c2;
01285 
01286   for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01287     if (col[uc1] != col[uc2])
01288       return(0);
01289   return(1);
01290 }
01291 
01292 /*
01293  - categorize - sort out character categories
01294  */
01295 static void
01296 categorize(struct parse *p, struct re_guts *g)
01297 {
01298   cat_t *cats = g->categories;
01299   int c;
01300   int c2;
01301   cat_t cat;
01302 
01303   /* avoid making error situations worse */
01304   if (p->error != 0)
01305     return;
01306 
01307   for (c = CHAR_MIN; c <= CHAR_MAX; c++)
01308     if (cats[c] == 0 && isinsets(g, c)) {
01309       cat = g->ncategories++;
01310       cats[c] = cat;
01311       for (c2 = c+1; c2 <= CHAR_MAX; c2++)
01312         if (cats[c2] == 0 && samesets(g, c, c2))
01313           cats[c2] = cat;
01314     }
01315 }
01316 
01317 /*
01318  - dupl - emit a duplicate of a bunch of sops
01319  */
01320 static sopno      /* start of duplicate */
01321 dupl(struct parse *p,
01322     sopno start,    /* from here */
01323     sopno finish)   /* to this less one */
01324 {
01325   sopno ret = HERE();
01326   sopno len = finish - start;
01327 
01328   assert(finish >= start);
01329   if (len == 0)
01330     return(ret);
01331   enlarge(p, p->ssize + len); /* this many unexpected additions */
01332   assert(p->ssize >= p->slen + len);
01333   (void) memmove((char *)(p->strip + p->slen),
01334     (char *)(p->strip + start), (size_t)len*sizeof(sop));
01335   p->slen += len;
01336   return(ret);
01337 }
01338 
01339 /*
01340  - doemit - emit a strip operator
01341  *
01342  * It might seem better to implement this as a macro with a function as
01343  * hard-case backup, but it's just too big and messy unless there are
01344  * some changes to the data structures.  Maybe later.
01345  */
01346 static void
01347 doemit(struct parse *p, sop op, size_t opnd)
01348 {
01349   /* avoid making error situations worse */
01350   if (p->error != 0)
01351     return;
01352 
01353   /* deal with oversize operands ("can't happen", more or less) */
01354   assert(opnd < 1<<OPSHIFT);
01355 
01356   /* deal with undersized strip */
01357   if (p->slen >= p->ssize)
01358     enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
01359   assert(p->slen < p->ssize);
01360 
01361   /* finally, it's all reduced to the easy case */
01362   p->strip[p->slen++] = SOP(op, opnd);
01363 }
01364 
01365 /*
01366  - doinsert - insert a sop into the strip
01367  */
01368 static void
01369 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
01370 {
01371   sopno sn;
01372   sop s;
01373   int i;
01374 
01375   /* avoid making error situations worse */
01376   if (p->error != 0)
01377     return;
01378 
01379   sn = HERE();
01380   EMIT(op, opnd);   /* do checks, ensure space */
01381   assert(HERE() == sn+1);
01382   s = p->strip[sn];
01383 
01384   /* adjust paren pointers */
01385   assert(pos > 0);
01386   for (i = 1; i < NPAREN; i++) {
01387     if (p->pbegin[i] >= pos) {
01388       p->pbegin[i]++;
01389     }
01390     if (p->pend[i] >= pos) {
01391       p->pend[i]++;
01392     }
01393   }
01394 
01395   memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
01396             (HERE()-pos-1)*sizeof(sop));
01397   p->strip[pos] = s;
01398 }
01399 
01400 /*
01401  - dofwd - complete a forward reference
01402  */
01403 static void
01404 dofwd(struct parse *p, sopno pos, sop value)
01405 {
01406   /* avoid making error situations worse */
01407   if (p->error != 0)
01408     return;
01409 
01410   assert(value < 1<<OPSHIFT);
01411   p->strip[pos] = OP(p->strip[pos]) | value;
01412 }
01413 
01414 /*
01415  - enlarge - enlarge the strip
01416  */
01417 static void
01418 enlarge(struct parse *p, sopno size)
01419 {
01420   sop *sp;
01421 
01422   if (p->ssize >= size)
01423     return;
01424 
01425   if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
01426     SETERROR(REG_ESPACE);
01427     return;
01428   }
01429 
01430   sp = (sop *)realloc(p->strip, size*sizeof(sop));
01431   if (sp == NULL) {
01432     SETERROR(REG_ESPACE);
01433     return;
01434   }
01435   p->strip = sp;
01436   p->ssize = size;
01437 }
01438 
01439 /*
01440  - stripsnug - compact the strip
01441  */
01442 static void
01443 stripsnug(struct parse *p, struct re_guts *g)
01444 {
01445   g->nstates = p->slen;
01446   if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
01447     g->strip = p->strip;
01448     SETERROR(REG_ESPACE);
01449     return;
01450   }
01451 
01452   g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
01453   if (g->strip == NULL) {
01454     SETERROR(REG_ESPACE);
01455     g->strip = p->strip;
01456   }
01457 }
01458 
01459 /*
01460  - findmust - fill in must and mlen with longest mandatory literal string
01461  *
01462  * This algorithm could do fancy things like analyzing the operands of |
01463  * for common subsequences.  Someday.  This code is simple and finds most
01464  * of the interesting cases.
01465  *
01466  * Note that must and mlen got initialized during setup.
01467  */
01468 static void
01469 findmust(struct parse *p, struct re_guts *g)
01470 {
01471   sop *scan;
01472   sop *start = 0; /* start initialized in the default case, after that */
01473   sop *newstart = 0; /* newstart was initialized in the OCHAR case */
01474   sopno newlen;
01475   sop s;
01476   char *cp;
01477   sopno i;
01478 
01479   /* avoid making error situations worse */
01480   if (p->error != 0)
01481     return;
01482 
01483   /* find the longest OCHAR sequence in strip */
01484   newlen = 0;
01485   scan = g->strip + 1;
01486   do {
01487     s = *scan++;
01488     switch (OP(s)) {
01489     case OCHAR:   /* sequence member */
01490       if (newlen == 0)    /* new sequence */
01491         newstart = scan - 1;
01492       newlen++;
01493       break;
01494     case OPLUS_:    /* things that don't break one */
01495     case OLPAREN:
01496     case ORPAREN:
01497       break;
01498     case OQUEST_:   /* things that must be skipped */
01499     case OCH_:
01500       scan--;
01501       do {
01502         scan += OPND(s);
01503         s = *scan;
01504         /* assert() interferes w debug printouts */
01505         if (OP(s) != O_QUEST && OP(s) != O_CH &&
01506               OP(s) != OOR2) {
01507           g->iflags |= REGEX_BAD;
01508           return;
01509         }
01510       } while (OP(s) != O_QUEST && OP(s) != O_CH);
01511       /* fallthrough */
01512     default:    /* things that break a sequence */
01513       if (newlen > g->mlen) {   /* ends one */
01514         start = newstart;
01515         g->mlen = newlen;
01516       }
01517       newlen = 0;
01518       break;
01519     }
01520   } while (OP(s) != OEND);
01521 
01522   if (g->mlen == 0)   /* there isn't one */
01523     return;
01524 
01525   /* turn it into a character string */
01526   g->must = malloc((size_t)g->mlen + 1);
01527   if (g->must == NULL) {    /* argh; just forget it */
01528     g->mlen = 0;
01529     return;
01530   }
01531   cp = g->must;
01532   scan = start;
01533   for (i = g->mlen; i > 0; i--) {
01534     while (OP(s = *scan++) != OCHAR)
01535       continue;
01536     assert(cp < g->must + g->mlen);
01537     *cp++ = (char)OPND(s);
01538   }
01539   assert(cp == g->must + g->mlen);
01540   *cp++ = '\0';   /* just on general principles */
01541 }
01542 
01543 /*
01544  - pluscount - count + nesting
01545  */
01546 static sopno      /* nesting depth */
01547 pluscount(struct parse *p, struct re_guts *g)
01548 {
01549   sop *scan;
01550   sop s;
01551   sopno plusnest = 0;
01552   sopno maxnest = 0;
01553 
01554   if (p->error != 0)
01555     return(0);  /* there may not be an OEND */
01556 
01557   scan = g->strip + 1;
01558   do {
01559     s = *scan++;
01560     switch (OP(s)) {
01561     case OPLUS_:
01562       plusnest++;
01563       break;
01564     case O_PLUS:
01565       if (plusnest > maxnest)
01566         maxnest = plusnest;
01567       plusnest--;
01568       break;
01569     }
01570   } while (OP(s) != OEND);
01571   if (plusnest != 0)
01572     g->iflags |= REGEX_BAD;
01573   return(maxnest);
01574 }