LLVM  17.0.0git
regengine.inc
Go to the documentation of this file.
1 /*-
2  * This code is derived from OpenBSD's libc/regex, original license follows:
3  *
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  * The Regents of the University of California. All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  * may be used to endorse or promote products derived from this software
21  * without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)engine.c 8.5 (Berkeley) 3/20/94
36  */
37 
38 /*
39  * The matching engine and friends. This file is #included by regexec.c
40  * after suitable #defines of a variety of macros used herein, so that
41  * different state representations can be used without duplicating masses
42  * of code.
43  */
44 
45 #ifdef SNAMES
46 #define matcher smatcher
47 #define fast sfast
48 #define slow sslow
49 #define dissect sdissect
50 #define backref sbackref
51 #define step sstep
52 #define print sprint
53 #define at sat
54 #define match smat
55 #define nope snope
56 #define step_back sstep_back
57 #endif
58 #ifdef LNAMES
59 #define matcher lmatcher
60 #define fast lfast
61 #define slow lslow
62 #define dissect ldissect
63 #define backref lbackref
64 #define step lstep
65 #define print lprint
66 #define at lat
67 #define match lmat
68 #define nope lnope
69 #define step_back lstep_back
70 #endif
71 
72 /* another structure passed up and down to avoid zillions of parameters */
73 struct match {
74  struct re_guts *g;
75  int eflags;
76  llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
77  const char *offp; /* offsets work from here */
78  const char *beginp; /* start of string -- virtual NUL precedes */
79  const char *endp; /* end of string -- virtual NUL here */
80  const char *coldp; /* can be no match starting before here */
81  const char **lastpos; /* [nplus+1] */
82  STATEVARS;
83  states st; /* current states */
84  states fresh; /* states for a fresh start */
85  states tmp; /* temporary */
86  states empty; /* empty set of states */
87 };
88 
89 static int matcher(struct re_guts *, const char *, size_t,
90  llvm_regmatch_t[], int);
91 static const char *dissect(struct match *, const char *, const char *, sopno,
92  sopno);
93 static const char *backref(struct match *, const char *, const char *, sopno,
94  sopno, sopno, int);
95 static const char *fast(struct match *, const char *, const char *, sopno, sopno);
96 static const char *slow(struct match *, const char *, const char *, sopno, sopno);
97 static states step(struct re_guts *, sopno, sopno, states, int, states);
98 #define MAX_RECURSION 100
99 #define BOL (OUT+1)
100 #define EOL (BOL+1)
101 #define BOLEOL (BOL+2)
102 #define NOTHING (BOL+3)
103 #define BOW (BOL+4)
104 #define EOW (BOL+5)
105 #define CODEMAX (BOL+5) /* highest code used */
106 #define NONCHAR(c) ((c) > CHAR_MAX)
107 #define NNONCHAR (CODEMAX-CHAR_MAX)
108 #ifdef REDEBUG
109 static void print(struct match *, const char *, states, int, FILE *);
110 #endif
111 #ifdef REDEBUG
112 static void at(
113  struct match *, const char *, const char *, const char *, sopno, sopno);
114 #endif
115 #ifdef REDEBUG
116 static char *pchar(int);
117 #endif
118 
119 #ifdef REDEBUG
120 #define SP(t, s, c) print(m, t, s, c, stdout)
121 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
122 #define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
123 static int nope = 0;
124 #else
125 #define SP(t, s, c) /* nothing */
126 #define AT(t, p1, p2, s1, s2) /* nothing */
127 #define NOTE(s) /* nothing */
128 #endif
129 
130 /*
131  - matcher - the actual matching engine
132  */
133 static int /* 0 success, REG_NOMATCH failure */
134 matcher(struct re_guts *g, const char *string, size_t nmatch,
135  llvm_regmatch_t pmatch[],
136  int eflags)
137 {
138  const char *endp;
139  size_t i;
140  struct match mv;
141  struct match *m = &mv;
142  const char *dp;
143  const sopno gf = g->firststate+1; /* +1 for OEND */
144  const sopno gl = g->laststate;
145  const char *start;
146  const char *stop;
147 
148  /* simplify the situation where possible */
149  if (g->cflags&REG_NOSUB)
150  nmatch = 0;
151  if (eflags&REG_STARTEND) {
152  start = string + pmatch[0].rm_so;
153  stop = string + pmatch[0].rm_eo;
154  } else {
155  start = string;
156  stop = start + strlen(start);
157  }
158  if (stop < start)
159  return(REG_INVARG);
160 
161  /* prescreening; this does wonders for this rather slow code */
162  if (g->must != NULL) {
163  for (dp = start; dp < stop; dp++)
164  if (*dp == g->must[0] && stop - dp >= g->mlen &&
165  memcmp(dp, g->must, (size_t)g->mlen) == 0)
166  break;
167  if (dp == stop) /* we didn't find g->must */
168  return(REG_NOMATCH);
169  }
170 
171  /* match struct setup */
172  m->g = g;
173  m->eflags = eflags;
174  m->pmatch = NULL;
175  m->lastpos = NULL;
176  m->offp = string;
177  m->beginp = start;
178  m->endp = stop;
179  STATESETUP(m, 4);
180  SETUP(m->st);
181  SETUP(m->fresh);
182  SETUP(m->tmp);
183  SETUP(m->empty);
184  CLEAR(m->empty);
185 
186  /* this loop does only one repetition except for backrefs */
187  for (;;) {
188  endp = fast(m, start, stop, gf, gl);
189  if (endp == NULL) { /* a miss */
190  free(m->pmatch);
191  free((void*)m->lastpos);
192  STATETEARDOWN(m);
193  return(REG_NOMATCH);
194  }
195  if (nmatch == 0 && !g->backrefs)
196  break; /* no further info needed */
197 
198  /* where? */
199  assert(m->coldp != NULL);
200  for (;;) {
201  NOTE("finding start");
202  endp = slow(m, m->coldp, stop, gf, gl);
203  if (endp != NULL)
204  break;
205  assert(m->coldp < m->endp);
206  m->coldp++;
207  }
208  if (nmatch == 1 && !g->backrefs)
209  break; /* no further info needed */
210 
211  /* oh my, they want the subexpressions... */
212  if (m->pmatch == NULL)
213  m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
214  sizeof(llvm_regmatch_t));
215  if (m->pmatch == NULL) {
216  STATETEARDOWN(m);
217  return(REG_ESPACE);
218  }
219  for (i = 1; i <= m->g->nsub; i++)
220  m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
221  if (!g->backrefs && !(m->eflags&REG_BACKR)) {
222  NOTE("dissecting");
223  dp = dissect(m, m->coldp, endp, gf, gl);
224  } else {
225  if (g->nplus > 0 && m->lastpos == NULL)
226  m->lastpos = (const char **)malloc((g->nplus+1) *
227  sizeof(char *));
228  if (g->nplus > 0 && m->lastpos == NULL) {
229  free(m->pmatch);
230  STATETEARDOWN(m);
231  return(REG_ESPACE);
232  }
233  NOTE("backref dissect");
234  dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
235  }
236  if (dp != NULL)
237  break;
238 
239  /* uh-oh... we couldn't find a subexpression-level match */
240  assert(g->backrefs); /* must be back references doing it */
241  assert(g->nplus == 0 || m->lastpos != NULL);
242  for (;;) {
243  if (dp != NULL || endp <= m->coldp)
244  break; /* defeat */
245  NOTE("backoff");
246  endp = slow(m, m->coldp, endp-1, gf, gl);
247  if (endp == NULL)
248  break; /* defeat */
249  /* try it on a shorter possibility */
250 #ifndef NDEBUG
251  for (i = 1; i <= m->g->nsub; i++) {
252  assert(m->pmatch[i].rm_so == -1);
253  assert(m->pmatch[i].rm_eo == -1);
254  }
255 #endif
256  NOTE("backoff dissect");
257  dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
258  }
259  assert(dp == NULL || dp == endp);
260  if (dp != NULL) /* found a shorter one */
261  break;
262 
263  /* despite initial appearances, there is no match here */
264  NOTE("false alarm");
265  if (m->coldp == stop)
266  break;
267  start = m->coldp + 1; /* recycle starting later */
268  }
269 
270  /* fill in the details if requested */
271  if (nmatch > 0) {
272  pmatch[0].rm_so = m->coldp - m->offp;
273  pmatch[0].rm_eo = endp - m->offp;
274  }
275  if (nmatch > 1) {
276  assert(m->pmatch != NULL);
277  for (i = 1; i < nmatch; i++)
278  if (i <= m->g->nsub)
279  pmatch[i] = m->pmatch[i];
280  else {
281  pmatch[i].rm_so = -1;
282  pmatch[i].rm_eo = -1;
283  }
284  }
285 
286  if (m->pmatch != NULL)
287  free((char *)m->pmatch);
288  if (m->lastpos != NULL)
289  free((char *)m->lastpos);
290  STATETEARDOWN(m);
291  return(0);
292 }
293 
294 /* Step back from "stop" to a position where the strip startst..stopst might
295  * match. This can always conservatively return "stop - 1", but may return an
296  * earlier position if matches at later positions are impossible. */
297 static const char *
298 step_back(struct re_guts *g, const char *start, const char *stop, sopno startst,
299  sopno stopst)
300 {
301  /* Always step back at least one character. */
302  assert(stop > start);
303  const char *res = stop - 1;
304 
305  /* Check whether the strip startst..stropst starts with a fixed character,
306  * ignoring any closing parentheses. If not, return a conservative result. */
307  for (;;) {
308  if (startst >= stopst)
309  return res;
310  if (OP(g->strip[startst]) != ORPAREN)
311  break;
312  startst++;
313  }
314  if (OP(g->strip[startst]) != OCHAR)
315  return res;
316 
317  /* Find the character that starts the following match. */
318  char ch = OPND(g->strip[startst]);
319  for (; res != start; --res) {
320  if (*res == ch) {
321  /* Try to check the next fixed character as well. */
322  sopno nextst = startst + 1;
323  const char *next = res + 1;
324  if (nextst >= stopst || OP(g->strip[nextst]) != OCHAR || next >= stop ||
325  *next == (char)OPND(g->strip[nextst]))
326  break;
327  }
328  }
329  return res;
330 }
331 
332 /*
333  - dissect - figure out what matched what, no back references
334  */
335 static const char * /* == stop (success) always */
336 dissect(struct match *m, const char *start, const char *stop, sopno startst,
337  sopno stopst)
338 {
339  int i;
340  sopno ss; /* start sop of current subRE */
341  sopno es; /* end sop of current subRE */
342  const char *sp; /* start of string matched by it */
343  const char *stp; /* string matched by it cannot pass here */
344  const char *rest; /* start of rest of string */
345  const char *tail; /* string unmatched by rest of RE */
346  sopno ssub; /* start sop of subsubRE */
347  sopno esub; /* end sop of subsubRE */
348  const char *ssp; /* start of string matched by subsubRE */
349  const char *sep; /* end of string matched by subsubRE */
350  const char *oldssp; /* previous ssp */
351 
352  AT("diss", start, stop, startst, stopst);
353  sp = start;
354  for (ss = startst; ss < stopst; ss = es) {
355  /* identify end of subRE */
356  es = ss;
357  switch (OP(m->g->strip[es])) {
358  case OPLUS_:
359  case OQUEST_:
360  es += OPND(m->g->strip[es]);
361  break;
362  case OCH_:
363  while (OP(m->g->strip[es]) != O_CH)
364  es += OPND(m->g->strip[es]);
365  break;
366  }
367  es++;
368 
369  /* figure out what it matched */
370  switch (OP(m->g->strip[ss])) {
371  case OEND:
372  assert(nope);
373  break;
374  case OCHAR:
375  sp++;
376  break;
377  case OBOL:
378  case OEOL:
379  case OBOW:
380  case OEOW:
381  break;
382  case OANY:
383  case OANYOF:
384  sp++;
385  break;
386  case OBACK_:
387  case O_BACK:
388  assert(nope);
389  break;
390  /* cases where length of match is hard to find */
391  case OQUEST_:
392  stp = stop;
393  for (;;) {
394  /* how long could this one be? */
395  rest = slow(m, sp, stp, ss, es);
396  assert(rest != NULL); /* it did match */
397  /* could the rest match the rest? */
398  tail = slow(m, rest, stop, es, stopst);
399  if (tail == stop)
400  break; /* yes! */
401  /* no -- try a shorter match for this one */
402  stp = step_back(m->g, sp, rest, es, stopst);
403  assert(stp >= sp); /* it did work */
404  }
405  ssub = ss + 1;
406  esub = es - 1;
407  /* did innards match? */
408  if (slow(m, sp, rest, ssub, esub) != NULL) {
409  const char *dp = dissect(m, sp, rest, ssub, esub);
410  (void)dp; /* avoid warning if assertions off */
411  assert(dp == rest);
412  } else /* no */
413  assert(sp == rest);
414  sp = rest;
415  break;
416  case OPLUS_:
417  stp = stop;
418  for (;;) {
419  /* how long could this one be? */
420  rest = slow(m, sp, stp, ss, es);
421  assert(rest != NULL); /* it did match */
422  /* could the rest match the rest? */
423  tail = slow(m, rest, stop, es, stopst);
424  if (tail == stop)
425  break; /* yes! */
426  /* no -- try a shorter match for this one */
427  stp = step_back(m->g, sp, rest, es, stopst);
428  assert(stp >= sp); /* it did work */
429  }
430  ssub = ss + 1;
431  esub = es - 1;
432  ssp = sp;
433  oldssp = ssp;
434  for (;;) { /* find last match of innards */
435  sep = slow(m, ssp, rest, ssub, esub);
436  if (sep == NULL || sep == ssp)
437  break; /* failed or matched null */
438  oldssp = ssp; /* on to next try */
439  ssp = sep;
440  }
441  if (sep == NULL) {
442  /* last successful match */
443  sep = ssp;
444  ssp = oldssp;
445  }
446  assert(sep == rest); /* must exhaust substring */
447  assert(slow(m, ssp, sep, ssub, esub) == rest);
448  {
449  const char *dp = dissect(m, ssp, sep, ssub, esub);
450  (void)dp; /* avoid warning if assertions off */
451  assert(dp == sep);
452  }
453  sp = rest;
454  break;
455  case OCH_:
456  stp = stop;
457  for (;;) {
458  /* how long could this one be? */
459  rest = slow(m, sp, stp, ss, es);
460  assert(rest != NULL); /* it did match */
461  /* could the rest match the rest? */
462  tail = slow(m, rest, stop, es, stopst);
463  if (tail == stop)
464  break; /* yes! */
465  /* no -- try a shorter match for this one */
466  stp = rest - 1;
467  assert(stp >= sp); /* it did work */
468  }
469  ssub = ss + 1;
470  esub = ss + OPND(m->g->strip[ss]) - 1;
471  assert(OP(m->g->strip[esub]) == OOR1);
472  for (;;) { /* find first matching branch */
473  if (slow(m, sp, rest, ssub, esub) == rest)
474  break; /* it matched all of it */
475  /* that one missed, try next one */
476  assert(OP(m->g->strip[esub]) == OOR1);
477  esub++;
478  assert(OP(m->g->strip[esub]) == OOR2);
479  ssub = esub + 1;
480  esub += OPND(m->g->strip[esub]);
481  if (OP(m->g->strip[esub]) == OOR2)
482  esub--;
483  else
484  assert(OP(m->g->strip[esub]) == O_CH);
485  }
486  {
487  const char *dp = dissect(m, sp, rest, ssub, esub);
488  (void)dp; /* avoid warning if assertions off */
489  assert(dp == rest);
490  }
491  sp = rest;
492  break;
493  case O_PLUS:
494  case O_QUEST:
495  case OOR1:
496  case OOR2:
497  case O_CH:
498  assert(nope);
499  break;
500  case OLPAREN:
501  i = OPND(m->g->strip[ss]);
502  assert(0 < i && i <= m->g->nsub);
503  m->pmatch[i].rm_so = sp - m->offp;
504  break;
505  case ORPAREN:
506  i = OPND(m->g->strip[ss]);
507  assert(0 < i && i <= m->g->nsub);
508  m->pmatch[i].rm_eo = sp - m->offp;
509  break;
510  default: /* uh oh */
511  assert(nope);
512  break;
513  }
514  }
515 
516  assert(sp == stop);
517  return(sp);
518 }
519 
520 /*
521  - backref - figure out what matched what, figuring in back references
522  */
523 static const char * /* == stop (success) or NULL (failure) */
524 backref(struct match *m, const char *start, const char *stop, sopno startst,
525  sopno stopst, sopno lev, int rec) /* PLUS nesting level */
526 {
527  int i;
528  sopno ss; /* start sop of current subRE */
529  const char *sp; /* start of string matched by it */
530  sopno ssub; /* start sop of subsubRE */
531  sopno esub; /* end sop of subsubRE */
532  const char *ssp; /* start of string matched by subsubRE */
533  const char *dp;
534  size_t len;
535  int hard;
536  sop s;
537  llvm_regoff_t offsave;
538  cset *cs;
539 
540  AT("back", start, stop, startst, stopst);
541  sp = start;
542 
543  /* get as far as we can with easy stuff */
544  hard = 0;
545  for (ss = startst; !hard && ss < stopst; ss++)
546  switch (OP(s = m->g->strip[ss])) {
547  case OCHAR:
548  if (sp == stop || *sp++ != (char)OPND(s))
549  return(NULL);
550  break;
551  case OANY:
552  if (sp == stop)
553  return(NULL);
554  sp++;
555  break;
556  case OANYOF:
557  cs = &m->g->sets[OPND(s)];
558  if (sp == stop || !CHIN(cs, *sp++))
559  return(NULL);
560  break;
561  case OBOL:
562  if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
563  (sp < m->endp && *(sp-1) == '\n' &&
564  (m->g->cflags&REG_NEWLINE)) )
565  { /* yes */ }
566  else
567  return(NULL);
568  break;
569  case OEOL:
570  if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
571  (sp < m->endp && *sp == '\n' &&
572  (m->g->cflags&REG_NEWLINE)) )
573  { /* yes */ }
574  else
575  return(NULL);
576  break;
577  case OBOW:
578  if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
579  (sp < m->endp && *(sp-1) == '\n' &&
580  (m->g->cflags&REG_NEWLINE)) ||
581  (sp > m->beginp &&
582  !ISWORD(*(sp-1))) ) &&
583  (sp < m->endp && ISWORD(*sp)) )
584  { /* yes */ }
585  else
586  return(NULL);
587  break;
588  case OEOW:
589  if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
590  (sp < m->endp && *sp == '\n' &&
591  (m->g->cflags&REG_NEWLINE)) ||
592  (sp < m->endp && !ISWORD(*sp)) ) &&
593  (sp > m->beginp && ISWORD(*(sp-1))) )
594  { /* yes */ }
595  else
596  return(NULL);
597  break;
598  case O_QUEST:
599  case O_CH:
600  break;
601  case OOR1: /* matches null but needs to skip */
602  ss++;
603  s = m->g->strip[ss];
604  do {
605  assert(OP(s) == OOR2);
606  ss += OPND(s);
607  } while (OP(s = m->g->strip[ss]) != O_CH);
608  /* note that the ss++ gets us past the O_CH */
609  break;
610  default: /* have to make a choice */
611  hard = 1;
612  break;
613  }
614  if (!hard) { /* that was it! */
615  if (sp != stop)
616  return(NULL);
617  return(sp);
618  }
619  ss--; /* adjust for the for's final increment */
620 
621  /* the hard stuff */
622  AT("hard", sp, stop, ss, stopst);
623  s = m->g->strip[ss];
624  switch (OP(s)) {
625  case OBACK_: /* the vilest depths */
626  i = OPND(s);
627  assert(0 < i && i <= m->g->nsub);
628  if (m->pmatch[i].rm_eo == -1)
629  return(NULL);
630  assert(m->pmatch[i].rm_so != -1);
631  len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
632  if (len == 0 && rec++ > MAX_RECURSION)
633  return(NULL);
634  assert(stop - m->beginp >= len);
635  if (sp > stop - len)
636  return(NULL); /* not enough left to match */
637  ssp = m->offp + m->pmatch[i].rm_so;
638  if (memcmp(sp, ssp, len) != 0)
639  return(NULL);
640  while (m->g->strip[ss] != SOP(O_BACK, i))
641  ss++;
642  return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
643  break;
644  case OQUEST_: /* to null or not */
645  dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
646  if (dp != NULL)
647  return(dp); /* not */
648  return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
649  break;
650  case OPLUS_:
651  assert(m->lastpos != NULL);
652  assert(lev+1 <= m->g->nplus);
653  m->lastpos[lev+1] = sp;
654  return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
655  break;
656  case O_PLUS:
657  if (sp == m->lastpos[lev]) /* last pass matched null */
658  return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
659  /* try another pass */
660  m->lastpos[lev] = sp;
661  dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
662  if (dp == NULL)
663  return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
664  else
665  return(dp);
666  break;
667  case OCH_: /* find the right one, if any */
668  ssub = ss + 1;
669  esub = ss + OPND(s) - 1;
670  assert(OP(m->g->strip[esub]) == OOR1);
671  for (;;) { /* find first matching branch */
672  dp = backref(m, sp, stop, ssub, stopst, lev, rec);
673  if (dp != NULL)
674  return(dp);
675  /* that one missed, try next one */
676  if (OP(m->g->strip[esub]) == O_CH)
677  return(NULL); /* there is none */
678  esub++;
679  assert(OP(m->g->strip[esub]) == OOR2);
680  ssub = esub + 1;
681  esub += OPND(m->g->strip[esub]);
682  if (OP(m->g->strip[esub]) == OOR2)
683  esub--;
684  else
685  assert(OP(m->g->strip[esub]) == O_CH);
686  }
687  break;
688  case OLPAREN: /* must undo assignment if rest fails */
689  i = OPND(s);
690  assert(0 < i && i <= m->g->nsub);
691  offsave = m->pmatch[i].rm_so;
692  m->pmatch[i].rm_so = sp - m->offp;
693  dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
694  if (dp != NULL)
695  return(dp);
696  m->pmatch[i].rm_so = offsave;
697  return(NULL);
698  break;
699  case ORPAREN: /* must undo assignment if rest fails */
700  i = OPND(s);
701  assert(0 < i && i <= m->g->nsub);
702  offsave = m->pmatch[i].rm_eo;
703  m->pmatch[i].rm_eo = sp - m->offp;
704  dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
705  if (dp != NULL)
706  return(dp);
707  m->pmatch[i].rm_eo = offsave;
708  return(NULL);
709  break;
710  default: /* uh oh */
711  assert(nope);
712  break;
713  }
714 
715  /* "can't happen" */
716  assert(nope);
717  /* NOTREACHED */
718  return NULL;
719 }
720 
721 /*
722  - fast - step through the string at top speed
723  */
724 static const char * /* where tentative match ended, or NULL */
725 fast(struct match *m, const char *start, const char *stop, sopno startst,
726  sopno stopst)
727 {
728  states st = m->st;
729  states fresh = m->fresh;
730  states tmp = m->tmp;
731  const char *p = start;
732  int c = (start == m->beginp) ? OUT : *(start-1);
733  int lastc; /* previous c */
734  int flagch;
735  int i;
736  const char *coldp; /* last p after which no match was underway */
737 
738  CLEAR(st);
739  SET1(st, startst);
740  st = step(m->g, startst, stopst, st, NOTHING, st);
741  ASSIGN(fresh, st);
742  SP("start", st, *p);
743  coldp = NULL;
744  for (;;) {
745  /* next character */
746  lastc = c;
747  c = (p == m->endp) ? OUT : *p;
748  if (EQ(st, fresh))
749  coldp = p;
750 
751  /* is there an EOL and/or BOL between lastc and c? */
752  flagch = '\0';
753  i = 0;
754  if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
755  (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
756  flagch = BOL;
757  i = m->g->nbol;
758  }
759  if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
760  (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
761  flagch = (flagch == BOL) ? BOLEOL : EOL;
762  i += m->g->neol;
763  }
764  if (i != 0) {
765  for (; i > 0; i--)
766  st = step(m->g, startst, stopst, st, flagch, st);
767  SP("boleol", st, c);
768  }
769 
770  /* how about a word boundary? */
771  if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
772  (c != OUT && ISWORD(c)) ) {
773  flagch = BOW;
774  }
775  if ( (lastc != OUT && ISWORD(lastc)) &&
776  (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
777  flagch = EOW;
778  }
779  if (flagch == BOW || flagch == EOW) {
780  st = step(m->g, startst, stopst, st, flagch, st);
781  SP("boweow", st, c);
782  }
783 
784  /* are we done? */
785  if (ISSET(st, stopst) || p == stop)
786  break; /* NOTE BREAK OUT */
787 
788  /* no, we must deal with this character */
789  ASSIGN(tmp, st);
790  ASSIGN(st, fresh);
791  assert(c != OUT);
792  st = step(m->g, startst, stopst, tmp, c, st);
793  SP("aft", st, c);
794  assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
795  p++;
796  }
797 
798  assert(coldp != NULL);
799  m->coldp = coldp;
800  if (ISSET(st, stopst))
801  return(p+1);
802  else
803  return(NULL);
804 }
805 
806 /*
807  - slow - step through the string more deliberately
808  */
809 static const char * /* where it ended */
810 slow(struct match *m, const char *start, const char *stop, sopno startst,
811  sopno stopst)
812 {
813  /* Quickly skip over fixed character matches at the start. */
814  const char *p = start;
815  for (; startst < stopst; ++startst) {
816  int hard = 0;
817  sop s = m->g->strip[startst];
818  switch (OP(s)) {
819  case OLPAREN:
820  case ORPAREN:
821  /* Not relevant here. */
822  break;
823  case OCHAR:
824  if (p == stop || *p != (char)OPND(s))
825  return NULL;
826  ++p;
827  break;
828  default:
829  hard = 1;
830  break;
831  }
832  if (hard)
833  break;
834  }
835 
836  states st = m->st;
837  states empty = m->empty;
838  states tmp = m->tmp;
839  int c = (p == m->beginp) ? OUT : *(p-1);
840  int lastc; /* previous c */
841  int flagch;
842  int i;
843  const char *matchp; /* last p at which a match ended */
844 
845  AT("slow", start, stop, startst, stopst);
846  CLEAR(st);
847  SET1(st, startst);
848  SP("sstart", st, *p);
849  st = step(m->g, startst, stopst, st, NOTHING, st);
850  matchp = NULL;
851  for (;;) {
852  /* next character */
853  lastc = c;
854  c = (p == m->endp) ? OUT : *p;
855 
856  /* is there an EOL and/or BOL between lastc and c? */
857  flagch = '\0';
858  i = 0;
859  if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
860  (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
861  flagch = BOL;
862  i = m->g->nbol;
863  }
864  if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
865  (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
866  flagch = (flagch == BOL) ? BOLEOL : EOL;
867  i += m->g->neol;
868  }
869  if (i != 0) {
870  for (; i > 0; i--)
871  st = step(m->g, startst, stopst, st, flagch, st);
872  SP("sboleol", st, c);
873  }
874 
875  /* how about a word boundary? */
876  if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
877  (c != OUT && ISWORD(c)) ) {
878  flagch = BOW;
879  }
880  if ( (lastc != OUT && ISWORD(lastc)) &&
881  (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
882  flagch = EOW;
883  }
884  if (flagch == BOW || flagch == EOW) {
885  st = step(m->g, startst, stopst, st, flagch, st);
886  SP("sboweow", st, c);
887  }
888 
889  /* are we done? */
890  if (ISSET(st, stopst))
891  matchp = p;
892  if (EQ(st, empty) || p == stop)
893  break; /* NOTE BREAK OUT */
894 
895  /* no, we must deal with this character */
896  ASSIGN(tmp, st);
897  ASSIGN(st, empty);
898  assert(c != OUT);
899  st = step(m->g, startst, stopst, tmp, c, st);
900  SP("saft", st, c);
901  assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
902  p++;
903  }
904 
905  return(matchp);
906 }
907 
908 
909 /*
910  - step - map set of states reachable before char to set reachable after
911  */
912 static states
913 step(struct re_guts *g,
914  sopno start, /* start state within strip */
915  sopno stop, /* state after stop state within strip */
916  states bef, /* states reachable before */
917  int ch, /* character or NONCHAR code */
918  states aft) /* states already known reachable after */
919 {
920  cset *cs;
921  sop s;
922  sopno pc;
923  onestate here; /* note, macros know this name */
924  sopno look;
925  int i;
926 
927  for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
928  s = g->strip[pc];
929  switch (OP(s)) {
930  case OEND:
931  assert(pc == stop-1);
932  break;
933  case OCHAR:
934  /* only characters can match */
935  assert(!NONCHAR(ch) || ch != (char)OPND(s));
936  if (ch == (char)OPND(s))
937  FWD(aft, bef, 1);
938  break;
939  case OBOL:
940  if (ch == BOL || ch == BOLEOL)
941  FWD(aft, bef, 1);
942  break;
943  case OEOL:
944  if (ch == EOL || ch == BOLEOL)
945  FWD(aft, bef, 1);
946  break;
947  case OBOW:
948  if (ch == BOW)
949  FWD(aft, bef, 1);
950  break;
951  case OEOW:
952  if (ch == EOW)
953  FWD(aft, bef, 1);
954  break;
955  case OANY:
956  if (!NONCHAR(ch))
957  FWD(aft, bef, 1);
958  break;
959  case OANYOF:
960  cs = &g->sets[OPND(s)];
961  if (!NONCHAR(ch) && CHIN(cs, ch))
962  FWD(aft, bef, 1);
963  break;
964  case OBACK_: /* ignored here */
965  case O_BACK:
966  FWD(aft, aft, 1);
967  break;
968  case OPLUS_: /* forward, this is just an empty */
969  FWD(aft, aft, 1);
970  break;
971  case O_PLUS: /* both forward and back */
972  FWD(aft, aft, 1);
973  i = ISSETBACK(aft, OPND(s));
974  BACK(aft, aft, OPND(s));
975  if (!i && ISSETBACK(aft, OPND(s))) {
976  /* oho, must reconsider loop body */
977  pc -= OPND(s) + 1;
978  INIT(here, pc);
979  }
980  break;
981  case OQUEST_: /* two branches, both forward */
982  FWD(aft, aft, 1);
983  FWD(aft, aft, OPND(s));
984  break;
985  case O_QUEST: /* just an empty */
986  FWD(aft, aft, 1);
987  break;
988  case OLPAREN: /* not significant here */
989  case ORPAREN:
990  FWD(aft, aft, 1);
991  break;
992  case OCH_: /* mark the first two branches */
993  FWD(aft, aft, 1);
994  assert(OP(g->strip[pc+OPND(s)]) == OOR2);
995  FWD(aft, aft, OPND(s));
996  break;
997  case OOR1: /* done a branch, find the O_CH */
998  if (ISSTATEIN(aft, here)) {
999  for (look = 1;
1000  OP(s = g->strip[pc+look]) != O_CH;
1001  look += OPND(s))
1002  assert(OP(s) == OOR2);
1003  FWD(aft, aft, look);
1004  }
1005  break;
1006  case OOR2: /* propagate OCH_'s marking */
1007  FWD(aft, aft, 1);
1008  if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1009  assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1010  FWD(aft, aft, OPND(s));
1011  }
1012  break;
1013  case O_CH: /* just empty */
1014  FWD(aft, aft, 1);
1015  break;
1016  default: /* ooooops... */
1017  assert(nope);
1018  break;
1019  }
1020  }
1021 
1022  return(aft);
1023 }
1024 
1025 #ifdef REDEBUG
1026 /*
1027  - print - print a set of states
1028  */
1029 static void
1030 print(struct match *m, const char *caption, states st, int ch, FILE *d)
1031 {
1032  struct re_guts *g = m->g;
1033  int i;
1034  int first = 1;
1035 
1036  if (!(m->eflags&REG_TRACE))
1037  return;
1038 
1039  (void)fprintf(d, "%s", caption);
1040  if (ch != '\0')
1041  (void)fprintf(d, " %s", pchar(ch));
1042  for (i = 0; i < g->nstates; i++)
1043  if (ISSET(st, i)) {
1044  (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1045  first = 0;
1046  }
1047  (void)fprintf(d, "\n");
1048 }
1049 
1050 /*
1051  - at - print current situation
1052  */
1053 static void
1054 at(struct match *m, const char *title, const char *start, const char *stop,
1055  sopno startst, sopno stopst)
1056 {
1057  if (!(m->eflags&REG_TRACE))
1058  return;
1059 
1060  (void)printf("%s %s-", title, pchar(*start));
1061  (void)printf("%s ", pchar(*stop));
1062  (void)printf("%ld-%ld\n", (long)startst, (long)stopst);
1063 }
1064 
1065 #ifndef PCHARDONE
1066 #define PCHARDONE /* never again */
1067 /*
1068  - pchar - make a character printable
1069  *
1070  * Is this identical to regchar() over in debug.c? Well, yes. But a
1071  * duplicate here avoids having a debugging-capable regexec.o tied to
1072  * a matching debug.o, and this is convenient. It all disappears in
1073  * the non-debug compilation anyway, so it doesn't matter much.
1074  */
1075 static char * /* -> representation */
1076 pchar(int ch)
1077 {
1078  static char pbuf[10];
1079 
1080  if (isprint(ch) || ch == ' ')
1081  (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1082  else
1083  (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1084  return(pbuf);
1085 }
1086 #endif
1087 #endif
1088 
1089 #undef matcher
1090 #undef fast
1091 #undef slow
1092 #undef dissect
1093 #undef backref
1094 #undef step
1095 #undef print
1096 #undef at
1097 #undef match
1098 #undef nope
1099 #undef step_back
i
i
Definition: README.txt:29
ISSETBACK
#define ISSETBACK(v, n)
Definition: regexec.c:127
STATEVARS
#define STATEVARS
Definition: regexec.c:113
llvm_regmatch_t::rm_so
llvm_regoff_t rm_so
Definition: regex_impl.h:44
print
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Definition: ArchiveWriter.cpp:189
SETUP
#define SETUP(v)
Definition: regexec.c:118
O_PLUS
#define O_PLUS
Definition: regex2.h:87
REG_STARTEND
#define REG_STARTEND
Definition: regex_impl.h:88
REG_NOSUB
#define REG_NOSUB
Definition: regex_impl.h:59
llvm_regmatch_t::rm_eo
llvm_regoff_t rm_eo
Definition: regex_impl.h:45
memcmp
Merge contiguous icmps into a memcmp
Definition: MergeICmps.cpp:908
OANY
#define OANY
Definition: regex2.h:82
at
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional ldr LCPI1_0 ldr ldr tst movne lsr ldr LCPI1_1 and r0 bx lr it saves an instruction and a register It might be profitable to cse MOVi16 if there are lots of bit immediates with the same bottom half Robert Muth started working on an alternate jump table implementation that does not put the tables in line in the text This is more like the llvm default jump table implementation This might be useful sometime Several revisions of patches are on the mailing beginning at
Definition: README.txt:582
g
should just be implemented with a CLZ instruction Since there are other e g
Definition: README.txt:709
OCH_
#define OCH_
Definition: regex2.h:92
tmp
alloca< 16 x float >, align 16 %tmp2=alloca< 16 x float >, align 16 store< 16 x float > %A,< 16 x float > *%tmp %s=bitcast< 16 x float > *%tmp to i8 *%s2=bitcast< 16 x float > *%tmp2 to i8 *call void @llvm.memcpy.i64(i8 *%s, i8 *%s2, i64 64, i32 16) %R=load< 16 x float > *%tmp2 ret< 16 x float > %R } declare void @llvm.memcpy.i64(i8 *nocapture, i8 *nocapture, i64, i32) nounwind which compiles to:_foo:subl $140, %esp movaps %xmm3, 112(%esp) movaps %xmm2, 96(%esp) movaps %xmm1, 80(%esp) movaps %xmm0, 64(%esp) movl 60(%esp), %eax movl %eax, 124(%esp) movl 56(%esp), %eax movl %eax, 120(%esp) movl 52(%esp), %eax< many many more 32-bit copies > movaps(%esp), %xmm0 movaps 16(%esp), %xmm1 movaps 32(%esp), %xmm2 movaps 48(%esp), %xmm3 addl $140, %esp ret On Nehalem, it may even be cheaper to just use movups when unaligned than to fall back to lower-granularity chunks. Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations:1. ix86_pad_returns inserts a noop before ret instructions if immediately preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. The first one is done for all AMDs, Core2, and "Generic" The second one is done for:Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, Core 2, and "Generic" Testcase:int x(int a) { return(a &0xf0)> >4 tmp
Definition: README.txt:1347
OBACK_
#define OBACK_
Definition: regex2.h:84
p
the resulting code requires compare and branches when and if * p
Definition: README.txt:396
OOR2
#define OOR2
Definition: regex2.h:94
INC
#define INC(o)
Definition: regexec.c:121
re_guts
Definition: regex2.h:132
OBOW
#define OBOW
Definition: regex2.h:96
here
A predicate compare being used in a select_cc should have the same peephole applied to it as a predicate compare used by a br_cc There should be no mfcr here
Definition: README_ALTIVEC.txt:147
ORPAREN
#define ORPAREN
Definition: regex2.h:91
llvm_regoff_t
off_t llvm_regoff_t
Definition: regex_impl.h:42
OEOW
#define OEOW
Definition: regex2.h:97
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
sopno
long sopno
Definition: regex2.h:69
OANYOF
#define OANYOF
Definition: regex2.h:83
O_CH
#define O_CH
Definition: regex2.h:95
EQ
#define EQ(a, b)
Definition: regexec.c:112
OUT
#define OUT
Definition: regex2.h:162
SET1
#define SET1(v, n)
Definition: regexec.c:109
states
#define states
Definition: regexec.c:106
REG_TRACE
#define REG_TRACE
Definition: regex_impl.h:89
OCHAR
#define OCHAR
Definition: regex2.h:79
ISSET
#define ISSET(v, n)
Definition: regexec.c:110
sop
unsigned long sop
Definition: regex2.h:68
OLPAREN
#define OLPAREN
Definition: regex2.h:90
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
OBOL
#define OBOL
Definition: regex2.h:80
STATESETUP
#define STATESETUP(m, n)
Definition: regexec.c:114
REG_NOTBOL
#define REG_NOTBOL
Definition: regex_impl.h:86
INIT
#define INIT(o, n)
Definition: regexec.c:120
CHIN
#define CHIN(cs, c)
Definition: regex2.h:121
ASSIGN
#define ASSIGN(d, s)
Definition: regexec.c:111
s
multiplies can be turned into SHL s
Definition: README.txt:370
REG_NEWLINE
#define REG_NEWLINE
Definition: regex_impl.h:60
REG_NOMATCH
#define REG_NOMATCH
Definition: regex_impl.h:66
O_BACK
#define O_BACK
Definition: regex2.h:85
FWD
#define FWD(dst, src, n)
Definition: regexec.c:125
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
CLEAR
#define CLEAR(v)
Definition: regexec.c:107
SOP
#define SOP(op, opnd)
Definition: regex2.h:75
OP
#define OP(n)
Definition: regex2.h:73
O_QUEST
#define O_QUEST
Definition: regex2.h:89
cset
Definition: regex2.h:111
STATETEARDOWN
#define STATETEARDOWN(m)
Definition: regexec.c:117
onestate
#define onestate
Definition: regexec.c:119
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
ISSTATEIN
#define ISSTATEIN(v, o)
Definition: regexec.c:122
ISWORD
#define ISWORD(c)
Definition: regex2.h:163
BACK
#define BACK(dst, src, n)
Definition: regexec.c:126
OEOL
#define OEOL
Definition: regex2.h:81
pc
add sub stmia L5 ldr pc
Definition: README.txt:201
sp
_bar sp
Definition: README.txt:151
REG_ESPACE
#define REG_ESPACE
Definition: regex_impl.h:77
OPND
#define OPND(n)
Definition: regex2.h:74
REG_NOTEOL
#define REG_NOTEOL
Definition: regex_impl.h:87
llvm_regmatch_t
Definition: regex_impl.h:43
OOR1
#define OOR1
Definition: regex2.h:93
REG_INVARG
#define REG_INVARG
Definition: regex_impl.h:81
OEND
#define OEND
Definition: regex2.h:78
matcher
Code Generation Notes for reduce the size of the ISel matcher
Definition: MSA.txt:5
REG_BACKR
#define REG_BACKR
Definition: regex_impl.h:91
d
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int int d
Definition: README.txt:418
OPLUS_
#define OPLUS_
Definition: regex2.h:86
OQUEST_
#define OQUEST_
Definition: regex2.h:88
SpecialSubKind::string
@ string