Black Lives Matter. Support the Equal Justice Initiative.

Source file src/regexp/backtrack.go

Documentation: regexp

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // backtrack is a regular expression search with submatch
     6  // tracking for small regular expressions and texts. It allocates
     7  // a bit vector with (length of input) * (length of prog) bits,
     8  // to make sure it never explores the same (character position, instruction)
     9  // state multiple times. This limits the search to run in time linear in
    10  // the length of the test.
    11  //
    12  // backtrack is a fast replacement for the NFA code on small
    13  // regexps when onepass cannot be used.
    14  
    15  package regexp
    16  
    17  import (
    18  	"regexp/syntax"
    19  	"sync"
    20  )
    21  
    22  // A job is an entry on the backtracker's job stack. It holds
    23  // the instruction pc and the position in the input.
    24  type job struct {
    25  	pc  uint32
    26  	arg bool
    27  	pos int
    28  }
    29  
    30  const (
    31  	visitedBits        = 32
    32  	maxBacktrackProg   = 500        // len(prog.Inst) <= max
    33  	maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
    34  )
    35  
    36  // bitState holds state for the backtracker.
    37  type bitState struct {
    38  	end      int
    39  	cap      []int
    40  	matchcap []int
    41  	jobs     []job
    42  	visited  []uint32
    43  
    44  	inputs inputs
    45  }
    46  
    47  var bitStatePool sync.Pool
    48  
    49  func newBitState() *bitState {
    50  	b, ok := bitStatePool.Get().(*bitState)
    51  	if !ok {
    52  		b = new(bitState)
    53  	}
    54  	return b
    55  }
    56  
    57  func freeBitState(b *bitState) {
    58  	b.inputs.clear()
    59  	bitStatePool.Put(b)
    60  }
    61  
    62  // maxBitStateLen returns the maximum length of a string to search with
    63  // the backtracker using prog.
    64  func maxBitStateLen(prog *syntax.Prog) int {
    65  	if !shouldBacktrack(prog) {
    66  		return 0
    67  	}
    68  	return maxBacktrackVector / len(prog.Inst)
    69  }
    70  
    71  // shouldBacktrack reports whether the program is too
    72  // long for the backtracker to run.
    73  func shouldBacktrack(prog *syntax.Prog) bool {
    74  	return len(prog.Inst) <= maxBacktrackProg
    75  }
    76  
    77  // reset resets the state of the backtracker.
    78  // end is the end position in the input.
    79  // ncap is the number of captures.
    80  func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
    81  	b.end = end
    82  
    83  	if cap(b.jobs) == 0 {
    84  		b.jobs = make([]job, 0, 256)
    85  	} else {
    86  		b.jobs = b.jobs[:0]
    87  	}
    88  
    89  	visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
    90  	if cap(b.visited) < visitedSize {
    91  		b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
    92  	} else {
    93  		b.visited = b.visited[:visitedSize]
    94  		for i := range b.visited {
    95  			b.visited[i] = 0
    96  		}
    97  	}
    98  
    99  	if cap(b.cap) < ncap {
   100  		b.cap = make([]int, ncap)
   101  	} else {
   102  		b.cap = b.cap[:ncap]
   103  	}
   104  	for i := range b.cap {
   105  		b.cap[i] = -1
   106  	}
   107  
   108  	if cap(b.matchcap) < ncap {
   109  		b.matchcap = make([]int, ncap)
   110  	} else {
   111  		b.matchcap = b.matchcap[:ncap]
   112  	}
   113  	for i := range b.matchcap {
   114  		b.matchcap[i] = -1
   115  	}
   116  }
   117  
   118  // shouldVisit reports whether the combination of (pc, pos) has not
   119  // been visited yet.
   120  func (b *bitState) shouldVisit(pc uint32, pos int) bool {
   121  	n := uint(int(pc)*(b.end+1) + pos)
   122  	if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
   123  		return false
   124  	}
   125  	b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
   126  	return true
   127  }
   128  
   129  // push pushes (pc, pos, arg) onto the job stack if it should be
   130  // visited.
   131  func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
   132  	// Only check shouldVisit when arg is false.
   133  	// When arg is true, we are continuing a previous visit.
   134  	if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
   135  		b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
   136  	}
   137  }
   138  
   139  // tryBacktrack runs a backtracking search starting at pos.
   140  func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
   141  	longest := re.longest
   142  
   143  	b.push(re, pc, pos, false)
   144  	for len(b.jobs) > 0 {
   145  		l := len(b.jobs) - 1
   146  		// Pop job off the stack.
   147  		pc := b.jobs[l].pc
   148  		pos := b.jobs[l].pos
   149  		arg := b.jobs[l].arg
   150  		b.jobs = b.jobs[:l]
   151  
   152  		// Optimization: rather than push and pop,
   153  		// code that is going to Push and continue
   154  		// the loop simply updates ip, p, and arg
   155  		// and jumps to CheckAndLoop. We have to
   156  		// do the ShouldVisit check that Push
   157  		// would have, but we avoid the stack
   158  		// manipulation.
   159  		goto Skip
   160  	CheckAndLoop:
   161  		if !b.shouldVisit(pc, pos) {
   162  			continue
   163  		}
   164  	Skip:
   165  
   166  		inst := re.prog.Inst[pc]
   167  
   168  		switch inst.Op {
   169  		default:
   170  			panic("bad inst")
   171  		case syntax.InstFail:
   172  			panic("unexpected InstFail")
   173  		case syntax.InstAlt:
   174  			// Cannot just
   175  			//   b.push(inst.Out, pos, false)
   176  			//   b.push(inst.Arg, pos, false)
   177  			// If during the processing of inst.Out, we encounter
   178  			// inst.Arg via another path, we want to process it then.
   179  			// Pushing it here will inhibit that. Instead, re-push
   180  			// inst with arg==true as a reminder to push inst.Arg out
   181  			// later.
   182  			if arg {
   183  				// Finished inst.Out; try inst.Arg.
   184  				arg = false
   185  				pc = inst.Arg
   186  				goto CheckAndLoop
   187  			} else {
   188  				b.push(re, pc, pos, true)
   189  				pc = inst.Out
   190  				goto CheckAndLoop
   191  			}
   192  
   193  		case syntax.InstAltMatch:
   194  			// One opcode consumes runes; the other leads to match.
   195  			switch re.prog.Inst[inst.Out].Op {
   196  			case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   197  				// inst.Arg is the match.
   198  				b.push(re, inst.Arg, pos, false)
   199  				pc = inst.Arg
   200  				pos = b.end
   201  				goto CheckAndLoop
   202  			}
   203  			// inst.Out is the match - non-greedy
   204  			b.push(re, inst.Out, b.end, false)
   205  			pc = inst.Out
   206  			goto CheckAndLoop
   207  
   208  		case syntax.InstRune:
   209  			r, width := i.step(pos)
   210  			if !inst.MatchRune(r) {
   211  				continue
   212  			}
   213  			pos += width
   214  			pc = inst.Out
   215  			goto CheckAndLoop
   216  
   217  		case syntax.InstRune1:
   218  			r, width := i.step(pos)
   219  			if r != inst.Rune[0] {
   220  				continue
   221  			}
   222  			pos += width
   223  			pc = inst.Out
   224  			goto CheckAndLoop
   225  
   226  		case syntax.InstRuneAnyNotNL:
   227  			r, width := i.step(pos)
   228  			if r == '\n' || r == endOfText {
   229  				continue
   230  			}
   231  			pos += width
   232  			pc = inst.Out
   233  			goto CheckAndLoop
   234  
   235  		case syntax.InstRuneAny:
   236  			r, width := i.step(pos)
   237  			if r == endOfText {
   238  				continue
   239  			}
   240  			pos += width
   241  			pc = inst.Out
   242  			goto CheckAndLoop
   243  
   244  		case syntax.InstCapture:
   245  			if arg {
   246  				// Finished inst.Out; restore the old value.
   247  				b.cap[inst.Arg] = pos
   248  				continue
   249  			} else {
   250  				if inst.Arg < uint32(len(b.cap)) {
   251  					// Capture pos to register, but save old value.
   252  					b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
   253  					b.cap[inst.Arg] = pos
   254  				}
   255  				pc = inst.Out
   256  				goto CheckAndLoop
   257  			}
   258  
   259  		case syntax.InstEmptyWidth:
   260  			flag := i.context(pos)
   261  			if !flag.match(syntax.EmptyOp(inst.Arg)) {
   262  				continue
   263  			}
   264  			pc = inst.Out
   265  			goto CheckAndLoop
   266  
   267  		case syntax.InstNop:
   268  			pc = inst.Out
   269  			goto CheckAndLoop
   270  
   271  		case syntax.InstMatch:
   272  			// We found a match. If the caller doesn't care
   273  			// where the match is, no point going further.
   274  			if len(b.cap) == 0 {
   275  				return true
   276  			}
   277  
   278  			// Record best match so far.
   279  			// Only need to check end point, because this entire
   280  			// call is only considering one start position.
   281  			if len(b.cap) > 1 {
   282  				b.cap[1] = pos
   283  			}
   284  			if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
   285  				copy(b.matchcap, b.cap)
   286  			}
   287  
   288  			// If going for first match, we're done.
   289  			if !longest {
   290  				return true
   291  			}
   292  
   293  			// If we used the entire text, no longer match is possible.
   294  			if pos == b.end {
   295  				return true
   296  			}
   297  
   298  			// Otherwise, continue on in hope of a longer match.
   299  			continue
   300  		}
   301  	}
   302  
   303  	return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
   304  }
   305  
   306  // backtrack runs a backtracking search of prog on the input starting at pos.
   307  func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
   308  	startCond := re.cond
   309  	if startCond == ^syntax.EmptyOp(0) { // impossible
   310  		return nil
   311  	}
   312  	if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
   313  		// Anchored match, past beginning of text.
   314  		return nil
   315  	}
   316  
   317  	b := newBitState()
   318  	i, end := b.inputs.init(nil, ib, is)
   319  	b.reset(re.prog, end, ncap)
   320  
   321  	// Anchored search must start at the beginning of the input
   322  	if startCond&syntax.EmptyBeginText != 0 {
   323  		if len(b.cap) > 0 {
   324  			b.cap[0] = pos
   325  		}
   326  		if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   327  			freeBitState(b)
   328  			return nil
   329  		}
   330  	} else {
   331  
   332  		// Unanchored search, starting from each possible text position.
   333  		// Notice that we have to try the empty string at the end of
   334  		// the text, so the loop condition is pos <= end, not pos < end.
   335  		// This looks like it's quadratic in the size of the text,
   336  		// but we are not clearing visited between calls to TrySearch,
   337  		// so no work is duplicated and it ends up still being linear.
   338  		width := -1
   339  		for ; pos <= end && width != 0; pos += width {
   340  			if len(re.prefix) > 0 {
   341  				// Match requires literal prefix; fast search for it.
   342  				advance := i.index(re, pos)
   343  				if advance < 0 {
   344  					freeBitState(b)
   345  					return nil
   346  				}
   347  				pos += advance
   348  			}
   349  
   350  			if len(b.cap) > 0 {
   351  				b.cap[0] = pos
   352  			}
   353  			if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   354  				// Match must be leftmost; done.
   355  				goto Match
   356  			}
   357  			_, width = i.step(pos)
   358  		}
   359  		freeBitState(b)
   360  		return nil
   361  	}
   362  
   363  Match:
   364  	dstCap = append(dstCap, b.matchcap...)
   365  	freeBitState(b)
   366  	return dstCap
   367  }
   368  

View as plain text