...

Source file src/regexp/backtrack.go

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// backtrack is a regular expression search with submatch
     6	// tracking for small regular expressions and texts. It allocates
     7	// a bit vector with (length of input) * (length of prog) bits,
     8	// to make sure it never explores the same (character position, instruction)
     9	// state multiple times. This limits the search to run in time linear in
    10	// the length of the test.
    11	//
    12	// backtrack is a fast replacement for the NFA code on small
    13	// regexps when onepass cannot be used.
    14	
    15	package regexp
    16	
    17	import (
    18		"regexp/syntax"
    19		"sync"
    20	)
    21	
    22	// A job is an entry on the backtracker's job stack. It holds
    23	// the instruction pc and the position in the input.
    24	type job struct {
    25		pc  uint32
    26		arg bool
    27		pos int
    28	}
    29	
    30	const (
    31		visitedBits        = 32
    32		maxBacktrackProg   = 500        // len(prog.Inst) <= max
    33		maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
    34	)
    35	
    36	// bitState holds state for the backtracker.
    37	type bitState struct {
    38		end      int
    39		cap      []int
    40		matchcap []int
    41		jobs     []job
    42		visited  []uint32
    43	
    44		inputs inputs
    45	}
    46	
    47	var bitStatePool sync.Pool
    48	
    49	func newBitState() *bitState {
    50		b, ok := bitStatePool.Get().(*bitState)
    51		if !ok {
    52			b = new(bitState)
    53		}
    54		return b
    55	}
    56	
    57	func freeBitState(b *bitState) {
    58		b.inputs.clear()
    59		bitStatePool.Put(b)
    60	}
    61	
    62	// maxBitStateLen returns the maximum length of a string to search with
    63	// the backtracker using prog.
    64	func maxBitStateLen(prog *syntax.Prog) int {
    65		if !shouldBacktrack(prog) {
    66			return 0
    67		}
    68		return maxBacktrackVector / len(prog.Inst)
    69	}
    70	
    71	// shouldBacktrack reports whether the program is too
    72	// long for the backtracker to run.
    73	func shouldBacktrack(prog *syntax.Prog) bool {
    74		return len(prog.Inst) <= maxBacktrackProg
    75	}
    76	
    77	// reset resets the state of the backtracker.
    78	// end is the end position in the input.
    79	// ncap is the number of captures.
    80	func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) {
    81		b.end = end
    82	
    83		if cap(b.jobs) == 0 {
    84			b.jobs = make([]job, 0, 256)
    85		} else {
    86			b.jobs = b.jobs[:0]
    87		}
    88	
    89		visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
    90		if cap(b.visited) < visitedSize {
    91			b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
    92		} else {
    93			b.visited = b.visited[:visitedSize]
    94			for i := range b.visited {
    95				b.visited[i] = 0
    96			}
    97		}
    98	
    99		if cap(b.cap) < ncap {
   100			b.cap = make([]int, ncap)
   101		} else {
   102			b.cap = b.cap[:ncap]
   103		}
   104		for i := range b.cap {
   105			b.cap[i] = -1
   106		}
   107	
   108		if cap(b.matchcap) < ncap {
   109			b.matchcap = make([]int, ncap)
   110		} else {
   111			b.matchcap = b.matchcap[:ncap]
   112		}
   113		for i := range b.matchcap {
   114			b.matchcap[i] = -1
   115		}
   116	}
   117	
   118	// shouldVisit reports whether the combination of (pc, pos) has not
   119	// been visited yet.
   120	func (b *bitState) shouldVisit(pc uint32, pos int) bool {
   121		n := uint(int(pc)*(b.end+1) + pos)
   122		if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
   123			return false
   124		}
   125		b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
   126		return true
   127	}
   128	
   129	// push pushes (pc, pos, arg) onto the job stack if it should be
   130	// visited.
   131	func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) {
   132		// Only check shouldVisit when arg is false.
   133		// When arg is true, we are continuing a previous visit.
   134		if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) {
   135			b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
   136		}
   137	}
   138	
   139	// tryBacktrack runs a backtracking search starting at pos.
   140	func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
   141		longest := re.longest
   142	
   143		b.push(re, pc, pos, false)
   144		for len(b.jobs) > 0 {
   145			l := len(b.jobs) - 1
   146			// Pop job off the stack.
   147			pc := b.jobs[l].pc
   148			pos := b.jobs[l].pos
   149			arg := b.jobs[l].arg
   150			b.jobs = b.jobs[:l]
   151	
   152			// Optimization: rather than push and pop,
   153			// code that is going to Push and continue
   154			// the loop simply updates ip, p, and arg
   155			// and jumps to CheckAndLoop. We have to
   156			// do the ShouldVisit check that Push
   157			// would have, but we avoid the stack
   158			// manipulation.
   159			goto Skip
   160		CheckAndLoop:
   161			if !b.shouldVisit(pc, pos) {
   162				continue
   163			}
   164		Skip:
   165	
   166			inst := re.prog.Inst[pc]
   167	
   168			switch inst.Op {
   169			default:
   170				panic("bad inst")
   171			case syntax.InstFail:
   172				panic("unexpected InstFail")
   173			case syntax.InstAlt:
   174				// Cannot just
   175				//   b.push(inst.Out, pos, false)
   176				//   b.push(inst.Arg, pos, false)
   177				// If during the processing of inst.Out, we encounter
   178				// inst.Arg via another path, we want to process it then.
   179				// Pushing it here will inhibit that. Instead, re-push
   180				// inst with arg==true as a reminder to push inst.Arg out
   181				// later.
   182				if arg {
   183					// Finished inst.Out; try inst.Arg.
   184					arg = false
   185					pc = inst.Arg
   186					goto CheckAndLoop
   187				} else {
   188					b.push(re, pc, pos, true)
   189					pc = inst.Out
   190					goto CheckAndLoop
   191				}
   192	
   193			case syntax.InstAltMatch:
   194				// One opcode consumes runes; the other leads to match.
   195				switch re.prog.Inst[inst.Out].Op {
   196				case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   197					// inst.Arg is the match.
   198					b.push(re, inst.Arg, pos, false)
   199					pc = inst.Arg
   200					pos = b.end
   201					goto CheckAndLoop
   202				}
   203				// inst.Out is the match - non-greedy
   204				b.push(re, inst.Out, b.end, false)
   205				pc = inst.Out
   206				goto CheckAndLoop
   207	
   208			case syntax.InstRune:
   209				r, width := i.step(pos)
   210				if !inst.MatchRune(r) {
   211					continue
   212				}
   213				pos += width
   214				pc = inst.Out
   215				goto CheckAndLoop
   216	
   217			case syntax.InstRune1:
   218				r, width := i.step(pos)
   219				if r != inst.Rune[0] {
   220					continue
   221				}
   222				pos += width
   223				pc = inst.Out
   224				goto CheckAndLoop
   225	
   226			case syntax.InstRuneAnyNotNL:
   227				r, width := i.step(pos)
   228				if r == '\n' || r == endOfText {
   229					continue
   230				}
   231				pos += width
   232				pc = inst.Out
   233				goto CheckAndLoop
   234	
   235			case syntax.InstRuneAny:
   236				r, width := i.step(pos)
   237				if r == endOfText {
   238					continue
   239				}
   240				pos += width
   241				pc = inst.Out
   242				goto CheckAndLoop
   243	
   244			case syntax.InstCapture:
   245				if arg {
   246					// Finished inst.Out; restore the old value.
   247					b.cap[inst.Arg] = pos
   248					continue
   249				} else {
   250					if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
   251						// Capture pos to register, but save old value.
   252						b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done.
   253						b.cap[inst.Arg] = pos
   254					}
   255					pc = inst.Out
   256					goto CheckAndLoop
   257				}
   258	
   259			case syntax.InstEmptyWidth:
   260				flag := i.context(pos)
   261				if !flag.match(syntax.EmptyOp(inst.Arg)) {
   262					continue
   263				}
   264				pc = inst.Out
   265				goto CheckAndLoop
   266	
   267			case syntax.InstNop:
   268				pc = inst.Out
   269				goto CheckAndLoop
   270	
   271			case syntax.InstMatch:
   272				// We found a match. If the caller doesn't care
   273				// where the match is, no point going further.
   274				if len(b.cap) == 0 {
   275					return true
   276				}
   277	
   278				// Record best match so far.
   279				// Only need to check end point, because this entire
   280				// call is only considering one start position.
   281				if len(b.cap) > 1 {
   282					b.cap[1] = pos
   283				}
   284				if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) {
   285					copy(b.matchcap, b.cap)
   286				}
   287	
   288				// If going for first match, we're done.
   289				if !longest {
   290					return true
   291				}
   292	
   293				// If we used the entire text, no longer match is possible.
   294				if pos == b.end {
   295					return true
   296				}
   297	
   298				// Otherwise, continue on in hope of a longer match.
   299				continue
   300			}
   301		}
   302	
   303		return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0
   304	}
   305	
   306	// backtrack runs a backtracking search of prog on the input starting at pos.
   307	func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int {
   308		startCond := re.cond
   309		if startCond == ^syntax.EmptyOp(0) { // impossible
   310			return nil
   311		}
   312		if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
   313			// Anchored match, past beginning of text.
   314			return nil
   315		}
   316	
   317		b := newBitState()
   318		i, end := b.inputs.init(nil, ib, is)
   319		b.reset(re.prog, end, ncap)
   320	
   321		// Anchored search must start at the beginning of the input
   322		if startCond&syntax.EmptyBeginText != 0 {
   323			if len(b.cap) > 0 {
   324				b.cap[0] = pos
   325			}
   326			if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   327				freeBitState(b)
   328				return nil
   329			}
   330		} else {
   331	
   332			// Unanchored search, starting from each possible text position.
   333			// Notice that we have to try the empty string at the end of
   334			// the text, so the loop condition is pos <= end, not pos < end.
   335			// This looks like it's quadratic in the size of the text,
   336			// but we are not clearing visited between calls to TrySearch,
   337			// so no work is duplicated and it ends up still being linear.
   338			width := -1
   339			for ; pos <= end && width != 0; pos += width {
   340				if len(re.prefix) > 0 {
   341					// Match requires literal prefix; fast search for it.
   342					advance := i.index(re, pos)
   343					if advance < 0 {
   344						freeBitState(b)
   345						return nil
   346					}
   347					pos += advance
   348				}
   349	
   350				if len(b.cap) > 0 {
   351					b.cap[0] = pos
   352				}
   353				if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) {
   354					// Match must be leftmost; done.
   355					goto Match
   356				}
   357				_, width = i.step(pos)
   358			}
   359			freeBitState(b)
   360			return nil
   361		}
   362	
   363	Match:
   364		dstCap = append(dstCap, b.matchcap...)
   365		freeBitState(b)
   366		return dstCap
   367	}
   368	

View as plain text