Source file src/pkg/regexp/onepass.go

     1	// Copyright 2014 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package regexp
     6	
     7	import (
     8		"regexp/syntax"
     9		"sort"
    10		"strings"
    11		"unicode"
    12	)
    13	
    14	// "One-pass" regexp execution.
    15	// Some regexps can be analyzed to determine that they never need
    16	// backtracking: they are guaranteed to run in one pass over the string
    17	// without bothering to save all the usual NFA state.
    18	// Detect those and execute them more quickly.
    19	
    20	// A onePassProg is a compiled one-pass regular expression program.
    21	// It is the same as syntax.Prog except for the use of onePassInst.
    22	type onePassProg struct {
    23		Inst   []onePassInst
    24		Start  int // index of start instruction
    25		NumCap int // number of InstCapture insts in re
    26	}
    27	
    28	// A onePassInst is a single instruction in a one-pass regular expression program.
    29	// It is the same as syntax.Inst except for the new 'Next' field.
    30	type onePassInst struct {
    31		syntax.Inst
    32		Next []uint32
    33	}
    34	
    35	// OnePassPrefix returns a literal string that all matches for the
    36	// regexp must start with. Complete is true if the prefix
    37	// is the entire match. Pc is the index of the last rune instruction
    38	// in the string. The OnePassPrefix skips over the mandatory
    39	// EmptyBeginText
    40	func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
    41		i := &p.Inst[p.Start]
    42		if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
    43			return "", i.Op == syntax.InstMatch, uint32(p.Start)
    44		}
    45		pc = i.Out
    46		i = &p.Inst[pc]
    47		for i.Op == syntax.InstNop {
    48			pc = i.Out
    49			i = &p.Inst[pc]
    50		}
    51		// Avoid allocation of buffer if prefix is empty.
    52		if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
    53			return "", i.Op == syntax.InstMatch, uint32(p.Start)
    54		}
    55	
    56		// Have prefix; gather characters.
    57		var buf strings.Builder
    58		for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
    59			buf.WriteRune(i.Rune[0])
    60			pc, i = i.Out, &p.Inst[i.Out]
    61		}
    62		if i.Op == syntax.InstEmptyWidth &&
    63			syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
    64			p.Inst[i.Out].Op == syntax.InstMatch {
    65			complete = true
    66		}
    67		return buf.String(), complete, pc
    68	}
    69	
    70	// OnePassNext selects the next actionable state of the prog, based on the input character.
    71	// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
    72	// One of the alternates may ultimately lead without input to end of line. If the instruction
    73	// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
    74	func onePassNext(i *onePassInst, r rune) uint32 {
    75		next := i.MatchRunePos(r)
    76		if next >= 0 {
    77			return i.Next[next]
    78		}
    79		if i.Op == syntax.InstAltMatch {
    80			return i.Out
    81		}
    82		return 0
    83	}
    84	
    85	func iop(i *syntax.Inst) syntax.InstOp {
    86		op := i.Op
    87		switch op {
    88		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
    89			op = syntax.InstRune
    90		}
    91		return op
    92	}
    93	
    94	// Sparse Array implementation is used as a queueOnePass.
    95	type queueOnePass struct {
    96		sparse          []uint32
    97		dense           []uint32
    98		size, nextIndex uint32
    99	}
   100	
   101	func (q *queueOnePass) empty() bool {
   102		return q.nextIndex >= q.size
   103	}
   104	
   105	func (q *queueOnePass) next() (n uint32) {
   106		n = q.dense[q.nextIndex]
   107		q.nextIndex++
   108		return
   109	}
   110	
   111	func (q *queueOnePass) clear() {
   112		q.size = 0
   113		q.nextIndex = 0
   114	}
   115	
   116	func (q *queueOnePass) contains(u uint32) bool {
   117		if u >= uint32(len(q.sparse)) {
   118			return false
   119		}
   120		return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
   121	}
   122	
   123	func (q *queueOnePass) insert(u uint32) {
   124		if !q.contains(u) {
   125			q.insertNew(u)
   126		}
   127	}
   128	
   129	func (q *queueOnePass) insertNew(u uint32) {
   130		if u >= uint32(len(q.sparse)) {
   131			return
   132		}
   133		q.sparse[u] = q.size
   134		q.dense[q.size] = u
   135		q.size++
   136	}
   137	
   138	func newQueue(size int) (q *queueOnePass) {
   139		return &queueOnePass{
   140			sparse: make([]uint32, size),
   141			dense:  make([]uint32, size),
   142		}
   143	}
   144	
   145	// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
   146	// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
   147	// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
   148	// NextIp array with the single element mergeFailed is returned.
   149	// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
   150	const mergeFailed = uint32(0xffffffff)
   151	
   152	var (
   153		noRune = []rune{}
   154		noNext = []uint32{mergeFailed}
   155	)
   156	
   157	func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
   158		leftLen := len(*leftRunes)
   159		rightLen := len(*rightRunes)
   160		if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
   161			panic("mergeRuneSets odd length []rune")
   162		}
   163		var (
   164			lx, rx int
   165		)
   166		merged := make([]rune, 0)
   167		next := make([]uint32, 0)
   168		ok := true
   169		defer func() {
   170			if !ok {
   171				merged = nil
   172				next = nil
   173			}
   174		}()
   175	
   176		ix := -1
   177		extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
   178			if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
   179				return false
   180			}
   181			merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
   182			*newLow += 2
   183			ix += 2
   184			next = append(next, pc)
   185			return true
   186		}
   187	
   188		for lx < leftLen || rx < rightLen {
   189			switch {
   190			case rx >= rightLen:
   191				ok = extend(&lx, leftRunes, leftPC)
   192			case lx >= leftLen:
   193				ok = extend(&rx, rightRunes, rightPC)
   194			case (*rightRunes)[rx] < (*leftRunes)[lx]:
   195				ok = extend(&rx, rightRunes, rightPC)
   196			default:
   197				ok = extend(&lx, leftRunes, leftPC)
   198			}
   199			if !ok {
   200				return noRune, noNext
   201			}
   202		}
   203		return merged, next
   204	}
   205	
   206	// cleanupOnePass drops working memory, and restores certain shortcut instructions.
   207	func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
   208		for ix, instOriginal := range original.Inst {
   209			switch instOriginal.Op {
   210			case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
   211			case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
   212				prog.Inst[ix].Next = nil
   213			case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
   214				prog.Inst[ix].Next = nil
   215				prog.Inst[ix] = onePassInst{Inst: instOriginal}
   216			}
   217		}
   218	}
   219	
   220	// onePassCopy creates a copy of the original Prog, as we'll be modifying it
   221	func onePassCopy(prog *syntax.Prog) *onePassProg {
   222		p := &onePassProg{
   223			Start:  prog.Start,
   224			NumCap: prog.NumCap,
   225			Inst:   make([]onePassInst, len(prog.Inst)),
   226		}
   227		for i, inst := range prog.Inst {
   228			p.Inst[i] = onePassInst{Inst: inst}
   229		}
   230	
   231		// rewrites one or more common Prog constructs that enable some otherwise
   232		// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
   233		// ip A, that points to ips B & C.
   234		// A:BC + B:DA => A:BC + B:CD
   235		// A:BC + B:DC => A:DC + B:DC
   236		for pc := range p.Inst {
   237			switch p.Inst[pc].Op {
   238			default:
   239				continue
   240			case syntax.InstAlt, syntax.InstAltMatch:
   241				// A:Bx + B:Ay
   242				p_A_Other := &p.Inst[pc].Out
   243				p_A_Alt := &p.Inst[pc].Arg
   244				// make sure a target is another Alt
   245				instAlt := p.Inst[*p_A_Alt]
   246				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   247					p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
   248					instAlt = p.Inst[*p_A_Alt]
   249					if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
   250						continue
   251					}
   252				}
   253				instOther := p.Inst[*p_A_Other]
   254				// Analyzing both legs pointing to Alts is for another day
   255				if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
   256					// too complicated
   257					continue
   258				}
   259				// simple empty transition loop
   260				// A:BC + B:DA => A:BC + B:DC
   261				p_B_Alt := &p.Inst[*p_A_Alt].Out
   262				p_B_Other := &p.Inst[*p_A_Alt].Arg
   263				patch := false
   264				if instAlt.Out == uint32(pc) {
   265					patch = true
   266				} else if instAlt.Arg == uint32(pc) {
   267					patch = true
   268					p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
   269				}
   270				if patch {
   271					*p_B_Alt = *p_A_Other
   272				}
   273	
   274				// empty transition to common target
   275				// A:BC + B:DC => A:DC + B:DC
   276				if *p_A_Other == *p_B_Alt {
   277					*p_A_Alt = *p_B_Other
   278				}
   279			}
   280		}
   281		return p
   282	}
   283	
   284	// runeSlice exists to permit sorting the case-folded rune sets.
   285	type runeSlice []rune
   286	
   287	func (p runeSlice) Len() int           { return len(p) }
   288	func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
   289	func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
   290	
   291	var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
   292	var anyRune = []rune{0, unicode.MaxRune}
   293	
   294	// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
   295	// the match engine can always tell which branch to take. The routine may modify
   296	// p if it is turned into a onepass Prog. If it isn't possible for this to be a
   297	// onepass Prog, the Prog nil is returned. makeOnePass is recursive
   298	// to the size of the Prog.
   299	func makeOnePass(p *onePassProg) *onePassProg {
   300		// If the machine is very long, it's not worth the time to check if we can use one pass.
   301		if len(p.Inst) >= 1000 {
   302			return nil
   303		}
   304	
   305		var (
   306			instQueue    = newQueue(len(p.Inst))
   307			visitQueue   = newQueue(len(p.Inst))
   308			check        func(uint32, []bool) bool
   309			onePassRunes = make([][]rune, len(p.Inst))
   310		)
   311	
   312		// check that paths from Alt instructions are unambiguous, and rebuild the new
   313		// program as a onepass program
   314		check = func(pc uint32, m []bool) (ok bool) {
   315			ok = true
   316			inst := &p.Inst[pc]
   317			if visitQueue.contains(pc) {
   318				return
   319			}
   320			visitQueue.insert(pc)
   321			switch inst.Op {
   322			case syntax.InstAlt, syntax.InstAltMatch:
   323				ok = check(inst.Out, m) && check(inst.Arg, m)
   324				// check no-input paths to InstMatch
   325				matchOut := m[inst.Out]
   326				matchArg := m[inst.Arg]
   327				if matchOut && matchArg {
   328					ok = false
   329					break
   330				}
   331				// Match on empty goes in inst.Out
   332				if matchArg {
   333					inst.Out, inst.Arg = inst.Arg, inst.Out
   334					matchOut, matchArg = matchArg, matchOut
   335				}
   336				if matchOut {
   337					m[pc] = true
   338					inst.Op = syntax.InstAltMatch
   339				}
   340	
   341				// build a dispatch operator from the two legs of the alt.
   342				onePassRunes[pc], inst.Next = mergeRuneSets(
   343					&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
   344				if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
   345					ok = false
   346					break
   347				}
   348			case syntax.InstCapture, syntax.InstNop:
   349				ok = check(inst.Out, m)
   350				m[pc] = m[inst.Out]
   351				// pass matching runes back through these no-ops.
   352				onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   353				inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   354				for i := range inst.Next {
   355					inst.Next[i] = inst.Out
   356				}
   357			case syntax.InstEmptyWidth:
   358				ok = check(inst.Out, m)
   359				m[pc] = m[inst.Out]
   360				onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
   361				inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   362				for i := range inst.Next {
   363					inst.Next[i] = inst.Out
   364				}
   365			case syntax.InstMatch, syntax.InstFail:
   366				m[pc] = inst.Op == syntax.InstMatch
   367			case syntax.InstRune:
   368				m[pc] = false
   369				if len(inst.Next) > 0 {
   370					break
   371				}
   372				instQueue.insert(inst.Out)
   373				if len(inst.Rune) == 0 {
   374					onePassRunes[pc] = []rune{}
   375					inst.Next = []uint32{inst.Out}
   376					break
   377				}
   378				runes := make([]rune, 0)
   379				if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   380					r0 := inst.Rune[0]
   381					runes = append(runes, r0, r0)
   382					for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   383						runes = append(runes, r1, r1)
   384					}
   385					sort.Sort(runeSlice(runes))
   386				} else {
   387					runes = append(runes, inst.Rune...)
   388				}
   389				onePassRunes[pc] = runes
   390				inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   391				for i := range inst.Next {
   392					inst.Next[i] = inst.Out
   393				}
   394				inst.Op = syntax.InstRune
   395			case syntax.InstRune1:
   396				m[pc] = false
   397				if len(inst.Next) > 0 {
   398					break
   399				}
   400				instQueue.insert(inst.Out)
   401				runes := []rune{}
   402				// expand case-folded runes
   403				if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
   404					r0 := inst.Rune[0]
   405					runes = append(runes, r0, r0)
   406					for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   407						runes = append(runes, r1, r1)
   408					}
   409					sort.Sort(runeSlice(runes))
   410				} else {
   411					runes = append(runes, inst.Rune[0], inst.Rune[0])
   412				}
   413				onePassRunes[pc] = runes
   414				inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   415				for i := range inst.Next {
   416					inst.Next[i] = inst.Out
   417				}
   418				inst.Op = syntax.InstRune
   419			case syntax.InstRuneAny:
   420				m[pc] = false
   421				if len(inst.Next) > 0 {
   422					break
   423				}
   424				instQueue.insert(inst.Out)
   425				onePassRunes[pc] = append([]rune{}, anyRune...)
   426				inst.Next = []uint32{inst.Out}
   427			case syntax.InstRuneAnyNotNL:
   428				m[pc] = false
   429				if len(inst.Next) > 0 {
   430					break
   431				}
   432				instQueue.insert(inst.Out)
   433				onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
   434				inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
   435				for i := range inst.Next {
   436					inst.Next[i] = inst.Out
   437				}
   438			}
   439			return
   440		}
   441	
   442		instQueue.clear()
   443		instQueue.insert(uint32(p.Start))
   444		m := make([]bool, len(p.Inst))
   445		for !instQueue.empty() {
   446			visitQueue.clear()
   447			pc := instQueue.next()
   448			if !check(pc, m) {
   449				p = nil
   450				break
   451			}
   452		}
   453		if p != nil {
   454			for i := range p.Inst {
   455				p.Inst[i].Rune = onePassRunes[i]
   456			}
   457		}
   458		return p
   459	}
   460	
   461	// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
   462	// can be recharacterized as a one-pass regexp program, or syntax.nil if the
   463	// Prog cannot be converted. For a one pass prog, the fundamental condition that must
   464	// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
   465	func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
   466		if prog.Start == 0 {
   467			return nil
   468		}
   469		// onepass regexp is anchored
   470		if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
   471			syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
   472			return nil
   473		}
   474		// every instruction leading to InstMatch must be EmptyEndText
   475		for _, inst := range prog.Inst {
   476			opOut := prog.Inst[inst.Out].Op
   477			switch inst.Op {
   478			default:
   479				if opOut == syntax.InstMatch {
   480					return nil
   481				}
   482			case syntax.InstAlt, syntax.InstAltMatch:
   483				if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
   484					return nil
   485				}
   486			case syntax.InstEmptyWidth:
   487				if opOut == syntax.InstMatch {
   488					if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
   489						continue
   490					}
   491					return nil
   492				}
   493			}
   494		}
   495		// Creates a slightly optimized copy of the original Prog
   496		// that cleans up some Prog idioms that block valid onepass programs
   497		p = onePassCopy(prog)
   498	
   499		// checkAmbiguity on InstAlts, build onepass Prog if possible
   500		p = makeOnePass(p)
   501	
   502		if p != nil {
   503			cleanupOnePass(p, prog)
   504		}
   505		return p
   506	}
   507
View as plain text