...

Source file src/pkg/regexp/syntax/prog.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package syntax
     6	
     7	import (
     8		"strconv"
     9		"strings"
    10		"unicode"
    11	)
    12	
    13	// Compiled program.
    14	// May not belong in this package, but convenient for now.
    15	
    16	// A Prog is a compiled regular expression program.
    17	type Prog struct {
    18		Inst   []Inst
    19		Start  int // index of start instruction
    20		NumCap int // number of InstCapture insts in re
    21	}
    22	
    23	// An InstOp is an instruction opcode.
    24	type InstOp uint8
    25	
    26	const (
    27		InstAlt InstOp = iota
    28		InstAltMatch
    29		InstCapture
    30		InstEmptyWidth
    31		InstMatch
    32		InstFail
    33		InstNop
    34		InstRune
    35		InstRune1
    36		InstRuneAny
    37		InstRuneAnyNotNL
    38	)
    39	
    40	var instOpNames = []string{
    41		"InstAlt",
    42		"InstAltMatch",
    43		"InstCapture",
    44		"InstEmptyWidth",
    45		"InstMatch",
    46		"InstFail",
    47		"InstNop",
    48		"InstRune",
    49		"InstRune1",
    50		"InstRuneAny",
    51		"InstRuneAnyNotNL",
    52	}
    53	
    54	func (i InstOp) String() string {
    55		if uint(i) >= uint(len(instOpNames)) {
    56			return ""
    57		}
    58		return instOpNames[i]
    59	}
    60	
    61	// An EmptyOp specifies a kind or mixture of zero-width assertions.
    62	type EmptyOp uint8
    63	
    64	const (
    65		EmptyBeginLine EmptyOp = 1 << iota
    66		EmptyEndLine
    67		EmptyBeginText
    68		EmptyEndText
    69		EmptyWordBoundary
    70		EmptyNoWordBoundary
    71	)
    72	
    73	// EmptyOpContext returns the zero-width assertions
    74	// satisfied at the position between the runes r1 and r2.
    75	// Passing r1 == -1 indicates that the position is
    76	// at the beginning of the text.
    77	// Passing r2 == -1 indicates that the position is
    78	// at the end of the text.
    79	func EmptyOpContext(r1, r2 rune) EmptyOp {
    80		var op EmptyOp = EmptyNoWordBoundary
    81		var boundary byte
    82		switch {
    83		case IsWordChar(r1):
    84			boundary = 1
    85		case r1 == '\n':
    86			op |= EmptyBeginLine
    87		case r1 < 0:
    88			op |= EmptyBeginText | EmptyBeginLine
    89		}
    90		switch {
    91		case IsWordChar(r2):
    92			boundary ^= 1
    93		case r2 == '\n':
    94			op |= EmptyEndLine
    95		case r2 < 0:
    96			op |= EmptyEndText | EmptyEndLine
    97		}
    98		if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
    99			op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
   100		}
   101		return op
   102	}
   103	
   104	// IsWordChar reports whether r is consider a ``word character''
   105	// during the evaluation of the \b and \B zero-width assertions.
   106	// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
   107	func IsWordChar(r rune) bool {
   108		return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
   109	}
   110	
   111	// An Inst is a single instruction in a regular expression program.
   112	type Inst struct {
   113		Op   InstOp
   114		Out  uint32 // all but InstMatch, InstFail
   115		Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
   116		Rune []rune
   117	}
   118	
   119	func (p *Prog) String() string {
   120		var b strings.Builder
   121		dumpProg(&b, p)
   122		return b.String()
   123	}
   124	
   125	// skipNop follows any no-op or capturing instructions.
   126	func (p *Prog) skipNop(pc uint32) *Inst {
   127		i := &p.Inst[pc]
   128		for i.Op == InstNop || i.Op == InstCapture {
   129			i = &p.Inst[i.Out]
   130		}
   131		return i
   132	}
   133	
   134	// op returns i.Op but merges all the Rune special cases into InstRune
   135	func (i *Inst) op() InstOp {
   136		op := i.Op
   137		switch op {
   138		case InstRune1, InstRuneAny, InstRuneAnyNotNL:
   139			op = InstRune
   140		}
   141		return op
   142	}
   143	
   144	// Prefix returns a literal string that all matches for the
   145	// regexp must start with. Complete is true if the prefix
   146	// is the entire match.
   147	func (p *Prog) Prefix() (prefix string, complete bool) {
   148		i := p.skipNop(uint32(p.Start))
   149	
   150		// Avoid allocation of buffer if prefix is empty.
   151		if i.op() != InstRune || len(i.Rune) != 1 {
   152			return "", i.Op == InstMatch
   153		}
   154	
   155		// Have prefix; gather characters.
   156		var buf strings.Builder
   157		for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
   158			buf.WriteRune(i.Rune[0])
   159			i = p.skipNop(i.Out)
   160		}
   161		return buf.String(), i.Op == InstMatch
   162	}
   163	
   164	// StartCond returns the leading empty-width conditions that must
   165	// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
   166	func (p *Prog) StartCond() EmptyOp {
   167		var flag EmptyOp
   168		pc := uint32(p.Start)
   169		i := &p.Inst[pc]
   170	Loop:
   171		for {
   172			switch i.Op {
   173			case InstEmptyWidth:
   174				flag |= EmptyOp(i.Arg)
   175			case InstFail:
   176				return ^EmptyOp(0)
   177			case InstCapture, InstNop:
   178				// skip
   179			default:
   180				break Loop
   181			}
   182			pc = i.Out
   183			i = &p.Inst[pc]
   184		}
   185		return flag
   186	}
   187	
   188	const noMatch = -1
   189	
   190	// MatchRune reports whether the instruction matches (and consumes) r.
   191	// It should only be called when i.Op == InstRune.
   192	func (i *Inst) MatchRune(r rune) bool {
   193		return i.MatchRunePos(r) != noMatch
   194	}
   195	
   196	// MatchRunePos checks whether the instruction matches (and consumes) r.
   197	// If so, MatchRunePos returns the index of the matching rune pair
   198	// (or, when len(i.Rune) == 1, rune singleton).
   199	// If not, MatchRunePos returns -1.
   200	// MatchRunePos should only be called when i.Op == InstRune.
   201	func (i *Inst) MatchRunePos(r rune) int {
   202		rune := i.Rune
   203	
   204		switch len(rune) {
   205		case 0:
   206			return noMatch
   207	
   208		case 1:
   209			// Special case: single-rune slice is from literal string, not char class.
   210			r0 := rune[0]
   211			if r == r0 {
   212				return 0
   213			}
   214			if Flags(i.Arg)&FoldCase != 0 {
   215				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
   216					if r == r1 {
   217						return 0
   218					}
   219				}
   220			}
   221			return noMatch
   222	
   223		case 2:
   224			if r >= rune[0] && r <= rune[1] {
   225				return 0
   226			}
   227			return noMatch
   228	
   229		case 4, 6, 8:
   230			// Linear search for a few pairs.
   231			// Should handle ASCII well.
   232			for j := 0; j < len(rune); j += 2 {
   233				if r < rune[j] {
   234					return noMatch
   235				}
   236				if r <= rune[j+1] {
   237					return j / 2
   238				}
   239			}
   240			return noMatch
   241		}
   242	
   243		// Otherwise binary search.
   244		lo := 0
   245		hi := len(rune) / 2
   246		for lo < hi {
   247			m := lo + (hi-lo)/2
   248			if c := rune[2*m]; c <= r {
   249				if r <= rune[2*m+1] {
   250					return m
   251				}
   252				lo = m + 1
   253			} else {
   254				hi = m
   255			}
   256		}
   257		return noMatch
   258	}
   259	
   260	// MatchEmptyWidth reports whether the instruction matches
   261	// an empty string between the runes before and after.
   262	// It should only be called when i.Op == InstEmptyWidth.
   263	func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
   264		switch EmptyOp(i.Arg) {
   265		case EmptyBeginLine:
   266			return before == '\n' || before == -1
   267		case EmptyEndLine:
   268			return after == '\n' || after == -1
   269		case EmptyBeginText:
   270			return before == -1
   271		case EmptyEndText:
   272			return after == -1
   273		case EmptyWordBoundary:
   274			return IsWordChar(before) != IsWordChar(after)
   275		case EmptyNoWordBoundary:
   276			return IsWordChar(before) == IsWordChar(after)
   277		}
   278		panic("unknown empty width arg")
   279	}
   280	
   281	func (i *Inst) String() string {
   282		var b strings.Builder
   283		dumpInst(&b, i)
   284		return b.String()
   285	}
   286	
   287	func bw(b *strings.Builder, args ...string) {
   288		for _, s := range args {
   289			b.WriteString(s)
   290		}
   291	}
   292	
   293	func dumpProg(b *strings.Builder, p *Prog) {
   294		for j := range p.Inst {
   295			i := &p.Inst[j]
   296			pc := strconv.Itoa(j)
   297			if len(pc) < 3 {
   298				b.WriteString("   "[len(pc):])
   299			}
   300			if j == p.Start {
   301				pc += "*"
   302			}
   303			bw(b, pc, "\t")
   304			dumpInst(b, i)
   305			bw(b, "\n")
   306		}
   307	}
   308	
   309	func u32(i uint32) string {
   310		return strconv.FormatUint(uint64(i), 10)
   311	}
   312	
   313	func dumpInst(b *strings.Builder, i *Inst) {
   314		switch i.Op {
   315		case InstAlt:
   316			bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
   317		case InstAltMatch:
   318			bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
   319		case InstCapture:
   320			bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
   321		case InstEmptyWidth:
   322			bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
   323		case InstMatch:
   324			bw(b, "match")
   325		case InstFail:
   326			bw(b, "fail")
   327		case InstNop:
   328			bw(b, "nop -> ", u32(i.Out))
   329		case InstRune:
   330			if i.Rune == nil {
   331				// shouldn't happen
   332				bw(b, "rune <nil>")
   333			}
   334			bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
   335			if Flags(i.Arg)&FoldCase != 0 {
   336				bw(b, "/i")
   337			}
   338			bw(b, " -> ", u32(i.Out))
   339		case InstRune1:
   340			bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
   341		case InstRuneAny:
   342			bw(b, "any -> ", u32(i.Out))
   343		case InstRuneAnyNotNL:
   344			bw(b, "anynotnl -> ", u32(i.Out))
   345		}
   346	}
   347	

View as plain text