...

Source file src/pkg/regexp/syntax/regexp.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package syntax
     6	
     7	// Note to implementers:
     8	// In this package, re is always a *Regexp and r is always a rune.
     9	
    10	import (
    11		"strconv"
    12		"strings"
    13		"unicode"
    14	)
    15	
    16	// A Regexp is a node in a regular expression syntax tree.
    17	type Regexp struct {
    18		Op       Op // operator
    19		Flags    Flags
    20		Sub      []*Regexp  // subexpressions, if any
    21		Sub0     [1]*Regexp // storage for short Sub
    22		Rune     []rune     // matched runes, for OpLiteral, OpCharClass
    23		Rune0    [2]rune    // storage for short Rune
    24		Min, Max int        // min, max for OpRepeat
    25		Cap      int        // capturing index, for OpCapture
    26		Name     string     // capturing name, for OpCapture
    27	}
    28	
    29	//go:generate stringer -type Op -trimprefix Op
    30	
    31	// An Op is a single regular expression operator.
    32	type Op uint8
    33	
    34	// Operators are listed in precedence order, tightest binding to weakest.
    35	// Character class operators are listed simplest to most complex
    36	// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
    37	
    38	const (
    39		OpNoMatch        Op = 1 + iota // matches no strings
    40		OpEmptyMatch                   // matches empty string
    41		OpLiteral                      // matches Runes sequence
    42		OpCharClass                    // matches Runes interpreted as range pair list
    43		OpAnyCharNotNL                 // matches any character except newline
    44		OpAnyChar                      // matches any character
    45		OpBeginLine                    // matches empty string at beginning of line
    46		OpEndLine                      // matches empty string at end of line
    47		OpBeginText                    // matches empty string at beginning of text
    48		OpEndText                      // matches empty string at end of text
    49		OpWordBoundary                 // matches word boundary `\b`
    50		OpNoWordBoundary               // matches word non-boundary `\B`
    51		OpCapture                      // capturing subexpression with index Cap, optional name Name
    52		OpStar                         // matches Sub[0] zero or more times
    53		OpPlus                         // matches Sub[0] one or more times
    54		OpQuest                        // matches Sub[0] zero or one times
    55		OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    56		OpConcat                       // matches concatenation of Subs
    57		OpAlternate                    // matches alternation of Subs
    58	)
    59	
    60	const opPseudo Op = 128 // where pseudo-ops start
    61	
    62	// Equal reports whether x and y have identical structure.
    63	func (x *Regexp) Equal(y *Regexp) bool {
    64		if x == nil || y == nil {
    65			return x == y
    66		}
    67		if x.Op != y.Op {
    68			return false
    69		}
    70		switch x.Op {
    71		case OpEndText:
    72			// The parse flags remember whether this is \z or \Z.
    73			if x.Flags&WasDollar != y.Flags&WasDollar {
    74				return false
    75			}
    76	
    77		case OpLiteral, OpCharClass:
    78			if len(x.Rune) != len(y.Rune) {
    79				return false
    80			}
    81			for i, r := range x.Rune {
    82				if r != y.Rune[i] {
    83					return false
    84				}
    85			}
    86	
    87		case OpAlternate, OpConcat:
    88			if len(x.Sub) != len(y.Sub) {
    89				return false
    90			}
    91			for i, sub := range x.Sub {
    92				if !sub.Equal(y.Sub[i]) {
    93					return false
    94				}
    95			}
    96	
    97		case OpStar, OpPlus, OpQuest:
    98			if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
    99				return false
   100			}
   101	
   102		case OpRepeat:
   103			if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
   104				return false
   105			}
   106	
   107		case OpCapture:
   108			if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
   109				return false
   110			}
   111		}
   112		return true
   113	}
   114	
   115	// writeRegexp writes the Perl syntax for the regular expression re to b.
   116	func writeRegexp(b *strings.Builder, re *Regexp) {
   117		switch re.Op {
   118		default:
   119			b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   120		case OpNoMatch:
   121			b.WriteString(`[^\x00-\x{10FFFF}]`)
   122		case OpEmptyMatch:
   123			b.WriteString(`(?:)`)
   124		case OpLiteral:
   125			if re.Flags&FoldCase != 0 {
   126				b.WriteString(`(?i:`)
   127			}
   128			for _, r := range re.Rune {
   129				escape(b, r, false)
   130			}
   131			if re.Flags&FoldCase != 0 {
   132				b.WriteString(`)`)
   133			}
   134		case OpCharClass:
   135			if len(re.Rune)%2 != 0 {
   136				b.WriteString(`[invalid char class]`)
   137				break
   138			}
   139			b.WriteRune('[')
   140			if len(re.Rune) == 0 {
   141				b.WriteString(`^\x00-\x{10FFFF}`)
   142			} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
   143				// Contains 0 and MaxRune. Probably a negated class.
   144				// Print the gaps.
   145				b.WriteRune('^')
   146				for i := 1; i < len(re.Rune)-1; i += 2 {
   147					lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   148					escape(b, lo, lo == '-')
   149					if lo != hi {
   150						b.WriteRune('-')
   151						escape(b, hi, hi == '-')
   152					}
   153				}
   154			} else {
   155				for i := 0; i < len(re.Rune); i += 2 {
   156					lo, hi := re.Rune[i], re.Rune[i+1]
   157					escape(b, lo, lo == '-')
   158					if lo != hi {
   159						b.WriteRune('-')
   160						escape(b, hi, hi == '-')
   161					}
   162				}
   163			}
   164			b.WriteRune(']')
   165		case OpAnyCharNotNL:
   166			b.WriteString(`(?-s:.)`)
   167		case OpAnyChar:
   168			b.WriteString(`(?s:.)`)
   169		case OpBeginLine:
   170			b.WriteString(`(?m:^)`)
   171		case OpEndLine:
   172			b.WriteString(`(?m:$)`)
   173		case OpBeginText:
   174			b.WriteString(`\A`)
   175		case OpEndText:
   176			if re.Flags&WasDollar != 0 {
   177				b.WriteString(`(?-m:$)`)
   178			} else {
   179				b.WriteString(`\z`)
   180			}
   181		case OpWordBoundary:
   182			b.WriteString(`\b`)
   183		case OpNoWordBoundary:
   184			b.WriteString(`\B`)
   185		case OpCapture:
   186			if re.Name != "" {
   187				b.WriteString(`(?P<`)
   188				b.WriteString(re.Name)
   189				b.WriteRune('>')
   190			} else {
   191				b.WriteRune('(')
   192			}
   193			if re.Sub[0].Op != OpEmptyMatch {
   194				writeRegexp(b, re.Sub[0])
   195			}
   196			b.WriteRune(')')
   197		case OpStar, OpPlus, OpQuest, OpRepeat:
   198			if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   199				b.WriteString(`(?:`)
   200				writeRegexp(b, sub)
   201				b.WriteString(`)`)
   202			} else {
   203				writeRegexp(b, sub)
   204			}
   205			switch re.Op {
   206			case OpStar:
   207				b.WriteRune('*')
   208			case OpPlus:
   209				b.WriteRune('+')
   210			case OpQuest:
   211				b.WriteRune('?')
   212			case OpRepeat:
   213				b.WriteRune('{')
   214				b.WriteString(strconv.Itoa(re.Min))
   215				if re.Max != re.Min {
   216					b.WriteRune(',')
   217					if re.Max >= 0 {
   218						b.WriteString(strconv.Itoa(re.Max))
   219					}
   220				}
   221				b.WriteRune('}')
   222			}
   223			if re.Flags&NonGreedy != 0 {
   224				b.WriteRune('?')
   225			}
   226		case OpConcat:
   227			for _, sub := range re.Sub {
   228				if sub.Op == OpAlternate {
   229					b.WriteString(`(?:`)
   230					writeRegexp(b, sub)
   231					b.WriteString(`)`)
   232				} else {
   233					writeRegexp(b, sub)
   234				}
   235			}
   236		case OpAlternate:
   237			for i, sub := range re.Sub {
   238				if i > 0 {
   239					b.WriteRune('|')
   240				}
   241				writeRegexp(b, sub)
   242			}
   243		}
   244	}
   245	
   246	func (re *Regexp) String() string {
   247		var b strings.Builder
   248		writeRegexp(&b, re)
   249		return b.String()
   250	}
   251	
   252	const meta = `\.+*?()|[]{}^$`
   253	
   254	func escape(b *strings.Builder, r rune, force bool) {
   255		if unicode.IsPrint(r) {
   256			if strings.ContainsRune(meta, r) || force {
   257				b.WriteRune('\\')
   258			}
   259			b.WriteRune(r)
   260			return
   261		}
   262	
   263		switch r {
   264		case '\a':
   265			b.WriteString(`\a`)
   266		case '\f':
   267			b.WriteString(`\f`)
   268		case '\n':
   269			b.WriteString(`\n`)
   270		case '\r':
   271			b.WriteString(`\r`)
   272		case '\t':
   273			b.WriteString(`\t`)
   274		case '\v':
   275			b.WriteString(`\v`)
   276		default:
   277			if r < 0x100 {
   278				b.WriteString(`\x`)
   279				s := strconv.FormatInt(int64(r), 16)
   280				if len(s) == 1 {
   281					b.WriteRune('0')
   282				}
   283				b.WriteString(s)
   284				break
   285			}
   286			b.WriteString(`\x{`)
   287			b.WriteString(strconv.FormatInt(int64(r), 16))
   288			b.WriteString(`}`)
   289		}
   290	}
   291	
   292	// MaxCap walks the regexp to find the maximum capture index.
   293	func (re *Regexp) MaxCap() int {
   294		m := 0
   295		if re.Op == OpCapture {
   296			m = re.Cap
   297		}
   298		for _, sub := range re.Sub {
   299			if n := sub.MaxCap(); m < n {
   300				m = n
   301			}
   302		}
   303		return m
   304	}
   305	
   306	// CapNames walks the regexp to find the names of capturing groups.
   307	func (re *Regexp) CapNames() []string {
   308		names := make([]string, re.MaxCap()+1)
   309		re.capNames(names)
   310		return names
   311	}
   312	
   313	func (re *Regexp) capNames(names []string) {
   314		if re.Op == OpCapture {
   315			names[re.Cap] = re.Name
   316		}
   317		for _, sub := range re.Sub {
   318			sub.capNames(names)
   319		}
   320	}
   321	

View as plain text