...

Source file src/regexp/regexp.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package regexp implements regular expression search.
     6	//
     7	// The syntax of the regular expressions accepted is the same
     8	// general syntax used by Perl, Python, and other languages.
     9	// More precisely, it is the syntax accepted by RE2 and described at
    10	// https://golang.org/s/re2syntax, except for \C.
    11	// For an overview of the syntax, run
    12	//   go doc regexp/syntax
    13	//
    14	// The regexp implementation provided by this package is
    15	// guaranteed to run in time linear in the size of the input.
    16	// (This is a property not guaranteed by most open source
    17	// implementations of regular expressions.) For more information
    18	// about this property, see
    19	//	https://swtch.com/~rsc/regexp/regexp1.html
    20	// or any book about automata theory.
    21	//
    22	// All characters are UTF-8-encoded code points.
    23	//
    24	// There are 16 methods of Regexp that match a regular expression and identify
    25	// the matched text. Their names are matched by this regular expression:
    26	//
    27	//	Find(All)?(String)?(Submatch)?(Index)?
    28	//
    29	// If 'All' is present, the routine matches successive non-overlapping
    30	// matches of the entire expression. Empty matches abutting a preceding
    31	// match are ignored. The return value is a slice containing the successive
    32	// return values of the corresponding non-'All' routine. These routines take
    33	// an extra integer argument, n. If n >= 0, the function returns at most n
    34	// matches/submatches; otherwise, it returns all of them.
    35	//
    36	// If 'String' is present, the argument is a string; otherwise it is a slice
    37	// of bytes; return values are adjusted as appropriate.
    38	//
    39	// If 'Submatch' is present, the return value is a slice identifying the
    40	// successive submatches of the expression. Submatches are matches of
    41	// parenthesized subexpressions (also known as capturing groups) within the
    42	// regular expression, numbered from left to right in order of opening
    43	// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
    44	// the match of the first parenthesized subexpression, and so on.
    45	//
    46	// If 'Index' is present, matches and submatches are identified by byte index
    47	// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    48	// the nth submatch. The pair for n==0 identifies the match of the entire
    49	// expression. If 'Index' is not present, the match is identified by the text
    50	// of the match/submatch. If an index is negative or text is nil, it means that
    51	// subexpression did not match any string in the input. For 'String' versions
    52	// an empty string means either no match or an empty match.
    53	//
    54	// There is also a subset of the methods that can be applied to text read
    55	// from a RuneReader:
    56	//
    57	//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    58	//
    59	// This set may grow. Note that regular expression matches may need to
    60	// examine text beyond the text returned by a match, so the methods that
    61	// match text from a RuneReader may read arbitrarily far into the input
    62	// before returning.
    63	//
    64	// (There are a few other methods that do not match this pattern.)
    65	//
    66	package regexp
    67	
    68	import (
    69		"bytes"
    70		"io"
    71		"regexp/syntax"
    72		"strconv"
    73		"strings"
    74		"sync"
    75		"unicode"
    76		"unicode/utf8"
    77	)
    78	
    79	// Regexp is the representation of a compiled regular expression.
    80	// A Regexp is safe for concurrent use by multiple goroutines,
    81	// except for configuration methods, such as Longest.
    82	type Regexp struct {
    83		expr           string       // as passed to Compile
    84		prog           *syntax.Prog // compiled program
    85		onepass        *onePassProg // onepass program or nil
    86		numSubexp      int
    87		maxBitStateLen int
    88		subexpNames    []string
    89		prefix         string         // required prefix in unanchored matches
    90		prefixBytes    []byte         // prefix, as a []byte
    91		prefixRune     rune           // first rune in prefix
    92		prefixEnd      uint32         // pc for last rune in prefix
    93		mpool          int            // pool for machines
    94		matchcap       int            // size of recorded match lengths
    95		prefixComplete bool           // prefix is the entire regexp
    96		cond           syntax.EmptyOp // empty-width conditions required at start of match
    97		minInputLen    int            // minimum length of the input in bytes
    98	
    99		// This field can be modified by the Longest method,
   100		// but it is otherwise read-only.
   101		longest bool // whether regexp prefers leftmost-longest match
   102	}
   103	
   104	// String returns the source text used to compile the regular expression.
   105	func (re *Regexp) String() string {
   106		return re.expr
   107	}
   108	
   109	// Copy returns a new Regexp object copied from re.
   110	// Calling Longest on one copy does not affect another.
   111	//
   112	// Deprecated: In earlier releases, when using a Regexp in multiple goroutines,
   113	// giving each goroutine its own copy helped to avoid lock contention.
   114	// As of Go 1.12, using Copy is no longer necessary to avoid lock contention.
   115	// Copy may still be appropriate if the reason for its use is to make
   116	// two copies with different Longest settings.
   117	func (re *Regexp) Copy() *Regexp {
   118		re2 := *re
   119		return &re2
   120	}
   121	
   122	// Compile parses a regular expression and returns, if successful,
   123	// a Regexp object that can be used to match against text.
   124	//
   125	// When matching against text, the regexp returns a match that
   126	// begins as early as possible in the input (leftmost), and among those
   127	// it chooses the one that a backtracking search would have found first.
   128	// This so-called leftmost-first matching is the same semantics
   129	// that Perl, Python, and other implementations use, although this
   130	// package implements it without the expense of backtracking.
   131	// For POSIX leftmost-longest matching, see CompilePOSIX.
   132	func Compile(expr string) (*Regexp, error) {
   133		return compile(expr, syntax.Perl, false)
   134	}
   135	
   136	// CompilePOSIX is like Compile but restricts the regular expression
   137	// to POSIX ERE (egrep) syntax and changes the match semantics to
   138	// leftmost-longest.
   139	//
   140	// That is, when matching against text, the regexp returns a match that
   141	// begins as early as possible in the input (leftmost), and among those
   142	// it chooses a match that is as long as possible.
   143	// This so-called leftmost-longest matching is the same semantics
   144	// that early regular expression implementations used and that POSIX
   145	// specifies.
   146	//
   147	// However, there can be multiple leftmost-longest matches, with different
   148	// submatch choices, and here this package diverges from POSIX.
   149	// Among the possible leftmost-longest matches, this package chooses
   150	// the one that a backtracking search would have found first, while POSIX
   151	// specifies that the match be chosen to maximize the length of the first
   152	// subexpression, then the second, and so on from left to right.
   153	// The POSIX rule is computationally prohibitive and not even well-defined.
   154	// See https://swtch.com/~rsc/regexp/regexp2.html#posix for details.
   155	func CompilePOSIX(expr string) (*Regexp, error) {
   156		return compile(expr, syntax.POSIX, true)
   157	}
   158	
   159	// Longest makes future searches prefer the leftmost-longest match.
   160	// That is, when matching against text, the regexp returns a match that
   161	// begins as early as possible in the input (leftmost), and among those
   162	// it chooses a match that is as long as possible.
   163	// This method modifies the Regexp and may not be called concurrently
   164	// with any other methods.
   165	func (re *Regexp) Longest() {
   166		re.longest = true
   167	}
   168	
   169	func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
   170		re, err := syntax.Parse(expr, mode)
   171		if err != nil {
   172			return nil, err
   173		}
   174		maxCap := re.MaxCap()
   175		capNames := re.CapNames()
   176	
   177		re = re.Simplify()
   178		prog, err := syntax.Compile(re)
   179		if err != nil {
   180			return nil, err
   181		}
   182		matchcap := prog.NumCap
   183		if matchcap < 2 {
   184			matchcap = 2
   185		}
   186		regexp := &Regexp{
   187			expr:        expr,
   188			prog:        prog,
   189			onepass:     compileOnePass(prog),
   190			numSubexp:   maxCap,
   191			subexpNames: capNames,
   192			cond:        prog.StartCond(),
   193			longest:     longest,
   194			matchcap:    matchcap,
   195			minInputLen: minInputLen(re),
   196		}
   197		if regexp.onepass == nil {
   198			regexp.prefix, regexp.prefixComplete = prog.Prefix()
   199			regexp.maxBitStateLen = maxBitStateLen(prog)
   200		} else {
   201			regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
   202		}
   203		if regexp.prefix != "" {
   204			// TODO(rsc): Remove this allocation by adding
   205			// IndexString to package bytes.
   206			regexp.prefixBytes = []byte(regexp.prefix)
   207			regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
   208		}
   209	
   210		n := len(prog.Inst)
   211		i := 0
   212		for matchSize[i] != 0 && matchSize[i] < n {
   213			i++
   214		}
   215		regexp.mpool = i
   216	
   217		return regexp, nil
   218	}
   219	
   220	// Pools of *machine for use during (*Regexp).doExecute,
   221	// split up by the size of the execution queues.
   222	// matchPool[i] machines have queue size matchSize[i].
   223	// On a 64-bit system each queue entry is 16 bytes,
   224	// so matchPool[0] has 16*2*128 = 4kB queues, etc.
   225	// The final matchPool is a catch-all for very large queues.
   226	var (
   227		matchSize = [...]int{128, 512, 2048, 16384, 0}
   228		matchPool [len(matchSize)]sync.Pool
   229	)
   230	
   231	// get returns a machine to use for matching re.
   232	// It uses the re's machine cache if possible, to avoid
   233	// unnecessary allocation.
   234	func (re *Regexp) get() *machine {
   235		m, ok := matchPool[re.mpool].Get().(*machine)
   236		if !ok {
   237			m = new(machine)
   238		}
   239		m.re = re
   240		m.p = re.prog
   241		if cap(m.matchcap) < re.matchcap {
   242			m.matchcap = make([]int, re.matchcap)
   243			for _, t := range m.pool {
   244				t.cap = make([]int, re.matchcap)
   245			}
   246		}
   247	
   248		// Allocate queues if needed.
   249		// Or reallocate, for "large" match pool.
   250		n := matchSize[re.mpool]
   251		if n == 0 { // large pool
   252			n = len(re.prog.Inst)
   253		}
   254		if len(m.q0.sparse) < n {
   255			m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
   256			m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
   257		}
   258		return m
   259	}
   260	
   261	// put returns a machine to the correct machine pool.
   262	func (re *Regexp) put(m *machine) {
   263		m.re = nil
   264		m.p = nil
   265		m.inputs.clear()
   266		matchPool[re.mpool].Put(m)
   267	}
   268	
   269	// minInputLen walks the regexp to find the minimum length of any matchable input
   270	func minInputLen(re *syntax.Regexp) int {
   271		switch re.Op {
   272		default:
   273			return 0
   274		case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass:
   275			return 1
   276		case syntax.OpLiteral:
   277			l := 0
   278			for _, r := range re.Rune {
   279				l += utf8.RuneLen(r)
   280			}
   281			return l
   282		case syntax.OpCapture, syntax.OpPlus:
   283			return minInputLen(re.Sub[0])
   284		case syntax.OpRepeat:
   285			return re.Min * minInputLen(re.Sub[0])
   286		case syntax.OpConcat:
   287			l := 0
   288			for _, sub := range re.Sub {
   289				l += minInputLen(sub)
   290			}
   291			return l
   292		case syntax.OpAlternate:
   293			l := minInputLen(re.Sub[0])
   294			var lnext int
   295			for _, sub := range re.Sub[1:] {
   296				lnext = minInputLen(sub)
   297				if lnext < l {
   298					l = lnext
   299				}
   300			}
   301			return l
   302		}
   303	}
   304	
   305	// MustCompile is like Compile but panics if the expression cannot be parsed.
   306	// It simplifies safe initialization of global variables holding compiled regular
   307	// expressions.
   308	func MustCompile(str string) *Regexp {
   309		regexp, err := Compile(str)
   310		if err != nil {
   311			panic(`regexp: Compile(` + quote(str) + `): ` + err.Error())
   312		}
   313		return regexp
   314	}
   315	
   316	// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
   317	// It simplifies safe initialization of global variables holding compiled regular
   318	// expressions.
   319	func MustCompilePOSIX(str string) *Regexp {
   320		regexp, err := CompilePOSIX(str)
   321		if err != nil {
   322			panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error())
   323		}
   324		return regexp
   325	}
   326	
   327	func quote(s string) string {
   328		if strconv.CanBackquote(s) {
   329			return "`" + s + "`"
   330		}
   331		return strconv.Quote(s)
   332	}
   333	
   334	// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
   335	func (re *Regexp) NumSubexp() int {
   336		return re.numSubexp
   337	}
   338	
   339	// SubexpNames returns the names of the parenthesized subexpressions
   340	// in this Regexp. The name for the first sub-expression is names[1],
   341	// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
   342	// Since the Regexp as a whole cannot be named, names[0] is always
   343	// the empty string. The slice should not be modified.
   344	func (re *Regexp) SubexpNames() []string {
   345		return re.subexpNames
   346	}
   347	
   348	const endOfText rune = -1
   349	
   350	// input abstracts different representations of the input text. It provides
   351	// one-character lookahead.
   352	type input interface {
   353		step(pos int) (r rune, width int) // advance one rune
   354		canCheckPrefix() bool             // can we look ahead without losing info?
   355		hasPrefix(re *Regexp) bool
   356		index(re *Regexp, pos int) int
   357		context(pos int) lazyFlag
   358	}
   359	
   360	// inputString scans a string.
   361	type inputString struct {
   362		str string
   363	}
   364	
   365	func (i *inputString) step(pos int) (rune, int) {
   366		if pos < len(i.str) {
   367			c := i.str[pos]
   368			if c < utf8.RuneSelf {
   369				return rune(c), 1
   370			}
   371			return utf8.DecodeRuneInString(i.str[pos:])
   372		}
   373		return endOfText, 0
   374	}
   375	
   376	func (i *inputString) canCheckPrefix() bool {
   377		return true
   378	}
   379	
   380	func (i *inputString) hasPrefix(re *Regexp) bool {
   381		return strings.HasPrefix(i.str, re.prefix)
   382	}
   383	
   384	func (i *inputString) index(re *Regexp, pos int) int {
   385		return strings.Index(i.str[pos:], re.prefix)
   386	}
   387	
   388	func (i *inputString) context(pos int) lazyFlag {
   389		r1, r2 := endOfText, endOfText
   390		// 0 < pos && pos <= len(i.str)
   391		if uint(pos-1) < uint(len(i.str)) {
   392			r1 = rune(i.str[pos-1])
   393			if r1 >= utf8.RuneSelf {
   394				r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
   395			}
   396		}
   397		// 0 <= pos && pos < len(i.str)
   398		if uint(pos) < uint(len(i.str)) {
   399			r2 = rune(i.str[pos])
   400			if r2 >= utf8.RuneSelf {
   401				r2, _ = utf8.DecodeRuneInString(i.str[pos:])
   402			}
   403		}
   404		return newLazyFlag(r1, r2)
   405	}
   406	
   407	// inputBytes scans a byte slice.
   408	type inputBytes struct {
   409		str []byte
   410	}
   411	
   412	func (i *inputBytes) step(pos int) (rune, int) {
   413		if pos < len(i.str) {
   414			c := i.str[pos]
   415			if c < utf8.RuneSelf {
   416				return rune(c), 1
   417			}
   418			return utf8.DecodeRune(i.str[pos:])
   419		}
   420		return endOfText, 0
   421	}
   422	
   423	func (i *inputBytes) canCheckPrefix() bool {
   424		return true
   425	}
   426	
   427	func (i *inputBytes) hasPrefix(re *Regexp) bool {
   428		return bytes.HasPrefix(i.str, re.prefixBytes)
   429	}
   430	
   431	func (i *inputBytes) index(re *Regexp, pos int) int {
   432		return bytes.Index(i.str[pos:], re.prefixBytes)
   433	}
   434	
   435	func (i *inputBytes) context(pos int) lazyFlag {
   436		r1, r2 := endOfText, endOfText
   437		// 0 < pos && pos <= len(i.str)
   438		if uint(pos-1) < uint(len(i.str)) {
   439			r1 = rune(i.str[pos-1])
   440			if r1 >= utf8.RuneSelf {
   441				r1, _ = utf8.DecodeLastRune(i.str[:pos])
   442			}
   443		}
   444		// 0 <= pos && pos < len(i.str)
   445		if uint(pos) < uint(len(i.str)) {
   446			r2 = rune(i.str[pos])
   447			if r2 >= utf8.RuneSelf {
   448				r2, _ = utf8.DecodeRune(i.str[pos:])
   449			}
   450		}
   451		return newLazyFlag(r1, r2)
   452	}
   453	
   454	// inputReader scans a RuneReader.
   455	type inputReader struct {
   456		r     io.RuneReader
   457		atEOT bool
   458		pos   int
   459	}
   460	
   461	func (i *inputReader) step(pos int) (rune, int) {
   462		if !i.atEOT && pos != i.pos {
   463			return endOfText, 0
   464	
   465		}
   466		r, w, err := i.r.ReadRune()
   467		if err != nil {
   468			i.atEOT = true
   469			return endOfText, 0
   470		}
   471		i.pos += w
   472		return r, w
   473	}
   474	
   475	func (i *inputReader) canCheckPrefix() bool {
   476		return false
   477	}
   478	
   479	func (i *inputReader) hasPrefix(re *Regexp) bool {
   480		return false
   481	}
   482	
   483	func (i *inputReader) index(re *Regexp, pos int) int {
   484		return -1
   485	}
   486	
   487	func (i *inputReader) context(pos int) lazyFlag {
   488		return 0 // not used
   489	}
   490	
   491	// LiteralPrefix returns a literal string that must begin any match
   492	// of the regular expression re. It returns the boolean true if the
   493	// literal string comprises the entire regular expression.
   494	func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
   495		return re.prefix, re.prefixComplete
   496	}
   497	
   498	// MatchReader reports whether the text returned by the RuneReader
   499	// contains any match of the regular expression re.
   500	func (re *Regexp) MatchReader(r io.RuneReader) bool {
   501		return re.doMatch(r, nil, "")
   502	}
   503	
   504	// MatchString reports whether the string s
   505	// contains any match of the regular expression re.
   506	func (re *Regexp) MatchString(s string) bool {
   507		return re.doMatch(nil, nil, s)
   508	}
   509	
   510	// Match reports whether the byte slice b
   511	// contains any match of the regular expression re.
   512	func (re *Regexp) Match(b []byte) bool {
   513		return re.doMatch(nil, b, "")
   514	}
   515	
   516	// MatchReader reports whether the text returned by the RuneReader
   517	// contains any match of the regular expression pattern.
   518	// More complicated queries need to use Compile and the full Regexp interface.
   519	func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
   520		re, err := Compile(pattern)
   521		if err != nil {
   522			return false, err
   523		}
   524		return re.MatchReader(r), nil
   525	}
   526	
   527	// MatchString reports whether the string s
   528	// contains any match of the regular expression pattern.
   529	// More complicated queries need to use Compile and the full Regexp interface.
   530	func MatchString(pattern string, s string) (matched bool, err error) {
   531		re, err := Compile(pattern)
   532		if err != nil {
   533			return false, err
   534		}
   535		return re.MatchString(s), nil
   536	}
   537	
   538	// Match reports whether the byte slice b
   539	// contains any match of the regular expression pattern.
   540	// More complicated queries need to use Compile and the full Regexp interface.
   541	func Match(pattern string, b []byte) (matched bool, err error) {
   542		re, err := Compile(pattern)
   543		if err != nil {
   544			return false, err
   545		}
   546		return re.Match(b), nil
   547	}
   548	
   549	// ReplaceAllString returns a copy of src, replacing matches of the Regexp
   550	// with the replacement string repl. Inside repl, $ signs are interpreted as
   551	// in Expand, so for instance $1 represents the text of the first submatch.
   552	func (re *Regexp) ReplaceAllString(src, repl string) string {
   553		n := 2
   554		if strings.Contains(repl, "$") {
   555			n = 2 * (re.numSubexp + 1)
   556		}
   557		b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
   558			return re.expand(dst, repl, nil, src, match)
   559		})
   560		return string(b)
   561	}
   562	
   563	// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
   564	// with the replacement string repl. The replacement repl is substituted directly,
   565	// without using Expand.
   566	func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
   567		return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   568			return append(dst, repl...)
   569		}))
   570	}
   571	
   572	// ReplaceAllStringFunc returns a copy of src in which all matches of the
   573	// Regexp have been replaced by the return value of function repl applied
   574	// to the matched substring. The replacement returned by repl is substituted
   575	// directly, without using Expand.
   576	func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   577		b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   578			return append(dst, repl(src[match[0]:match[1]])...)
   579		})
   580		return string(b)
   581	}
   582	
   583	func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
   584		lastMatchEnd := 0 // end position of the most recent match
   585		searchPos := 0    // position where we next look for a match
   586		var buf []byte
   587		var endPos int
   588		if bsrc != nil {
   589			endPos = len(bsrc)
   590		} else {
   591			endPos = len(src)
   592		}
   593		if nmatch > re.prog.NumCap {
   594			nmatch = re.prog.NumCap
   595		}
   596	
   597		var dstCap [2]int
   598		for searchPos <= endPos {
   599			a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
   600			if len(a) == 0 {
   601				break // no more matches
   602			}
   603	
   604			// Copy the unmatched characters before this match.
   605			if bsrc != nil {
   606				buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
   607			} else {
   608				buf = append(buf, src[lastMatchEnd:a[0]]...)
   609			}
   610	
   611			// Now insert a copy of the replacement string, but not for a
   612			// match of the empty string immediately after another match.
   613			// (Otherwise, we get double replacement for patterns that
   614			// match both empty and nonempty strings.)
   615			if a[1] > lastMatchEnd || a[0] == 0 {
   616				buf = repl(buf, a)
   617			}
   618			lastMatchEnd = a[1]
   619	
   620			// Advance past this match; always advance at least one character.
   621			var width int
   622			if bsrc != nil {
   623				_, width = utf8.DecodeRune(bsrc[searchPos:])
   624			} else {
   625				_, width = utf8.DecodeRuneInString(src[searchPos:])
   626			}
   627			if searchPos+width > a[1] {
   628				searchPos += width
   629			} else if searchPos+1 > a[1] {
   630				// This clause is only needed at the end of the input
   631				// string. In that case, DecodeRuneInString returns width=0.
   632				searchPos++
   633			} else {
   634				searchPos = a[1]
   635			}
   636		}
   637	
   638		// Copy the unmatched characters after the last match.
   639		if bsrc != nil {
   640			buf = append(buf, bsrc[lastMatchEnd:]...)
   641		} else {
   642			buf = append(buf, src[lastMatchEnd:]...)
   643		}
   644	
   645		return buf
   646	}
   647	
   648	// ReplaceAll returns a copy of src, replacing matches of the Regexp
   649	// with the replacement text repl. Inside repl, $ signs are interpreted as
   650	// in Expand, so for instance $1 represents the text of the first submatch.
   651	func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
   652		n := 2
   653		if bytes.IndexByte(repl, '$') >= 0 {
   654			n = 2 * (re.numSubexp + 1)
   655		}
   656		srepl := ""
   657		b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
   658			if len(srepl) != len(repl) {
   659				srepl = string(repl)
   660			}
   661			return re.expand(dst, srepl, src, "", match)
   662		})
   663		return b
   664	}
   665	
   666	// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
   667	// with the replacement bytes repl. The replacement repl is substituted directly,
   668	// without using Expand.
   669	func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   670		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   671			return append(dst, repl...)
   672		})
   673	}
   674	
   675	// ReplaceAllFunc returns a copy of src in which all matches of the
   676	// Regexp have been replaced by the return value of function repl applied
   677	// to the matched byte slice. The replacement returned by repl is substituted
   678	// directly, without using Expand.
   679	func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   680		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   681			return append(dst, repl(src[match[0]:match[1]])...)
   682		})
   683	}
   684	
   685	// Bitmap used by func special to check whether a character needs to be escaped.
   686	var specialBytes [16]byte
   687	
   688	// special reports whether byte b needs to be escaped by QuoteMeta.
   689	func special(b byte) bool {
   690		return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0
   691	}
   692	
   693	func init() {
   694		for _, b := range []byte(`\.+*?()|[]{}^$`) {
   695			specialBytes[b%16] |= 1 << (b / 16)
   696		}
   697	}
   698	
   699	// QuoteMeta returns a string that escapes all regular expression metacharacters
   700	// inside the argument text; the returned string is a regular expression matching
   701	// the literal text.
   702	func QuoteMeta(s string) string {
   703		// A byte loop is correct because all metacharacters are ASCII.
   704		var i int
   705		for i = 0; i < len(s); i++ {
   706			if special(s[i]) {
   707				break
   708			}
   709		}
   710		// No meta characters found, so return original string.
   711		if i >= len(s) {
   712			return s
   713		}
   714	
   715		b := make([]byte, 2*len(s)-i)
   716		copy(b, s[:i])
   717		j := i
   718		for ; i < len(s); i++ {
   719			if special(s[i]) {
   720				b[j] = '\\'
   721				j++
   722			}
   723			b[j] = s[i]
   724			j++
   725		}
   726		return string(b[:j])
   727	}
   728	
   729	// The number of capture values in the program may correspond
   730	// to fewer capturing expressions than are in the regexp.
   731	// For example, "(a){0}" turns into an empty program, so the
   732	// maximum capture in the program is 0 but we need to return
   733	// an expression for \1.  Pad appends -1s to the slice a as needed.
   734	func (re *Regexp) pad(a []int) []int {
   735		if a == nil {
   736			// No match.
   737			return nil
   738		}
   739		n := (1 + re.numSubexp) * 2
   740		for len(a) < n {
   741			a = append(a, -1)
   742		}
   743		return a
   744	}
   745	
   746	// allMatches calls deliver at most n times
   747	// with the location of successive matches in the input text.
   748	// The input text is b if non-nil, otherwise s.
   749	func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
   750		var end int
   751		if b == nil {
   752			end = len(s)
   753		} else {
   754			end = len(b)
   755		}
   756	
   757		for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
   758			matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
   759			if len(matches) == 0 {
   760				break
   761			}
   762	
   763			accept := true
   764			if matches[1] == pos {
   765				// We've found an empty match.
   766				if matches[0] == prevMatchEnd {
   767					// We don't allow an empty match right
   768					// after a previous match, so ignore it.
   769					accept = false
   770				}
   771				var width int
   772				// TODO: use step()
   773				if b == nil {
   774					_, width = utf8.DecodeRuneInString(s[pos:end])
   775				} else {
   776					_, width = utf8.DecodeRune(b[pos:end])
   777				}
   778				if width > 0 {
   779					pos += width
   780				} else {
   781					pos = end + 1
   782				}
   783			} else {
   784				pos = matches[1]
   785			}
   786			prevMatchEnd = matches[1]
   787	
   788			if accept {
   789				deliver(re.pad(matches))
   790				i++
   791			}
   792		}
   793	}
   794	
   795	// Find returns a slice holding the text of the leftmost match in b of the regular expression.
   796	// A return value of nil indicates no match.
   797	func (re *Regexp) Find(b []byte) []byte {
   798		var dstCap [2]int
   799		a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
   800		if a == nil {
   801			return nil
   802		}
   803		return b[a[0]:a[1]:a[1]]
   804	}
   805	
   806	// FindIndex returns a two-element slice of integers defining the location of
   807	// the leftmost match in b of the regular expression. The match itself is at
   808	// b[loc[0]:loc[1]].
   809	// A return value of nil indicates no match.
   810	func (re *Regexp) FindIndex(b []byte) (loc []int) {
   811		a := re.doExecute(nil, b, "", 0, 2, nil)
   812		if a == nil {
   813			return nil
   814		}
   815		return a[0:2]
   816	}
   817	
   818	// FindString returns a string holding the text of the leftmost match in s of the regular
   819	// expression. If there is no match, the return value is an empty string,
   820	// but it will also be empty if the regular expression successfully matches
   821	// an empty string. Use FindStringIndex or FindStringSubmatch if it is
   822	// necessary to distinguish these cases.
   823	func (re *Regexp) FindString(s string) string {
   824		var dstCap [2]int
   825		a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
   826		if a == nil {
   827			return ""
   828		}
   829		return s[a[0]:a[1]]
   830	}
   831	
   832	// FindStringIndex returns a two-element slice of integers defining the
   833	// location of the leftmost match in s of the regular expression. The match
   834	// itself is at s[loc[0]:loc[1]].
   835	// A return value of nil indicates no match.
   836	func (re *Regexp) FindStringIndex(s string) (loc []int) {
   837		a := re.doExecute(nil, nil, s, 0, 2, nil)
   838		if a == nil {
   839			return nil
   840		}
   841		return a[0:2]
   842	}
   843	
   844	// FindReaderIndex returns a two-element slice of integers defining the
   845	// location of the leftmost match of the regular expression in text read from
   846	// the RuneReader. The match text was found in the input stream at
   847	// byte offset loc[0] through loc[1]-1.
   848	// A return value of nil indicates no match.
   849	func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
   850		a := re.doExecute(r, nil, "", 0, 2, nil)
   851		if a == nil {
   852			return nil
   853		}
   854		return a[0:2]
   855	}
   856	
   857	// FindSubmatch returns a slice of slices holding the text of the leftmost
   858	// match of the regular expression in b and the matches, if any, of its
   859	// subexpressions, as defined by the 'Submatch' descriptions in the package
   860	// comment.
   861	// A return value of nil indicates no match.
   862	func (re *Regexp) FindSubmatch(b []byte) [][]byte {
   863		var dstCap [4]int
   864		a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0])
   865		if a == nil {
   866			return nil
   867		}
   868		ret := make([][]byte, 1+re.numSubexp)
   869		for i := range ret {
   870			if 2*i < len(a) && a[2*i] >= 0 {
   871				ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]]
   872			}
   873		}
   874		return ret
   875	}
   876	
   877	// Expand appends template to dst and returns the result; during the
   878	// append, Expand replaces variables in the template with corresponding
   879	// matches drawn from src. The match slice should have been returned by
   880	// FindSubmatchIndex.
   881	//
   882	// In the template, a variable is denoted by a substring of the form
   883	// $name or ${name}, where name is a non-empty sequence of letters,
   884	// digits, and underscores. A purely numeric name like $1 refers to
   885	// the submatch with the corresponding index; other names refer to
   886	// capturing parentheses named with the (?P<name>...) syntax. A
   887	// reference to an out of range or unmatched index or a name that is not
   888	// present in the regular expression is replaced with an empty slice.
   889	//
   890	// In the $name form, name is taken to be as long as possible: $1x is
   891	// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
   892	//
   893	// To insert a literal $ in the output, use $$ in the template.
   894	func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
   895		return re.expand(dst, string(template), src, "", match)
   896	}
   897	
   898	// ExpandString is like Expand but the template and source are strings.
   899	// It appends to and returns a byte slice in order to give the calling
   900	// code control over allocation.
   901	func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
   902		return re.expand(dst, template, nil, src, match)
   903	}
   904	
   905	func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
   906		for len(template) > 0 {
   907			i := strings.Index(template, "$")
   908			if i < 0 {
   909				break
   910			}
   911			dst = append(dst, template[:i]...)
   912			template = template[i:]
   913			if len(template) > 1 && template[1] == '$' {
   914				// Treat $$ as $.
   915				dst = append(dst, '$')
   916				template = template[2:]
   917				continue
   918			}
   919			name, num, rest, ok := extract(template)
   920			if !ok {
   921				// Malformed; treat $ as raw text.
   922				dst = append(dst, '$')
   923				template = template[1:]
   924				continue
   925			}
   926			template = rest
   927			if num >= 0 {
   928				if 2*num+1 < len(match) && match[2*num] >= 0 {
   929					if bsrc != nil {
   930						dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
   931					} else {
   932						dst = append(dst, src[match[2*num]:match[2*num+1]]...)
   933					}
   934				}
   935			} else {
   936				for i, namei := range re.subexpNames {
   937					if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
   938						if bsrc != nil {
   939							dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
   940						} else {
   941							dst = append(dst, src[match[2*i]:match[2*i+1]]...)
   942						}
   943						break
   944					}
   945				}
   946			}
   947		}
   948		dst = append(dst, template...)
   949		return dst
   950	}
   951	
   952	// extract returns the name from a leading "$name" or "${name}" in str.
   953	// If it is a number, extract returns num set to that number; otherwise num = -1.
   954	func extract(str string) (name string, num int, rest string, ok bool) {
   955		if len(str) < 2 || str[0] != '$' {
   956			return
   957		}
   958		brace := false
   959		if str[1] == '{' {
   960			brace = true
   961			str = str[2:]
   962		} else {
   963			str = str[1:]
   964		}
   965		i := 0
   966		for i < len(str) {
   967			rune, size := utf8.DecodeRuneInString(str[i:])
   968			if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
   969				break
   970			}
   971			i += size
   972		}
   973		if i == 0 {
   974			// empty name is not okay
   975			return
   976		}
   977		name = str[:i]
   978		if brace {
   979			if i >= len(str) || str[i] != '}' {
   980				// missing closing brace
   981				return
   982			}
   983			i++
   984		}
   985	
   986		// Parse number.
   987		num = 0
   988		for i := 0; i < len(name); i++ {
   989			if name[i] < '0' || '9' < name[i] || num >= 1e8 {
   990				num = -1
   991				break
   992			}
   993			num = num*10 + int(name[i]) - '0'
   994		}
   995		// Disallow leading zeros.
   996		if name[0] == '0' && len(name) > 1 {
   997			num = -1
   998		}
   999	
  1000		rest = str[i:]
  1001		ok = true
  1002		return
  1003	}
  1004	
  1005	// FindSubmatchIndex returns a slice holding the index pairs identifying the
  1006	// leftmost match of the regular expression in b and the matches, if any, of
  1007	// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
  1008	// in the package comment.
  1009	// A return value of nil indicates no match.
  1010	func (re *Regexp) FindSubmatchIndex(b []byte) []int {
  1011		return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil))
  1012	}
  1013	
  1014	// FindStringSubmatch returns a slice of strings holding the text of the
  1015	// leftmost match of the regular expression in s and the matches, if any, of
  1016	// its subexpressions, as defined by the 'Submatch' description in the
  1017	// package comment.
  1018	// A return value of nil indicates no match.
  1019	func (re *Regexp) FindStringSubmatch(s string) []string {
  1020		var dstCap [4]int
  1021		a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0])
  1022		if a == nil {
  1023			return nil
  1024		}
  1025		ret := make([]string, 1+re.numSubexp)
  1026		for i := range ret {
  1027			if 2*i < len(a) && a[2*i] >= 0 {
  1028				ret[i] = s[a[2*i]:a[2*i+1]]
  1029			}
  1030		}
  1031		return ret
  1032	}
  1033	
  1034	// FindStringSubmatchIndex returns a slice holding the index pairs
  1035	// identifying the leftmost match of the regular expression in s and the
  1036	// matches, if any, of its subexpressions, as defined by the 'Submatch' and
  1037	// 'Index' descriptions in the package comment.
  1038	// A return value of nil indicates no match.
  1039	func (re *Regexp) FindStringSubmatchIndex(s string) []int {
  1040		return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil))
  1041	}
  1042	
  1043	// FindReaderSubmatchIndex returns a slice holding the index pairs
  1044	// identifying the leftmost match of the regular expression of text read by
  1045	// the RuneReader, and the matches, if any, of its subexpressions, as defined
  1046	// by the 'Submatch' and 'Index' descriptions in the package comment. A
  1047	// return value of nil indicates no match.
  1048	func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
  1049		return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil))
  1050	}
  1051	
  1052	const startSize = 10 // The size at which to start a slice in the 'All' routines.
  1053	
  1054	// FindAll is the 'All' version of Find; it returns a slice of all successive
  1055	// matches of the expression, as defined by the 'All' description in the
  1056	// package comment.
  1057	// A return value of nil indicates no match.
  1058	func (re *Regexp) FindAll(b []byte, n int) [][]byte {
  1059		if n < 0 {
  1060			n = len(b) + 1
  1061		}
  1062		var result [][]byte
  1063		re.allMatches("", b, n, func(match []int) {
  1064			if result == nil {
  1065				result = make([][]byte, 0, startSize)
  1066			}
  1067			result = append(result, b[match[0]:match[1]:match[1]])
  1068		})
  1069		return result
  1070	}
  1071	
  1072	// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
  1073	// successive matches of the expression, as defined by the 'All' description
  1074	// in the package comment.
  1075	// A return value of nil indicates no match.
  1076	func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
  1077		if n < 0 {
  1078			n = len(b) + 1
  1079		}
  1080		var result [][]int
  1081		re.allMatches("", b, n, func(match []int) {
  1082			if result == nil {
  1083				result = make([][]int, 0, startSize)
  1084			}
  1085			result = append(result, match[0:2])
  1086		})
  1087		return result
  1088	}
  1089	
  1090	// FindAllString is the 'All' version of FindString; it returns a slice of all
  1091	// successive matches of the expression, as defined by the 'All' description
  1092	// in the package comment.
  1093	// A return value of nil indicates no match.
  1094	func (re *Regexp) FindAllString(s string, n int) []string {
  1095		if n < 0 {
  1096			n = len(s) + 1
  1097		}
  1098		var result []string
  1099		re.allMatches(s, nil, n, func(match []int) {
  1100			if result == nil {
  1101				result = make([]string, 0, startSize)
  1102			}
  1103			result = append(result, s[match[0]:match[1]])
  1104		})
  1105		return result
  1106	}
  1107	
  1108	// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
  1109	// slice of all successive matches of the expression, as defined by the 'All'
  1110	// description in the package comment.
  1111	// A return value of nil indicates no match.
  1112	func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
  1113		if n < 0 {
  1114			n = len(s) + 1
  1115		}
  1116		var result [][]int
  1117		re.allMatches(s, nil, n, func(match []int) {
  1118			if result == nil {
  1119				result = make([][]int, 0, startSize)
  1120			}
  1121			result = append(result, match[0:2])
  1122		})
  1123		return result
  1124	}
  1125	
  1126	// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
  1127	// of all successive matches of the expression, as defined by the 'All'
  1128	// description in the package comment.
  1129	// A return value of nil indicates no match.
  1130	func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
  1131		if n < 0 {
  1132			n = len(b) + 1
  1133		}
  1134		var result [][][]byte
  1135		re.allMatches("", b, n, func(match []int) {
  1136			if result == nil {
  1137				result = make([][][]byte, 0, startSize)
  1138			}
  1139			slice := make([][]byte, len(match)/2)
  1140			for j := range slice {
  1141				if match[2*j] >= 0 {
  1142					slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]]
  1143				}
  1144			}
  1145			result = append(result, slice)
  1146		})
  1147		return result
  1148	}
  1149	
  1150	// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
  1151	// a slice of all successive matches of the expression, as defined by the
  1152	// 'All' description in the package comment.
  1153	// A return value of nil indicates no match.
  1154	func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
  1155		if n < 0 {
  1156			n = len(b) + 1
  1157		}
  1158		var result [][]int
  1159		re.allMatches("", b, n, func(match []int) {
  1160			if result == nil {
  1161				result = make([][]int, 0, startSize)
  1162			}
  1163			result = append(result, match)
  1164		})
  1165		return result
  1166	}
  1167	
  1168	// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  1169	// returns a slice of all successive matches of the expression, as defined by
  1170	// the 'All' description in the package comment.
  1171	// A return value of nil indicates no match.
  1172	func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  1173		if n < 0 {
  1174			n = len(s) + 1
  1175		}
  1176		var result [][]string
  1177		re.allMatches(s, nil, n, func(match []int) {
  1178			if result == nil {
  1179				result = make([][]string, 0, startSize)
  1180			}
  1181			slice := make([]string, len(match)/2)
  1182			for j := range slice {
  1183				if match[2*j] >= 0 {
  1184					slice[j] = s[match[2*j]:match[2*j+1]]
  1185				}
  1186			}
  1187			result = append(result, slice)
  1188		})
  1189		return result
  1190	}
  1191	
  1192	// FindAllStringSubmatchIndex is the 'All' version of
  1193	// FindStringSubmatchIndex; it returns a slice of all successive matches of
  1194	// the expression, as defined by the 'All' description in the package
  1195	// comment.
  1196	// A return value of nil indicates no match.
  1197	func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  1198		if n < 0 {
  1199			n = len(s) + 1
  1200		}
  1201		var result [][]int
  1202		re.allMatches(s, nil, n, func(match []int) {
  1203			if result == nil {
  1204				result = make([][]int, 0, startSize)
  1205			}
  1206			result = append(result, match)
  1207		})
  1208		return result
  1209	}
  1210	
  1211	// Split slices s into substrings separated by the expression and returns a slice of
  1212	// the substrings between those expression matches.
  1213	//
  1214	// The slice returned by this method consists of all the substrings of s
  1215	// not contained in the slice returned by FindAllString. When called on an expression
  1216	// that contains no metacharacters, it is equivalent to strings.SplitN.
  1217	//
  1218	// Example:
  1219	//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
  1220	//   // s: ["", "b", "b", "c", "cadaaae"]
  1221	//
  1222	// The count determines the number of substrings to return:
  1223	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
  1224	//   n == 0: the result is nil (zero substrings)
  1225	//   n < 0: all substrings
  1226	func (re *Regexp) Split(s string, n int) []string {
  1227	
  1228		if n == 0 {
  1229			return nil
  1230		}
  1231	
  1232		if len(re.expr) > 0 && len(s) == 0 {
  1233			return []string{""}
  1234		}
  1235	
  1236		matches := re.FindAllStringIndex(s, n)
  1237		strings := make([]string, 0, len(matches))
  1238	
  1239		beg := 0
  1240		end := 0
  1241		for _, match := range matches {
  1242			if n > 0 && len(strings) >= n-1 {
  1243				break
  1244			}
  1245	
  1246			end = match[0]
  1247			if match[1] != 0 {
  1248				strings = append(strings, s[beg:end])
  1249			}
  1250			beg = match[1]
  1251		}
  1252	
  1253		if end != len(s) {
  1254			strings = append(strings, s[beg:])
  1255		}
  1256	
  1257		return strings
  1258	}
  1259	

View as plain text