...

Source file src/pkg/strings/replace.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package strings
     6	
     7	import (
     8		"io"
     9		"sync"
    10	)
    11	
    12	// Replacer replaces a list of strings with replacements.
    13	// It is safe for concurrent use by multiple goroutines.
    14	type Replacer struct {
    15		once   sync.Once // guards buildOnce method
    16		r      replacer
    17		oldnew []string
    18	}
    19	
    20	// replacer is the interface that a replacement algorithm needs to implement.
    21	type replacer interface {
    22		Replace(s string) string
    23		WriteString(w io.Writer, s string) (n int, err error)
    24	}
    25	
    26	// NewReplacer returns a new Replacer from a list of old, new string
    27	// pairs. Replacements are performed in the order they appear in the
    28	// target string, without overlapping matches. The old string
    29	// comparisons are done in argument order.
    30	//
    31	// NewReplacer panics if given an odd number of arguments.
    32	func NewReplacer(oldnew ...string) *Replacer {
    33		if len(oldnew)%2 == 1 {
    34			panic("strings.NewReplacer: odd argument count")
    35		}
    36		return &Replacer{oldnew: append([]string(nil), oldnew...)}
    37	}
    38	
    39	func (r *Replacer) buildOnce() {
    40		r.r = r.build()
    41		r.oldnew = nil
    42	}
    43	
    44	func (b *Replacer) build() replacer {
    45		oldnew := b.oldnew
    46		if len(oldnew) == 2 && len(oldnew[0]) > 1 {
    47			return makeSingleStringReplacer(oldnew[0], oldnew[1])
    48		}
    49	
    50		allNewBytes := true
    51		for i := 0; i < len(oldnew); i += 2 {
    52			if len(oldnew[i]) != 1 {
    53				return makeGenericReplacer(oldnew)
    54			}
    55			if len(oldnew[i+1]) != 1 {
    56				allNewBytes = false
    57			}
    58		}
    59	
    60		if allNewBytes {
    61			r := byteReplacer{}
    62			for i := range r {
    63				r[i] = byte(i)
    64			}
    65			// The first occurrence of old->new map takes precedence
    66			// over the others with the same old string.
    67			for i := len(oldnew) - 2; i >= 0; i -= 2 {
    68				o := oldnew[i][0]
    69				n := oldnew[i+1][0]
    70				r[o] = n
    71			}
    72			return &r
    73		}
    74	
    75		r := byteStringReplacer{toReplace: make([]string, 0, len(oldnew)/2)}
    76		// The first occurrence of old->new map takes precedence
    77		// over the others with the same old string.
    78		for i := len(oldnew) - 2; i >= 0; i -= 2 {
    79			o := oldnew[i][0]
    80			n := oldnew[i+1]
    81			// To avoid counting repetitions multiple times.
    82			if r.replacements[o] == nil {
    83				// We need to use string([]byte{o}) instead of string(o),
    84				// to avoid utf8 encoding of o.
    85				// E. g. byte(150) produces string of length 2.
    86				r.toReplace = append(r.toReplace, string([]byte{o}))
    87			}
    88			r.replacements[o] = []byte(n)
    89	
    90		}
    91		return &r
    92	}
    93	
    94	// Replace returns a copy of s with all replacements performed.
    95	func (r *Replacer) Replace(s string) string {
    96		r.once.Do(r.buildOnce)
    97		return r.r.Replace(s)
    98	}
    99	
   100	// WriteString writes s to w with all replacements performed.
   101	func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
   102		r.once.Do(r.buildOnce)
   103		return r.r.WriteString(w, s)
   104	}
   105	
   106	// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
   107	// and values may be empty. For example, the trie containing keys "ax", "ay",
   108	// "bcbc", "x" and "xy" could have eight nodes:
   109	//
   110	//  n0  -
   111	//  n1  a-
   112	//  n2  .x+
   113	//  n3  .y+
   114	//  n4  b-
   115	//  n5  .cbc+
   116	//  n6  x+
   117	//  n7  .y+
   118	//
   119	// n0 is the root node, and its children are n1, n4 and n6; n1's children are
   120	// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
   121	// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
   122	// (marked with a trailing "+") are complete keys.
   123	type trieNode struct {
   124		// value is the value of the trie node's key/value pair. It is empty if
   125		// this node is not a complete key.
   126		value string
   127		// priority is the priority (higher is more important) of the trie node's
   128		// key/value pair; keys are not necessarily matched shortest- or longest-
   129		// first. Priority is positive if this node is a complete key, and zero
   130		// otherwise. In the example above, positive/zero priorities are marked
   131		// with a trailing "+" or "-".
   132		priority int
   133	
   134		// A trie node may have zero, one or more child nodes:
   135		//  * if the remaining fields are zero, there are no children.
   136		//  * if prefix and next are non-zero, there is one child in next.
   137		//  * if table is non-zero, it defines all the children.
   138		//
   139		// Prefixes are preferred over tables when there is one child, but the
   140		// root node always uses a table for lookup efficiency.
   141	
   142		// prefix is the difference in keys between this trie node and the next.
   143		// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
   144		// Node n5 has no children and so has zero prefix, next and table fields.
   145		prefix string
   146		next   *trieNode
   147	
   148		// table is a lookup table indexed by the next byte in the key, after
   149		// remapping that byte through genericReplacer.mapping to create a dense
   150		// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
   151		// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
   152		// genericReplacer.tableSize will be 5. Node n0's table will be
   153		// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
   154		// 'a', 'b' and 'x'.
   155		table []*trieNode
   156	}
   157	
   158	func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
   159		if key == "" {
   160			if t.priority == 0 {
   161				t.value = val
   162				t.priority = priority
   163			}
   164			return
   165		}
   166	
   167		if t.prefix != "" {
   168			// Need to split the prefix among multiple nodes.
   169			var n int // length of the longest common prefix
   170			for ; n < len(t.prefix) && n < len(key); n++ {
   171				if t.prefix[n] != key[n] {
   172					break
   173				}
   174			}
   175			if n == len(t.prefix) {
   176				t.next.add(key[n:], val, priority, r)
   177			} else if n == 0 {
   178				// First byte differs, start a new lookup table here. Looking up
   179				// what is currently t.prefix[0] will lead to prefixNode, and
   180				// looking up key[0] will lead to keyNode.
   181				var prefixNode *trieNode
   182				if len(t.prefix) == 1 {
   183					prefixNode = t.next
   184				} else {
   185					prefixNode = &trieNode{
   186						prefix: t.prefix[1:],
   187						next:   t.next,
   188					}
   189				}
   190				keyNode := new(trieNode)
   191				t.table = make([]*trieNode, r.tableSize)
   192				t.table[r.mapping[t.prefix[0]]] = prefixNode
   193				t.table[r.mapping[key[0]]] = keyNode
   194				t.prefix = ""
   195				t.next = nil
   196				keyNode.add(key[1:], val, priority, r)
   197			} else {
   198				// Insert new node after the common section of the prefix.
   199				next := &trieNode{
   200					prefix: t.prefix[n:],
   201					next:   t.next,
   202				}
   203				t.prefix = t.prefix[:n]
   204				t.next = next
   205				next.add(key[n:], val, priority, r)
   206			}
   207		} else if t.table != nil {
   208			// Insert into existing table.
   209			m := r.mapping[key[0]]
   210			if t.table[m] == nil {
   211				t.table[m] = new(trieNode)
   212			}
   213			t.table[m].add(key[1:], val, priority, r)
   214		} else {
   215			t.prefix = key
   216			t.next = new(trieNode)
   217			t.next.add("", val, priority, r)
   218		}
   219	}
   220	
   221	func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
   222		// Iterate down the trie to the end, and grab the value and keylen with
   223		// the highest priority.
   224		bestPriority := 0
   225		node := &r.root
   226		n := 0
   227		for node != nil {
   228			if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
   229				bestPriority = node.priority
   230				val = node.value
   231				keylen = n
   232				found = true
   233			}
   234	
   235			if s == "" {
   236				break
   237			}
   238			if node.table != nil {
   239				index := r.mapping[s[0]]
   240				if int(index) == r.tableSize {
   241					break
   242				}
   243				node = node.table[index]
   244				s = s[1:]
   245				n++
   246			} else if node.prefix != "" && HasPrefix(s, node.prefix) {
   247				n += len(node.prefix)
   248				s = s[len(node.prefix):]
   249				node = node.next
   250			} else {
   251				break
   252			}
   253		}
   254		return
   255	}
   256	
   257	// genericReplacer is the fully generic algorithm.
   258	// It's used as a fallback when nothing faster can be used.
   259	type genericReplacer struct {
   260		root trieNode
   261		// tableSize is the size of a trie node's lookup table. It is the number
   262		// of unique key bytes.
   263		tableSize int
   264		// mapping maps from key bytes to a dense index for trieNode.table.
   265		mapping [256]byte
   266	}
   267	
   268	func makeGenericReplacer(oldnew []string) *genericReplacer {
   269		r := new(genericReplacer)
   270		// Find each byte used, then assign them each an index.
   271		for i := 0; i < len(oldnew); i += 2 {
   272			key := oldnew[i]
   273			for j := 0; j < len(key); j++ {
   274				r.mapping[key[j]] = 1
   275			}
   276		}
   277	
   278		for _, b := range r.mapping {
   279			r.tableSize += int(b)
   280		}
   281	
   282		var index byte
   283		for i, b := range r.mapping {
   284			if b == 0 {
   285				r.mapping[i] = byte(r.tableSize)
   286			} else {
   287				r.mapping[i] = index
   288				index++
   289			}
   290		}
   291		// Ensure root node uses a lookup table (for performance).
   292		r.root.table = make([]*trieNode, r.tableSize)
   293	
   294		for i := 0; i < len(oldnew); i += 2 {
   295			r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
   296		}
   297		return r
   298	}
   299	
   300	type appendSliceWriter []byte
   301	
   302	// Write writes to the buffer to satisfy io.Writer.
   303	func (w *appendSliceWriter) Write(p []byte) (int, error) {
   304		*w = append(*w, p...)
   305		return len(p), nil
   306	}
   307	
   308	// WriteString writes to the buffer without string->[]byte->string allocations.
   309	func (w *appendSliceWriter) WriteString(s string) (int, error) {
   310		*w = append(*w, s...)
   311		return len(s), nil
   312	}
   313	
   314	type stringWriter struct {
   315		w io.Writer
   316	}
   317	
   318	func (w stringWriter) WriteString(s string) (int, error) {
   319		return w.w.Write([]byte(s))
   320	}
   321	
   322	func getStringWriter(w io.Writer) io.StringWriter {
   323		sw, ok := w.(io.StringWriter)
   324		if !ok {
   325			sw = stringWriter{w}
   326		}
   327		return sw
   328	}
   329	
   330	func (r *genericReplacer) Replace(s string) string {
   331		buf := make(appendSliceWriter, 0, len(s))
   332		r.WriteString(&buf, s)
   333		return string(buf)
   334	}
   335	
   336	func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   337		sw := getStringWriter(w)
   338		var last, wn int
   339		var prevMatchEmpty bool
   340		for i := 0; i <= len(s); {
   341			// Fast path: s[i] is not a prefix of any pattern.
   342			if i != len(s) && r.root.priority == 0 {
   343				index := int(r.mapping[s[i]])
   344				if index == r.tableSize || r.root.table[index] == nil {
   345					i++
   346					continue
   347				}
   348			}
   349	
   350			// Ignore the empty match iff the previous loop found the empty match.
   351			val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
   352			prevMatchEmpty = match && keylen == 0
   353			if match {
   354				wn, err = sw.WriteString(s[last:i])
   355				n += wn
   356				if err != nil {
   357					return
   358				}
   359				wn, err = sw.WriteString(val)
   360				n += wn
   361				if err != nil {
   362					return
   363				}
   364				i += keylen
   365				last = i
   366				continue
   367			}
   368			i++
   369		}
   370		if last != len(s) {
   371			wn, err = sw.WriteString(s[last:])
   372			n += wn
   373		}
   374		return
   375	}
   376	
   377	// singleStringReplacer is the implementation that's used when there is only
   378	// one string to replace (and that string has more than one byte).
   379	type singleStringReplacer struct {
   380		finder *stringFinder
   381		// value is the new string that replaces that pattern when it's found.
   382		value string
   383	}
   384	
   385	func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
   386		return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
   387	}
   388	
   389	func (r *singleStringReplacer) Replace(s string) string {
   390		var buf []byte
   391		i, matched := 0, false
   392		for {
   393			match := r.finder.next(s[i:])
   394			if match == -1 {
   395				break
   396			}
   397			matched = true
   398			buf = append(buf, s[i:i+match]...)
   399			buf = append(buf, r.value...)
   400			i += match + len(r.finder.pattern)
   401		}
   402		if !matched {
   403			return s
   404		}
   405		buf = append(buf, s[i:]...)
   406		return string(buf)
   407	}
   408	
   409	func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   410		sw := getStringWriter(w)
   411		var i, wn int
   412		for {
   413			match := r.finder.next(s[i:])
   414			if match == -1 {
   415				break
   416			}
   417			wn, err = sw.WriteString(s[i : i+match])
   418			n += wn
   419			if err != nil {
   420				return
   421			}
   422			wn, err = sw.WriteString(r.value)
   423			n += wn
   424			if err != nil {
   425				return
   426			}
   427			i += match + len(r.finder.pattern)
   428		}
   429		wn, err = sw.WriteString(s[i:])
   430		n += wn
   431		return
   432	}
   433	
   434	// byteReplacer is the implementation that's used when all the "old"
   435	// and "new" values are single ASCII bytes.
   436	// The array contains replacement bytes indexed by old byte.
   437	type byteReplacer [256]byte
   438	
   439	func (r *byteReplacer) Replace(s string) string {
   440		var buf []byte // lazily allocated
   441		for i := 0; i < len(s); i++ {
   442			b := s[i]
   443			if r[b] != b {
   444				if buf == nil {
   445					buf = []byte(s)
   446				}
   447				buf[i] = r[b]
   448			}
   449		}
   450		if buf == nil {
   451			return s
   452		}
   453		return string(buf)
   454	}
   455	
   456	func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   457		// TODO(bradfitz): use io.WriteString with slices of s, avoiding allocation.
   458		bufsize := 32 << 10
   459		if len(s) < bufsize {
   460			bufsize = len(s)
   461		}
   462		buf := make([]byte, bufsize)
   463	
   464		for len(s) > 0 {
   465			ncopy := copy(buf, s)
   466			s = s[ncopy:]
   467			for i, b := range buf[:ncopy] {
   468				buf[i] = r[b]
   469			}
   470			wn, err := w.Write(buf[:ncopy])
   471			n += wn
   472			if err != nil {
   473				return n, err
   474			}
   475		}
   476		return n, nil
   477	}
   478	
   479	// byteStringReplacer is the implementation that's used when all the
   480	// "old" values are single ASCII bytes but the "new" values vary in size.
   481	type byteStringReplacer struct {
   482		// replacements contains replacement byte slices indexed by old byte.
   483		// A nil []byte means that the old byte should not be replaced.
   484		replacements [256][]byte
   485		// toReplace keeps a list of bytes to replace. Depending on length of toReplace
   486		// and length of target string it may be faster to use Count, or a plain loop.
   487		// We store single byte as a string, because Count takes a string.
   488		toReplace []string
   489	}
   490	
   491	// countCutOff controls the ratio of a string length to a number of replacements
   492	// at which (*byteStringReplacer).Replace switches algorithms.
   493	// For strings with higher ration of length to replacements than that value,
   494	// we call Count, for each replacement from toReplace.
   495	// For strings, with a lower ratio we use simple loop, because of Count overhead.
   496	// countCutOff is an empirically determined overhead multiplier.
   497	// TODO(tocarip) revisit once we have register-based abi/mid-stack inlining.
   498	const countCutOff = 8
   499	
   500	func (r *byteStringReplacer) Replace(s string) string {
   501		newSize := len(s)
   502		anyChanges := false
   503		// Is it faster to use Count?
   504		if len(r.toReplace)*countCutOff <= len(s) {
   505			for _, x := range r.toReplace {
   506				if c := Count(s, x); c != 0 {
   507					// The -1 is because we are replacing 1 byte with len(replacements[b]) bytes.
   508					newSize += c * (len(r.replacements[x[0]]) - 1)
   509					anyChanges = true
   510				}
   511	
   512			}
   513		} else {
   514			for i := 0; i < len(s); i++ {
   515				b := s[i]
   516				if r.replacements[b] != nil {
   517					// See above for explanation of -1
   518					newSize += len(r.replacements[b]) - 1
   519					anyChanges = true
   520				}
   521			}
   522		}
   523		if !anyChanges {
   524			return s
   525		}
   526		buf := make([]byte, newSize)
   527		j := 0
   528		for i := 0; i < len(s); i++ {
   529			b := s[i]
   530			if r.replacements[b] != nil {
   531				j += copy(buf[j:], r.replacements[b])
   532			} else {
   533				buf[j] = b
   534				j++
   535			}
   536		}
   537		return string(buf)
   538	}
   539	
   540	func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
   541		sw := getStringWriter(w)
   542		last := 0
   543		for i := 0; i < len(s); i++ {
   544			b := s[i]
   545			if r.replacements[b] == nil {
   546				continue
   547			}
   548			if last != i {
   549				nw, err := sw.WriteString(s[last:i])
   550				n += nw
   551				if err != nil {
   552					return n, err
   553				}
   554			}
   555			last = i + 1
   556			nw, err := w.Write(r.replacements[b])
   557			n += nw
   558			if err != nil {
   559				return n, err
   560			}
   561		}
   562		if last != len(s) {
   563			var nw int
   564			nw, err = sw.WriteString(s[last:])
   565			n += nw
   566		}
   567		return
   568	}
   569	

View as plain text