...

Source file src/pkg/bytes/bytes.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bytes implements functions for the manipulation of byte slices.
     6	// It is analogous to the facilities of the strings package.
     7	package bytes
     8	
     9	import (
    10		"internal/bytealg"
    11		"unicode"
    12		"unicode/utf8"
    13	)
    14	
    15	// Equal reports whether a and b
    16	// are the same length and contain the same bytes.
    17	// A nil argument is equivalent to an empty slice.
    18	func Equal(a, b []byte) bool {
    19		// Neither cmd/compile nor gccgo allocates for these string conversions.
    20		return string(a) == string(b)
    21	}
    22	
    23	// Compare returns an integer comparing two byte slices lexicographically.
    24	// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
    25	// A nil argument is equivalent to an empty slice.
    26	func Compare(a, b []byte) int {
    27		return bytealg.Compare(a, b)
    28	}
    29	
    30	// explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
    31	// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
    32	func explode(s []byte, n int) [][]byte {
    33		if n <= 0 {
    34			n = len(s)
    35		}
    36		a := make([][]byte, n)
    37		var size int
    38		na := 0
    39		for len(s) > 0 {
    40			if na+1 >= n {
    41				a[na] = s
    42				na++
    43				break
    44			}
    45			_, size = utf8.DecodeRune(s)
    46			a[na] = s[0:size:size]
    47			s = s[size:]
    48			na++
    49		}
    50		return a[0:na]
    51	}
    52	
    53	// Count counts the number of non-overlapping instances of sep in s.
    54	// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
    55	func Count(s, sep []byte) int {
    56		// special case
    57		if len(sep) == 0 {
    58			return utf8.RuneCount(s) + 1
    59		}
    60		if len(sep) == 1 {
    61			return bytealg.Count(s, sep[0])
    62		}
    63		n := 0
    64		for {
    65			i := Index(s, sep)
    66			if i == -1 {
    67				return n
    68			}
    69			n++
    70			s = s[i+len(sep):]
    71		}
    72	}
    73	
    74	// Contains reports whether subslice is within b.
    75	func Contains(b, subslice []byte) bool {
    76		return Index(b, subslice) != -1
    77	}
    78	
    79	// ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
    80	func ContainsAny(b []byte, chars string) bool {
    81		return IndexAny(b, chars) >= 0
    82	}
    83	
    84	// ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
    85	func ContainsRune(b []byte, r rune) bool {
    86		return IndexRune(b, r) >= 0
    87	}
    88	
    89	// IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
    90	func IndexByte(b []byte, c byte) int {
    91		return bytealg.IndexByte(b, c)
    92	}
    93	
    94	func indexBytePortable(s []byte, c byte) int {
    95		for i, b := range s {
    96			if b == c {
    97				return i
    98			}
    99		}
   100		return -1
   101	}
   102	
   103	// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
   104	func LastIndex(s, sep []byte) int {
   105		n := len(sep)
   106		switch {
   107		case n == 0:
   108			return len(s)
   109		case n == 1:
   110			return LastIndexByte(s, sep[0])
   111		case n == len(s):
   112			if Equal(s, sep) {
   113				return 0
   114			}
   115			return -1
   116		case n > len(s):
   117			return -1
   118		}
   119		// Rabin-Karp search from the end of the string
   120		hashss, pow := hashStrRev(sep)
   121		last := len(s) - n
   122		var h uint32
   123		for i := len(s) - 1; i >= last; i-- {
   124			h = h*primeRK + uint32(s[i])
   125		}
   126		if h == hashss && Equal(s[last:], sep) {
   127			return last
   128		}
   129		for i := last - 1; i >= 0; i-- {
   130			h *= primeRK
   131			h += uint32(s[i])
   132			h -= pow * uint32(s[i+n])
   133			if h == hashss && Equal(s[i:i+n], sep) {
   134				return i
   135			}
   136		}
   137		return -1
   138	}
   139	
   140	// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
   141	func LastIndexByte(s []byte, c byte) int {
   142		for i := len(s) - 1; i >= 0; i-- {
   143			if s[i] == c {
   144				return i
   145			}
   146		}
   147		return -1
   148	}
   149	
   150	// IndexRune interprets s as a sequence of UTF-8-encoded code points.
   151	// It returns the byte index of the first occurrence in s of the given rune.
   152	// It returns -1 if rune is not present in s.
   153	// If r is utf8.RuneError, it returns the first instance of any
   154	// invalid UTF-8 byte sequence.
   155	func IndexRune(s []byte, r rune) int {
   156		switch {
   157		case 0 <= r && r < utf8.RuneSelf:
   158			return IndexByte(s, byte(r))
   159		case r == utf8.RuneError:
   160			for i := 0; i < len(s); {
   161				r1, n := utf8.DecodeRune(s[i:])
   162				if r1 == utf8.RuneError {
   163					return i
   164				}
   165				i += n
   166			}
   167			return -1
   168		case !utf8.ValidRune(r):
   169			return -1
   170		default:
   171			var b [utf8.UTFMax]byte
   172			n := utf8.EncodeRune(b[:], r)
   173			return Index(s, b[:n])
   174		}
   175	}
   176	
   177	// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
   178	// It returns the byte index of the first occurrence in s of any of the Unicode
   179	// code points in chars. It returns -1 if chars is empty or if there is no code
   180	// point in common.
   181	func IndexAny(s []byte, chars string) int {
   182		if chars == "" {
   183			// Avoid scanning all of s.
   184			return -1
   185		}
   186		if len(s) > 8 {
   187			if as, isASCII := makeASCIISet(chars); isASCII {
   188				for i, c := range s {
   189					if as.contains(c) {
   190						return i
   191					}
   192				}
   193				return -1
   194			}
   195		}
   196		var width int
   197		for i := 0; i < len(s); i += width {
   198			r := rune(s[i])
   199			if r < utf8.RuneSelf {
   200				width = 1
   201			} else {
   202				r, width = utf8.DecodeRune(s[i:])
   203			}
   204			for _, ch := range chars {
   205				if r == ch {
   206					return i
   207				}
   208			}
   209		}
   210		return -1
   211	}
   212	
   213	// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
   214	// points. It returns the byte index of the last occurrence in s of any of
   215	// the Unicode code points in chars. It returns -1 if chars is empty or if
   216	// there is no code point in common.
   217	func LastIndexAny(s []byte, chars string) int {
   218		if chars == "" {
   219			// Avoid scanning all of s.
   220			return -1
   221		}
   222		if len(s) > 8 {
   223			if as, isASCII := makeASCIISet(chars); isASCII {
   224				for i := len(s) - 1; i >= 0; i-- {
   225					if as.contains(s[i]) {
   226						return i
   227					}
   228				}
   229				return -1
   230			}
   231		}
   232		for i := len(s); i > 0; {
   233			r, size := utf8.DecodeLastRune(s[:i])
   234			i -= size
   235			for _, c := range chars {
   236				if r == c {
   237					return i
   238				}
   239			}
   240		}
   241		return -1
   242	}
   243	
   244	// Generic split: splits after each instance of sep,
   245	// including sepSave bytes of sep in the subslices.
   246	func genSplit(s, sep []byte, sepSave, n int) [][]byte {
   247		if n == 0 {
   248			return nil
   249		}
   250		if len(sep) == 0 {
   251			return explode(s, n)
   252		}
   253		if n < 0 {
   254			n = Count(s, sep) + 1
   255		}
   256	
   257		a := make([][]byte, n)
   258		n--
   259		i := 0
   260		for i < n {
   261			m := Index(s, sep)
   262			if m < 0 {
   263				break
   264			}
   265			a[i] = s[: m+sepSave : m+sepSave]
   266			s = s[m+len(sep):]
   267			i++
   268		}
   269		a[i] = s
   270		return a[:i+1]
   271	}
   272	
   273	// SplitN slices s into subslices separated by sep and returns a slice of
   274	// the subslices between those separators.
   275	// If sep is empty, SplitN splits after each UTF-8 sequence.
   276	// The count determines the number of subslices to return:
   277	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   278	//   n == 0: the result is nil (zero subslices)
   279	//   n < 0: all subslices
   280	func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
   281	
   282	// SplitAfterN slices s into subslices after each instance of sep and
   283	// returns a slice of those subslices.
   284	// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
   285	// The count determines the number of subslices to return:
   286	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   287	//   n == 0: the result is nil (zero subslices)
   288	//   n < 0: all subslices
   289	func SplitAfterN(s, sep []byte, n int) [][]byte {
   290		return genSplit(s, sep, len(sep), n)
   291	}
   292	
   293	// Split slices s into all subslices separated by sep and returns a slice of
   294	// the subslices between those separators.
   295	// If sep is empty, Split splits after each UTF-8 sequence.
   296	// It is equivalent to SplitN with a count of -1.
   297	func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
   298	
   299	// SplitAfter slices s into all subslices after each instance of sep and
   300	// returns a slice of those subslices.
   301	// If sep is empty, SplitAfter splits after each UTF-8 sequence.
   302	// It is equivalent to SplitAfterN with a count of -1.
   303	func SplitAfter(s, sep []byte) [][]byte {
   304		return genSplit(s, sep, len(sep), -1)
   305	}
   306	
   307	var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
   308	
   309	// Fields interprets s as a sequence of UTF-8-encoded code points.
   310	// It splits the slice s around each instance of one or more consecutive white space
   311	// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
   312	// empty slice if s contains only white space.
   313	func Fields(s []byte) [][]byte {
   314		// First count the fields.
   315		// This is an exact count if s is ASCII, otherwise it is an approximation.
   316		n := 0
   317		wasSpace := 1
   318		// setBits is used to track which bits are set in the bytes of s.
   319		setBits := uint8(0)
   320		for i := 0; i < len(s); i++ {
   321			r := s[i]
   322			setBits |= r
   323			isSpace := int(asciiSpace[r])
   324			n += wasSpace & ^isSpace
   325			wasSpace = isSpace
   326		}
   327	
   328		if setBits >= utf8.RuneSelf {
   329			// Some runes in the input slice are not ASCII.
   330			return FieldsFunc(s, unicode.IsSpace)
   331		}
   332	
   333		// ASCII fast path
   334		a := make([][]byte, n)
   335		na := 0
   336		fieldStart := 0
   337		i := 0
   338		// Skip spaces in the front of the input.
   339		for i < len(s) && asciiSpace[s[i]] != 0 {
   340			i++
   341		}
   342		fieldStart = i
   343		for i < len(s) {
   344			if asciiSpace[s[i]] == 0 {
   345				i++
   346				continue
   347			}
   348			a[na] = s[fieldStart:i:i]
   349			na++
   350			i++
   351			// Skip spaces in between fields.
   352			for i < len(s) && asciiSpace[s[i]] != 0 {
   353				i++
   354			}
   355			fieldStart = i
   356		}
   357		if fieldStart < len(s) { // Last field might end at EOF.
   358			a[na] = s[fieldStart:len(s):len(s)]
   359		}
   360		return a
   361	}
   362	
   363	// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
   364	// It splits the slice s at each run of code points c satisfying f(c) and
   365	// returns a slice of subslices of s. If all code points in s satisfy f(c), or
   366	// len(s) == 0, an empty slice is returned.
   367	// FieldsFunc makes no guarantees about the order in which it calls f(c).
   368	// If f does not return consistent results for a given c, FieldsFunc may crash.
   369	func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
   370		// A span is used to record a slice of s of the form s[start:end].
   371		// The start index is inclusive and the end index is exclusive.
   372		type span struct {
   373			start int
   374			end   int
   375		}
   376		spans := make([]span, 0, 32)
   377	
   378		// Find the field start and end indices.
   379		wasField := false
   380		fromIndex := 0
   381		for i := 0; i < len(s); {
   382			size := 1
   383			r := rune(s[i])
   384			if r >= utf8.RuneSelf {
   385				r, size = utf8.DecodeRune(s[i:])
   386			}
   387			if f(r) {
   388				if wasField {
   389					spans = append(spans, span{start: fromIndex, end: i})
   390					wasField = false
   391				}
   392			} else {
   393				if !wasField {
   394					fromIndex = i
   395					wasField = true
   396				}
   397			}
   398			i += size
   399		}
   400	
   401		// Last field might end at EOF.
   402		if wasField {
   403			spans = append(spans, span{fromIndex, len(s)})
   404		}
   405	
   406		// Create subslices from recorded field indices.
   407		a := make([][]byte, len(spans))
   408		for i, span := range spans {
   409			a[i] = s[span.start:span.end:span.end]
   410		}
   411	
   412		return a
   413	}
   414	
   415	// Join concatenates the elements of s to create a new byte slice. The separator
   416	// sep is placed between elements in the resulting slice.
   417	func Join(s [][]byte, sep []byte) []byte {
   418		if len(s) == 0 {
   419			return []byte{}
   420		}
   421		if len(s) == 1 {
   422			// Just return a copy.
   423			return append([]byte(nil), s[0]...)
   424		}
   425		n := len(sep) * (len(s) - 1)
   426		for _, v := range s {
   427			n += len(v)
   428		}
   429	
   430		b := make([]byte, n)
   431		bp := copy(b, s[0])
   432		for _, v := range s[1:] {
   433			bp += copy(b[bp:], sep)
   434			bp += copy(b[bp:], v)
   435		}
   436		return b
   437	}
   438	
   439	// HasPrefix tests whether the byte slice s begins with prefix.
   440	func HasPrefix(s, prefix []byte) bool {
   441		return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
   442	}
   443	
   444	// HasSuffix tests whether the byte slice s ends with suffix.
   445	func HasSuffix(s, suffix []byte) bool {
   446		return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
   447	}
   448	
   449	// Map returns a copy of the byte slice s with all its characters modified
   450	// according to the mapping function. If mapping returns a negative value, the character is
   451	// dropped from the byte slice with no replacement. The characters in s and the
   452	// output are interpreted as UTF-8-encoded code points.
   453	func Map(mapping func(r rune) rune, s []byte) []byte {
   454		// In the worst case, the slice can grow when mapped, making
   455		// things unpleasant. But it's so rare we barge in assuming it's
   456		// fine. It could also shrink but that falls out naturally.
   457		maxbytes := len(s) // length of b
   458		nbytes := 0        // number of bytes encoded in b
   459		b := make([]byte, maxbytes)
   460		for i := 0; i < len(s); {
   461			wid := 1
   462			r := rune(s[i])
   463			if r >= utf8.RuneSelf {
   464				r, wid = utf8.DecodeRune(s[i:])
   465			}
   466			r = mapping(r)
   467			if r >= 0 {
   468				rl := utf8.RuneLen(r)
   469				if rl < 0 {
   470					rl = len(string(utf8.RuneError))
   471				}
   472				if nbytes+rl > maxbytes {
   473					// Grow the buffer.
   474					maxbytes = maxbytes*2 + utf8.UTFMax
   475					nb := make([]byte, maxbytes)
   476					copy(nb, b[0:nbytes])
   477					b = nb
   478				}
   479				nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
   480			}
   481			i += wid
   482		}
   483		return b[0:nbytes]
   484	}
   485	
   486	// Repeat returns a new byte slice consisting of count copies of b.
   487	//
   488	// It panics if count is negative or if
   489	// the result of (len(b) * count) overflows.
   490	func Repeat(b []byte, count int) []byte {
   491		if count == 0 {
   492			return []byte{}
   493		}
   494		// Since we cannot return an error on overflow,
   495		// we should panic if the repeat will generate
   496		// an overflow.
   497		// See Issue golang.org/issue/16237.
   498		if count < 0 {
   499			panic("bytes: negative Repeat count")
   500		} else if len(b)*count/count != len(b) {
   501			panic("bytes: Repeat count causes overflow")
   502		}
   503	
   504		nb := make([]byte, len(b)*count)
   505		bp := copy(nb, b)
   506		for bp < len(nb) {
   507			copy(nb[bp:], nb[:bp])
   508			bp *= 2
   509		}
   510		return nb
   511	}
   512	
   513	// ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
   514	// their upper case.
   515	func ToUpper(s []byte) []byte {
   516		isASCII, hasLower := true, false
   517		for i := 0; i < len(s); i++ {
   518			c := s[i]
   519			if c >= utf8.RuneSelf {
   520				isASCII = false
   521				break
   522			}
   523			hasLower = hasLower || ('a' <= c && c <= 'z')
   524		}
   525	
   526		if isASCII { // optimize for ASCII-only byte slices.
   527			if !hasLower {
   528				// Just return a copy.
   529				return append([]byte(""), s...)
   530			}
   531			b := make([]byte, len(s))
   532			for i := 0; i < len(s); i++ {
   533				c := s[i]
   534				if 'a' <= c && c <= 'z' {
   535					c -= 'a' - 'A'
   536				}
   537				b[i] = c
   538			}
   539			return b
   540		}
   541		return Map(unicode.ToUpper, s)
   542	}
   543	
   544	// ToLower returns a copy of the byte slice s with all Unicode letters mapped to
   545	// their lower case.
   546	func ToLower(s []byte) []byte {
   547		isASCII, hasUpper := true, false
   548		for i := 0; i < len(s); i++ {
   549			c := s[i]
   550			if c >= utf8.RuneSelf {
   551				isASCII = false
   552				break
   553			}
   554			hasUpper = hasUpper || ('A' <= c && c <= 'Z')
   555		}
   556	
   557		if isASCII { // optimize for ASCII-only byte slices.
   558			if !hasUpper {
   559				return append([]byte(""), s...)
   560			}
   561			b := make([]byte, len(s))
   562			for i := 0; i < len(s); i++ {
   563				c := s[i]
   564				if 'A' <= c && c <= 'Z' {
   565					c += 'a' - 'A'
   566				}
   567				b[i] = c
   568			}
   569			return b
   570		}
   571		return Map(unicode.ToLower, s)
   572	}
   573	
   574	// ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
   575	func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
   576	
   577	// ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
   578	// upper case, giving priority to the special casing rules.
   579	func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
   580		return Map(c.ToUpper, s)
   581	}
   582	
   583	// ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
   584	// lower case, giving priority to the special casing rules.
   585	func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
   586		return Map(c.ToLower, s)
   587	}
   588	
   589	// ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
   590	// title case, giving priority to the special casing rules.
   591	func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
   592		return Map(c.ToTitle, s)
   593	}
   594	
   595	// ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
   596	// representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
   597	func ToValidUTF8(s, replacement []byte) []byte {
   598		b := make([]byte, 0, len(s)+len(replacement))
   599		invalid := false // previous byte was from an invalid UTF-8 sequence
   600		for i := 0; i < len(s); {
   601			c := s[i]
   602			if c < utf8.RuneSelf {
   603				i++
   604				invalid = false
   605				b = append(b, byte(c))
   606				continue
   607			}
   608			_, wid := utf8.DecodeRune(s[i:])
   609			if wid == 1 {
   610				i++
   611				if !invalid {
   612					invalid = true
   613					b = append(b, replacement...)
   614				}
   615				continue
   616			}
   617			invalid = false
   618			b = append(b, s[i:i+wid]...)
   619			i += wid
   620		}
   621		return b
   622	}
   623	
   624	// isSeparator reports whether the rune could mark a word boundary.
   625	// TODO: update when package unicode captures more of the properties.
   626	func isSeparator(r rune) bool {
   627		// ASCII alphanumerics and underscore are not separators
   628		if r <= 0x7F {
   629			switch {
   630			case '0' <= r && r <= '9':
   631				return false
   632			case 'a' <= r && r <= 'z':
   633				return false
   634			case 'A' <= r && r <= 'Z':
   635				return false
   636			case r == '_':
   637				return false
   638			}
   639			return true
   640		}
   641		// Letters and digits are not separators
   642		if unicode.IsLetter(r) || unicode.IsDigit(r) {
   643			return false
   644		}
   645		// Otherwise, all we can do for now is treat spaces as separators.
   646		return unicode.IsSpace(r)
   647	}
   648	
   649	// Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
   650	// words mapped to their title case.
   651	//
   652	// BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
   653	func Title(s []byte) []byte {
   654		// Use a closure here to remember state.
   655		// Hackish but effective. Depends on Map scanning in order and calling
   656		// the closure once per rune.
   657		prev := ' '
   658		return Map(
   659			func(r rune) rune {
   660				if isSeparator(prev) {
   661					prev = r
   662					return unicode.ToTitle(r)
   663				}
   664				prev = r
   665				return r
   666			},
   667			s)
   668	}
   669	
   670	// TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
   671	// all leading UTF-8-encoded code points c that satisfy f(c).
   672	func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
   673		i := indexFunc(s, f, false)
   674		if i == -1 {
   675			return nil
   676		}
   677		return s[i:]
   678	}
   679	
   680	// TrimRightFunc returns a subslice of s by slicing off all trailing
   681	// UTF-8-encoded code points c that satisfy f(c).
   682	func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
   683		i := lastIndexFunc(s, f, false)
   684		if i >= 0 && s[i] >= utf8.RuneSelf {
   685			_, wid := utf8.DecodeRune(s[i:])
   686			i += wid
   687		} else {
   688			i++
   689		}
   690		return s[0:i]
   691	}
   692	
   693	// TrimFunc returns a subslice of s by slicing off all leading and trailing
   694	// UTF-8-encoded code points c that satisfy f(c).
   695	func TrimFunc(s []byte, f func(r rune) bool) []byte {
   696		return TrimRightFunc(TrimLeftFunc(s, f), f)
   697	}
   698	
   699	// TrimPrefix returns s without the provided leading prefix string.
   700	// If s doesn't start with prefix, s is returned unchanged.
   701	func TrimPrefix(s, prefix []byte) []byte {
   702		if HasPrefix(s, prefix) {
   703			return s[len(prefix):]
   704		}
   705		return s
   706	}
   707	
   708	// TrimSuffix returns s without the provided trailing suffix string.
   709	// If s doesn't end with suffix, s is returned unchanged.
   710	func TrimSuffix(s, suffix []byte) []byte {
   711		if HasSuffix(s, suffix) {
   712			return s[:len(s)-len(suffix)]
   713		}
   714		return s
   715	}
   716	
   717	// IndexFunc interprets s as a sequence of UTF-8-encoded code points.
   718	// It returns the byte index in s of the first Unicode
   719	// code point satisfying f(c), or -1 if none do.
   720	func IndexFunc(s []byte, f func(r rune) bool) int {
   721		return indexFunc(s, f, true)
   722	}
   723	
   724	// LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
   725	// It returns the byte index in s of the last Unicode
   726	// code point satisfying f(c), or -1 if none do.
   727	func LastIndexFunc(s []byte, f func(r rune) bool) int {
   728		return lastIndexFunc(s, f, true)
   729	}
   730	
   731	// indexFunc is the same as IndexFunc except that if
   732	// truth==false, the sense of the predicate function is
   733	// inverted.
   734	func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
   735		start := 0
   736		for start < len(s) {
   737			wid := 1
   738			r := rune(s[start])
   739			if r >= utf8.RuneSelf {
   740				r, wid = utf8.DecodeRune(s[start:])
   741			}
   742			if f(r) == truth {
   743				return start
   744			}
   745			start += wid
   746		}
   747		return -1
   748	}
   749	
   750	// lastIndexFunc is the same as LastIndexFunc except that if
   751	// truth==false, the sense of the predicate function is
   752	// inverted.
   753	func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
   754		for i := len(s); i > 0; {
   755			r, size := rune(s[i-1]), 1
   756			if r >= utf8.RuneSelf {
   757				r, size = utf8.DecodeLastRune(s[0:i])
   758			}
   759			i -= size
   760			if f(r) == truth {
   761				return i
   762			}
   763		}
   764		return -1
   765	}
   766	
   767	// asciiSet is a 32-byte value, where each bit represents the presence of a
   768	// given ASCII character in the set. The 128-bits of the lower 16 bytes,
   769	// starting with the least-significant bit of the lowest word to the
   770	// most-significant bit of the highest word, map to the full range of all
   771	// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
   772	// ensuring that any non-ASCII character will be reported as not in the set.
   773	type asciiSet [8]uint32
   774	
   775	// makeASCIISet creates a set of ASCII characters and reports whether all
   776	// characters in chars are ASCII.
   777	func makeASCIISet(chars string) (as asciiSet, ok bool) {
   778		for i := 0; i < len(chars); i++ {
   779			c := chars[i]
   780			if c >= utf8.RuneSelf {
   781				return as, false
   782			}
   783			as[c>>5] |= 1 << uint(c&31)
   784		}
   785		return as, true
   786	}
   787	
   788	// contains reports whether c is inside the set.
   789	func (as *asciiSet) contains(c byte) bool {
   790		return (as[c>>5] & (1 << uint(c&31))) != 0
   791	}
   792	
   793	func makeCutsetFunc(cutset string) func(r rune) bool {
   794		if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
   795			return func(r rune) bool {
   796				return r == rune(cutset[0])
   797			}
   798		}
   799		if as, isASCII := makeASCIISet(cutset); isASCII {
   800			return func(r rune) bool {
   801				return r < utf8.RuneSelf && as.contains(byte(r))
   802			}
   803		}
   804		return func(r rune) bool {
   805			for _, c := range cutset {
   806				if c == r {
   807					return true
   808				}
   809			}
   810			return false
   811		}
   812	}
   813	
   814	// Trim returns a subslice of s by slicing off all leading and
   815	// trailing UTF-8-encoded code points contained in cutset.
   816	func Trim(s []byte, cutset string) []byte {
   817		return TrimFunc(s, makeCutsetFunc(cutset))
   818	}
   819	
   820	// TrimLeft returns a subslice of s by slicing off all leading
   821	// UTF-8-encoded code points contained in cutset.
   822	func TrimLeft(s []byte, cutset string) []byte {
   823		return TrimLeftFunc(s, makeCutsetFunc(cutset))
   824	}
   825	
   826	// TrimRight returns a subslice of s by slicing off all trailing
   827	// UTF-8-encoded code points that are contained in cutset.
   828	func TrimRight(s []byte, cutset string) []byte {
   829		return TrimRightFunc(s, makeCutsetFunc(cutset))
   830	}
   831	
   832	// TrimSpace returns a subslice of s by slicing off all leading and
   833	// trailing white space, as defined by Unicode.
   834	func TrimSpace(s []byte) []byte {
   835		// Fast path for ASCII: look for the first ASCII non-space byte
   836		start := 0
   837		for ; start < len(s); start++ {
   838			c := s[start]
   839			if c >= utf8.RuneSelf {
   840				// If we run into a non-ASCII byte, fall back to the
   841				// slower unicode-aware method on the remaining bytes
   842				return TrimFunc(s[start:], unicode.IsSpace)
   843			}
   844			if asciiSpace[c] == 0 {
   845				break
   846			}
   847		}
   848	
   849		// Now look for the first ASCII non-space byte from the end
   850		stop := len(s)
   851		for ; stop > start; stop-- {
   852			c := s[stop-1]
   853			if c >= utf8.RuneSelf {
   854				return TrimFunc(s[start:stop], unicode.IsSpace)
   855			}
   856			if asciiSpace[c] == 0 {
   857				break
   858			}
   859		}
   860	
   861		// At this point s[start:stop] starts and ends with an ASCII
   862		// non-space bytes, so we're done. Non-ASCII cases have already
   863		// been handled above.
   864		if start == stop {
   865			// Special case to preserve previous TrimLeftFunc behavior,
   866			// returning nil instead of empty slice if all spaces.
   867			return nil
   868		}
   869		return s[start:stop]
   870	}
   871	
   872	// Runes interprets s as a sequence of UTF-8-encoded code points.
   873	// It returns a slice of runes (Unicode code points) equivalent to s.
   874	func Runes(s []byte) []rune {
   875		t := make([]rune, utf8.RuneCount(s))
   876		i := 0
   877		for len(s) > 0 {
   878			r, l := utf8.DecodeRune(s)
   879			t[i] = r
   880			i++
   881			s = s[l:]
   882		}
   883		return t
   884	}
   885	
   886	// Replace returns a copy of the slice s with the first n
   887	// non-overlapping instances of old replaced by new.
   888	// If old is empty, it matches at the beginning of the slice
   889	// and after each UTF-8 sequence, yielding up to k+1 replacements
   890	// for a k-rune slice.
   891	// If n < 0, there is no limit on the number of replacements.
   892	func Replace(s, old, new []byte, n int) []byte {
   893		m := 0
   894		if n != 0 {
   895			// Compute number of replacements.
   896			m = Count(s, old)
   897		}
   898		if m == 0 {
   899			// Just return a copy.
   900			return append([]byte(nil), s...)
   901		}
   902		if n < 0 || m < n {
   903			n = m
   904		}
   905	
   906		// Apply replacements to buffer.
   907		t := make([]byte, len(s)+n*(len(new)-len(old)))
   908		w := 0
   909		start := 0
   910		for i := 0; i < n; i++ {
   911			j := start
   912			if len(old) == 0 {
   913				if i > 0 {
   914					_, wid := utf8.DecodeRune(s[start:])
   915					j += wid
   916				}
   917			} else {
   918				j += Index(s[start:], old)
   919			}
   920			w += copy(t[w:], s[start:j])
   921			w += copy(t[w:], new)
   922			start = j + len(old)
   923		}
   924		w += copy(t[w:], s[start:])
   925		return t[0:w]
   926	}
   927	
   928	// ReplaceAll returns a copy of the slice s with all
   929	// non-overlapping instances of old replaced by new.
   930	// If old is empty, it matches at the beginning of the slice
   931	// and after each UTF-8 sequence, yielding up to k+1 replacements
   932	// for a k-rune slice.
   933	func ReplaceAll(s, old, new []byte) []byte {
   934		return Replace(s, old, new, -1)
   935	}
   936	
   937	// EqualFold reports whether s and t, interpreted as UTF-8 strings,
   938	// are equal under Unicode case-folding.
   939	func EqualFold(s, t []byte) bool {
   940		for len(s) != 0 && len(t) != 0 {
   941			// Extract first rune from each.
   942			var sr, tr rune
   943			if s[0] < utf8.RuneSelf {
   944				sr, s = rune(s[0]), s[1:]
   945			} else {
   946				r, size := utf8.DecodeRune(s)
   947				sr, s = r, s[size:]
   948			}
   949			if t[0] < utf8.RuneSelf {
   950				tr, t = rune(t[0]), t[1:]
   951			} else {
   952				r, size := utf8.DecodeRune(t)
   953				tr, t = r, t[size:]
   954			}
   955	
   956			// If they match, keep going; if not, return false.
   957	
   958			// Easy case.
   959			if tr == sr {
   960				continue
   961			}
   962	
   963			// Make sr < tr to simplify what follows.
   964			if tr < sr {
   965				tr, sr = sr, tr
   966			}
   967			// Fast check for ASCII.
   968			if tr < utf8.RuneSelf {
   969				// ASCII only, sr/tr must be upper/lower case
   970				if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
   971					continue
   972				}
   973				return false
   974			}
   975	
   976			// General case. SimpleFold(x) returns the next equivalent rune > x
   977			// or wraps around to smaller values.
   978			r := unicode.SimpleFold(sr)
   979			for r != sr && r < tr {
   980				r = unicode.SimpleFold(r)
   981			}
   982			if r == tr {
   983				continue
   984			}
   985			return false
   986		}
   987	
   988		// One string is empty. Are both?
   989		return len(s) == len(t)
   990	}
   991	
   992	// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
   993	func Index(s, sep []byte) int {
   994		n := len(sep)
   995		switch {
   996		case n == 0:
   997			return 0
   998		case n == 1:
   999			return IndexByte(s, sep[0])
  1000		case n == len(s):
  1001			if Equal(sep, s) {
  1002				return 0
  1003			}
  1004			return -1
  1005		case n > len(s):
  1006			return -1
  1007		case n <= bytealg.MaxLen:
  1008			// Use brute force when s and sep both are small
  1009			if len(s) <= bytealg.MaxBruteForce {
  1010				return bytealg.Index(s, sep)
  1011			}
  1012			c0 := sep[0]
  1013			c1 := sep[1]
  1014			i := 0
  1015			t := len(s) - n + 1
  1016			fails := 0
  1017			for i < t {
  1018				if s[i] != c0 {
  1019					// IndexByte is faster than bytealg.Index, so use it as long as
  1020					// we're not getting lots of false positives.
  1021					o := IndexByte(s[i:t], c0)
  1022					if o < 0 {
  1023						return -1
  1024					}
  1025					i += o
  1026				}
  1027				if s[i+1] == c1 && Equal(s[i:i+n], sep) {
  1028					return i
  1029				}
  1030				fails++
  1031				i++
  1032				// Switch to bytealg.Index when IndexByte produces too many false positives.
  1033				if fails > bytealg.Cutover(i) {
  1034					r := bytealg.Index(s[i:], sep)
  1035					if r >= 0 {
  1036						return r + i
  1037					}
  1038					return -1
  1039				}
  1040			}
  1041			return -1
  1042		}
  1043		c0 := sep[0]
  1044		c1 := sep[1]
  1045		i := 0
  1046		fails := 0
  1047		t := len(s) - n + 1
  1048		for i < t {
  1049			if s[i] != c0 {
  1050				o := IndexByte(s[i:t], c0)
  1051				if o < 0 {
  1052					break
  1053				}
  1054				i += o
  1055			}
  1056			if s[i+1] == c1 && Equal(s[i:i+n], sep) {
  1057				return i
  1058			}
  1059			i++
  1060			fails++
  1061			if fails >= 4+i>>4 && i < t {
  1062				// Give up on IndexByte, it isn't skipping ahead
  1063				// far enough to be better than Rabin-Karp.
  1064				// Experiments (using IndexPeriodic) suggest
  1065				// the cutover is about 16 byte skips.
  1066				// TODO: if large prefixes of sep are matching
  1067				// we should cutover at even larger average skips,
  1068				// because Equal becomes that much more expensive.
  1069				// This code does not take that effect into account.
  1070				j := indexRabinKarp(s[i:], sep)
  1071				if j < 0 {
  1072					return -1
  1073				}
  1074				return i + j
  1075			}
  1076		}
  1077		return -1
  1078	}
  1079	
  1080	func indexRabinKarp(s, sep []byte) int {
  1081		// Rabin-Karp search
  1082		hashsep, pow := hashStr(sep)
  1083		n := len(sep)
  1084		var h uint32
  1085		for i := 0; i < n; i++ {
  1086			h = h*primeRK + uint32(s[i])
  1087		}
  1088		if h == hashsep && Equal(s[:n], sep) {
  1089			return 0
  1090		}
  1091		for i := n; i < len(s); {
  1092			h *= primeRK
  1093			h += uint32(s[i])
  1094			h -= pow * uint32(s[i-n])
  1095			i++
  1096			if h == hashsep && Equal(s[i-n:i], sep) {
  1097				return i - n
  1098			}
  1099		}
  1100		return -1
  1101	}
  1102	
  1103	// primeRK is the prime base used in Rabin-Karp algorithm.
  1104	const primeRK = 16777619
  1105	
  1106	// hashStr returns the hash and the appropriate multiplicative
  1107	// factor for use in Rabin-Karp algorithm.
  1108	func hashStr(sep []byte) (uint32, uint32) {
  1109		hash := uint32(0)
  1110		for i := 0; i < len(sep); i++ {
  1111			hash = hash*primeRK + uint32(sep[i])
  1112		}
  1113		var pow, sq uint32 = 1, primeRK
  1114		for i := len(sep); i > 0; i >>= 1 {
  1115			if i&1 != 0 {
  1116				pow *= sq
  1117			}
  1118			sq *= sq
  1119		}
  1120		return hash, pow
  1121	}
  1122	
  1123	// hashStrRev returns the hash of the reverse of sep and the
  1124	// appropriate multiplicative factor for use in Rabin-Karp algorithm.
  1125	func hashStrRev(sep []byte) (uint32, uint32) {
  1126		hash := uint32(0)
  1127		for i := len(sep) - 1; i >= 0; i-- {
  1128			hash = hash*primeRK + uint32(sep[i])
  1129		}
  1130		var pow, sq uint32 = 1, primeRK
  1131		for i := len(sep); i > 0; i >>= 1 {
  1132			if i&1 != 0 {
  1133				pow *= sq
  1134			}
  1135			sq *= sq
  1136		}
  1137		return hash, pow
  1138	}
  1139	

View as plain text