...

Source file src/strconv/quote.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	//go:generate go run makeisprint.go -output isprint.go
     6	
     7	package strconv
     8	
     9	import (
    10		"internal/bytealg"
    11		"unicode/utf8"
    12	)
    13	
// Hexadecimal digit alphabets. lowerhex is indexed with a 4-bit value to
// produce the digits of the \x, \u and \U escapes written by the quoting
// functions in this file.
const (
	lowerhex = "0123456789abcdef"
	upperhex = "0123456789ABCDEF"
)
    18	
    19	func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    20		return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    21	}
    22	
    23	func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    24		return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    25	}
    26	
    27	func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    28		// Often called with big strings, so preallocate. If there's quoting,
    29		// this is conservative but still helps a lot.
    30		if cap(buf)-len(buf) < len(s) {
    31			nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    32			copy(nBuf, buf)
    33			buf = nBuf
    34		}
    35		buf = append(buf, quote)
    36		for width := 0; len(s) > 0; s = s[width:] {
    37			r := rune(s[0])
    38			width = 1
    39			if r >= utf8.RuneSelf {
    40				r, width = utf8.DecodeRuneInString(s)
    41			}
    42			if width == 1 && r == utf8.RuneError {
    43				buf = append(buf, `\x`...)
    44				buf = append(buf, lowerhex[s[0]>>4])
    45				buf = append(buf, lowerhex[s[0]&0xF])
    46				continue
    47			}
    48			buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    49		}
    50		buf = append(buf, quote)
    51		return buf
    52	}
    53	
    54	func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    55		buf = append(buf, quote)
    56		if !utf8.ValidRune(r) {
    57			r = utf8.RuneError
    58		}
    59		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    60		buf = append(buf, quote)
    61		return buf
    62	}
    63	
    64	func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    65		var runeTmp [utf8.UTFMax]byte
    66		if r == rune(quote) || r == '\\' { // always backslashed
    67			buf = append(buf, '\\')
    68			buf = append(buf, byte(r))
    69			return buf
    70		}
    71		if ASCIIonly {
    72			if r < utf8.RuneSelf && IsPrint(r) {
    73				buf = append(buf, byte(r))
    74				return buf
    75			}
    76		} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    77			n := utf8.EncodeRune(runeTmp[:], r)
    78			buf = append(buf, runeTmp[:n]...)
    79			return buf
    80		}
    81		switch r {
    82		case '\a':
    83			buf = append(buf, `\a`...)
    84		case '\b':
    85			buf = append(buf, `\b`...)
    86		case '\f':
    87			buf = append(buf, `\f`...)
    88		case '\n':
    89			buf = append(buf, `\n`...)
    90		case '\r':
    91			buf = append(buf, `\r`...)
    92		case '\t':
    93			buf = append(buf, `\t`...)
    94		case '\v':
    95			buf = append(buf, `\v`...)
    96		default:
    97			switch {
    98			case r < ' ':
    99				buf = append(buf, `\x`...)
   100				buf = append(buf, lowerhex[byte(r)>>4])
   101				buf = append(buf, lowerhex[byte(r)&0xF])
   102			case r > utf8.MaxRune:
   103				r = 0xFFFD
   104				fallthrough
   105			case r < 0x10000:
   106				buf = append(buf, `\u`...)
   107				for s := 12; s >= 0; s -= 4 {
   108					buf = append(buf, lowerhex[r>>uint(s)&0xF])
   109				}
   110			default:
   111				buf = append(buf, `\U`...)
   112				for s := 28; s >= 0; s -= 4 {
   113					buf = append(buf, lowerhex[r>>uint(s)&0xF])
   114				}
   115			}
   116		}
   117		return buf
   118	}
   119	
   120	// Quote returns a double-quoted Go string literal representing s. The
   121	// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   122	// control characters and non-printable characters as defined by
   123	// IsPrint.
   124	func Quote(s string) string {
   125		return quoteWith(s, '"', false, false)
   126	}
   127	
   128	// AppendQuote appends a double-quoted Go string literal representing s,
   129	// as generated by Quote, to dst and returns the extended buffer.
   130	func AppendQuote(dst []byte, s string) []byte {
   131		return appendQuotedWith(dst, s, '"', false, false)
   132	}
   133	
   134	// QuoteToASCII returns a double-quoted Go string literal representing s.
   135	// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   136	// non-ASCII characters and non-printable characters as defined by IsPrint.
   137	func QuoteToASCII(s string) string {
   138		return quoteWith(s, '"', true, false)
   139	}
   140	
   141	// AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   142	// as generated by QuoteToASCII, to dst and returns the extended buffer.
   143	func AppendQuoteToASCII(dst []byte, s string) []byte {
   144		return appendQuotedWith(dst, s, '"', true, false)
   145	}
   146	
   147	// QuoteToGraphic returns a double-quoted Go string literal representing s.
   148	// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   149	// non-ASCII characters and non-printable characters as defined by IsGraphic.
   150	func QuoteToGraphic(s string) string {
   151		return quoteWith(s, '"', false, true)
   152	}
   153	
   154	// AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   155	// as generated by QuoteToGraphic, to dst and returns the extended buffer.
   156	func AppendQuoteToGraphic(dst []byte, s string) []byte {
   157		return appendQuotedWith(dst, s, '"', false, true)
   158	}
   159	
   160	// QuoteRune returns a single-quoted Go character literal representing the
   161	// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   162	// for control characters and non-printable characters as defined by IsPrint.
   163	func QuoteRune(r rune) string {
   164		return quoteRuneWith(r, '\'', false, false)
   165	}
   166	
   167	// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   168	// as generated by QuoteRune, to dst and returns the extended buffer.
   169	func AppendQuoteRune(dst []byte, r rune) []byte {
   170		return appendQuotedRuneWith(dst, r, '\'', false, false)
   171	}
   172	
   173	// QuoteRuneToASCII returns a single-quoted Go character literal representing
   174	// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   175	// \u0100) for non-ASCII characters and non-printable characters as defined
   176	// by IsPrint.
   177	func QuoteRuneToASCII(r rune) string {
   178		return quoteRuneWith(r, '\'', true, false)
   179	}
   180	
   181	// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   182	// as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   183	func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   184		return appendQuotedRuneWith(dst, r, '\'', true, false)
   185	}
   186	
   187	// QuoteRuneToGraphic returns a single-quoted Go character literal representing
   188	// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   189	// \u0100) for non-ASCII characters and non-printable characters as defined
   190	// by IsGraphic.
   191	func QuoteRuneToGraphic(r rune) string {
   192		return quoteRuneWith(r, '\'', false, true)
   193	}
   194	
   195	// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   196	// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   197	func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   198		return appendQuotedRuneWith(dst, r, '\'', false, true)
   199	}
   200	
   201	// CanBackquote reports whether the string s can be represented
   202	// unchanged as a single-line backquoted string without control
   203	// characters other than tab.
   204	func CanBackquote(s string) bool {
   205		for len(s) > 0 {
   206			r, wid := utf8.DecodeRuneInString(s)
   207			s = s[wid:]
   208			if wid > 1 {
   209				if r == '\ufeff' {
   210					return false // BOMs are invisible and should not be quoted.
   211				}
   212				continue // All other multibyte runes are correctly encoded and assumed printable.
   213			}
   214			if r == utf8.RuneError {
   215				return false
   216			}
   217			if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   218				return false
   219			}
   220		}
   221		return true
   222	}
   223	
   224	func unhex(b byte) (v rune, ok bool) {
   225		c := rune(b)
   226		switch {
   227		case '0' <= c && c <= '9':
   228			return c - '0', true
   229		case 'a' <= c && c <= 'f':
   230			return c - 'a' + 10, true
   231		case 'A' <= c && c <= 'F':
   232			return c - 'A' + 10, true
   233		}
   234		return
   235	}
   236	
   237	// UnquoteChar decodes the first character or byte in the escaped string
   238	// or character literal represented by the string s.
   239	// It returns four values:
   240	//
   241	//	1) value, the decoded Unicode code point or byte value;
   242	//	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   243	//	3) tail, the remainder of the string after the character; and
   244	//	4) an error that will be nil if the character is syntactically valid.
   245	//
   246	// The second argument, quote, specifies the type of literal being parsed
   247	// and therefore which escaped quote character is permitted.
   248	// If set to a single quote, it permits the sequence \' and disallows unescaped '.
   249	// If set to a double quote, it permits \" and disallows unescaped ".
   250	// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   251	func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   252		// easy cases
   253		if len(s) == 0 {
   254			err = ErrSyntax
   255			return
   256		}
   257		switch c := s[0]; {
   258		case c == quote && (quote == '\'' || quote == '"'):
   259			err = ErrSyntax
   260			return
   261		case c >= utf8.RuneSelf:
   262			r, size := utf8.DecodeRuneInString(s)
   263			return r, true, s[size:], nil
   264		case c != '\\':
   265			return rune(s[0]), false, s[1:], nil
   266		}
   267	
   268		// hard case: c is backslash
   269		if len(s) <= 1 {
   270			err = ErrSyntax
   271			return
   272		}
   273		c := s[1]
   274		s = s[2:]
   275	
   276		switch c {
   277		case 'a':
   278			value = '\a'
   279		case 'b':
   280			value = '\b'
   281		case 'f':
   282			value = '\f'
   283		case 'n':
   284			value = '\n'
   285		case 'r':
   286			value = '\r'
   287		case 't':
   288			value = '\t'
   289		case 'v':
   290			value = '\v'
   291		case 'x', 'u', 'U':
   292			n := 0
   293			switch c {
   294			case 'x':
   295				n = 2
   296			case 'u':
   297				n = 4
   298			case 'U':
   299				n = 8
   300			}
   301			var v rune
   302			if len(s) < n {
   303				err = ErrSyntax
   304				return
   305			}
   306			for j := 0; j < n; j++ {
   307				x, ok := unhex(s[j])
   308				if !ok {
   309					err = ErrSyntax
   310					return
   311				}
   312				v = v<<4 | x
   313			}
   314			s = s[n:]
   315			if c == 'x' {
   316				// single-byte string, possibly not UTF-8
   317				value = v
   318				break
   319			}
   320			if v > utf8.MaxRune {
   321				err = ErrSyntax
   322				return
   323			}
   324			value = v
   325			multibyte = true
   326		case '0', '1', '2', '3', '4', '5', '6', '7':
   327			v := rune(c) - '0'
   328			if len(s) < 2 {
   329				err = ErrSyntax
   330				return
   331			}
   332			for j := 0; j < 2; j++ { // one digit already; two more
   333				x := rune(s[j]) - '0'
   334				if x < 0 || x > 7 {
   335					err = ErrSyntax
   336					return
   337				}
   338				v = (v << 3) | x
   339			}
   340			s = s[2:]
   341			if v > 255 {
   342				err = ErrSyntax
   343				return
   344			}
   345			value = v
   346		case '\\':
   347			value = '\\'
   348		case '\'', '"':
   349			if c != quote {
   350				err = ErrSyntax
   351				return
   352			}
   353			value = rune(c)
   354		default:
   355			err = ErrSyntax
   356			return
   357		}
   358		tail = s
   359		return
   360	}
   361	
   362	// Unquote interprets s as a single-quoted, double-quoted,
   363	// or backquoted Go string literal, returning the string value
   364	// that s quotes.  (If s is single-quoted, it would be a Go
   365	// character literal; Unquote returns the corresponding
   366	// one-character string.)
   367	func Unquote(s string) (string, error) {
   368		n := len(s)
   369		if n < 2 {
   370			return "", ErrSyntax
   371		}
   372		quote := s[0]
   373		if quote != s[n-1] {
   374			return "", ErrSyntax
   375		}
   376		s = s[1 : n-1]
   377	
   378		if quote == '`' {
   379			if contains(s, '`') {
   380				return "", ErrSyntax
   381			}
   382			if contains(s, '\r') {
   383				// -1 because we know there is at least one \r to remove.
   384				buf := make([]byte, 0, len(s)-1)
   385				for i := 0; i < len(s); i++ {
   386					if s[i] != '\r' {
   387						buf = append(buf, s[i])
   388					}
   389				}
   390				return string(buf), nil
   391			}
   392			return s, nil
   393		}
   394		if quote != '"' && quote != '\'' {
   395			return "", ErrSyntax
   396		}
   397		if contains(s, '\n') {
   398			return "", ErrSyntax
   399		}
   400	
   401		// Is it trivial? Avoid allocation.
   402		if !contains(s, '\\') && !contains(s, quote) {
   403			switch quote {
   404			case '"':
   405				if utf8.ValidString(s) {
   406					return s, nil
   407				}
   408			case '\'':
   409				r, size := utf8.DecodeRuneInString(s)
   410				if size == len(s) && (r != utf8.RuneError || size != 1) {
   411					return s, nil
   412				}
   413			}
   414		}
   415	
   416		var runeTmp [utf8.UTFMax]byte
   417		buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   418		for len(s) > 0 {
   419			c, multibyte, ss, err := UnquoteChar(s, quote)
   420			if err != nil {
   421				return "", err
   422			}
   423			s = ss
   424			if c < utf8.RuneSelf || !multibyte {
   425				buf = append(buf, byte(c))
   426			} else {
   427				n := utf8.EncodeRune(runeTmp[:], c)
   428				buf = append(buf, runeTmp[:n]...)
   429			}
   430			if quote == '\'' && len(s) != 0 {
   431				// single-quoted must be single character
   432				return "", ErrSyntax
   433			}
   434		}
   435		return string(buf), nil
   436	}
   437	
   438	// contains reports whether the string contains the byte c.
   439	func contains(s string, c byte) bool {
   440		return bytealg.IndexByteString(s, c) != -1
   441	}
   442	
   443	// bsearch16 returns the smallest i such that a[i] >= x.
   444	// If there is no such i, bsearch16 returns len(a).
   445	func bsearch16(a []uint16, x uint16) int {
   446		i, j := 0, len(a)
   447		for i < j {
   448			h := i + (j-i)/2
   449			if a[h] < x {
   450				i = h + 1
   451			} else {
   452				j = h
   453			}
   454		}
   455		return i
   456	}
   457	
   458	// bsearch32 returns the smallest i such that a[i] >= x.
   459	// If there is no such i, bsearch32 returns len(a).
   460	func bsearch32(a []uint32, x uint32) int {
   461		i, j := 0, len(a)
   462		for i < j {
   463			h := i + (j-i)/2
   464			if a[h] < x {
   465				i = h + 1
   466			} else {
   467				j = h
   468			}
   469		}
   470		return i
   471	}
   472	
   473	// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   474	// to give the same answer. It allows this package not to depend on unicode,
   475	// and therefore not pull in all the Unicode tables. If the linker were better
   476	// at tossing unused tables, we could get rid of this implementation.
   477	// That would be nice.
   478	
   479	// IsPrint reports whether the rune is defined as printable by Go, with
   480	// the same definition as unicode.IsPrint: letters, numbers, punctuation,
   481	// symbols and ASCII space.
   482	func IsPrint(r rune) bool {
   483		// Fast check for Latin-1
   484		if r <= 0xFF {
   485			if 0x20 <= r && r <= 0x7E {
   486				// All the ASCII is printable from space through DEL-1.
   487				return true
   488			}
   489			if 0xA1 <= r && r <= 0xFF {
   490				// Similarly for ¡ through ÿ...
   491				return r != 0xAD // ...except for the bizarre soft hyphen.
   492			}
   493			return false
   494		}
   495	
   496		// Same algorithm, either on uint16 or uint32 value.
   497		// First, find first i such that isPrint[i] >= x.
   498		// This is the index of either the start or end of a pair that might span x.
   499		// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   500		// If we find x in a range, make sure x is not in isNotPrint list.
   501	
   502		if 0 <= r && r < 1<<16 {
   503			rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   504			i := bsearch16(isPrint, rr)
   505			if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   506				return false
   507			}
   508			j := bsearch16(isNotPrint, rr)
   509			return j >= len(isNotPrint) || isNotPrint[j] != rr
   510		}
   511	
   512		rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   513		i := bsearch32(isPrint, rr)
   514		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   515			return false
   516		}
   517		if r >= 0x20000 {
   518			return true
   519		}
   520		r -= 0x10000
   521		j := bsearch16(isNotPrint, uint16(r))
   522		return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   523	}
   524	
   525	// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   526	// characters include letters, marks, numbers, punctuation, symbols, and
   527	// spaces, from categories L, M, N, P, S, and Zs.
   528	func IsGraphic(r rune) bool {
   529		if IsPrint(r) {
   530			return true
   531		}
   532		return isInGraphicList(r)
   533	}
   534	
   535	// isInGraphicList reports whether the rune is in the isGraphic list. This separation
   536	// from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   537	// Should be called only if IsPrint fails.
   538	func isInGraphicList(r rune) bool {
   539		// We know r must fit in 16 bits - see makeisprint.go.
   540		if r > 0xFFFF {
   541			return false
   542		}
   543		rr := uint16(r)
   544		i := bsearch16(isGraphic, rr)
   545		return i < len(isGraphic) && rr == isGraphic[i]
   546	}
   547	

View as plain text