Source file src/unicode/utf8/utf8.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package utf8 implements functions and constants to support text encoded in
     6	// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
     7	package utf8
     8	
     9	// The conditions RuneError==unicode.ReplacementChar and
    10	// MaxRune==unicode.MaxRune are verified in the tests.
    11	// Defining them locally avoids this package depending on package unicode.
    12	
    13	// Numbers fundamental to the encoding.
    14	const (
    15		RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
    16		RuneSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
    17		MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
    18		UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
    19	)
    20	
    21	// Code points in the surrogate range are not valid for UTF-8.
    22	const (
    23		surrogateMin = 0xD800
    24		surrogateMax = 0xDFFF
    25	)
    26	
    27	const (
    28		t1 = 0b00000000
    29		tx = 0b10000000
    30		t2 = 0b11000000
    31		t3 = 0b11100000
    32		t4 = 0b11110000
    33		t5 = 0b11111000
    34	
    35		maskx = 0b00111111
    36		mask2 = 0b00011111
    37		mask3 = 0b00001111
    38		mask4 = 0b00000111
    39	
    40		rune1Max = 1<<7 - 1
    41		rune2Max = 1<<11 - 1
    42		rune3Max = 1<<16 - 1
    43	
    44		// The default lowest and highest continuation byte.
    45		locb = 0b10000000
    46		hicb = 0b10111111
    47	
    48		// These names of these constants are chosen to give nice alignment in the
    49		// table below. The first nibble is an index into acceptRanges or F for
    50		// special one-byte cases. The second nibble is the Rune length or the
    51		// Status for the special one-byte case.
    52		xx = 0xF1 // invalid: size 1
    53		as = 0xF0 // ASCII: size 1
    54		s1 = 0x02 // accept 0, size 2
    55		s2 = 0x13 // accept 1, size 3
    56		s3 = 0x03 // accept 0, size 3
    57		s4 = 0x23 // accept 2, size 3
    58		s5 = 0x34 // accept 3, size 4
    59		s6 = 0x04 // accept 0, size 4
    60		s7 = 0x44 // accept 4, size 4
    61	)
    62	
    63	// first is information about the first byte in a UTF-8 sequence.
    64	var first = [256]uint8{
    65		//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    66		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
    67		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
    68		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
    69		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
    70		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
    71		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
    72		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
    73		as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
    74		//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    75		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
    76		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
    77		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
    78		xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
    79		xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
    80		s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
    81		s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
    82		s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
    83	}
    84	
    85	// acceptRange gives the range of valid values for the second byte in a UTF-8
    86	// sequence.
    87	type acceptRange struct {
    88		lo uint8 // lowest value for second byte.
    89		hi uint8 // highest value for second byte.
    90	}
    91	
    92	// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
    93	var acceptRanges = [16]acceptRange{
    94		0: {locb, hicb},
    95		1: {0xA0, hicb},
    96		2: {locb, 0x9F},
    97		3: {0x90, hicb},
    98		4: {locb, 0x8F},
    99	}
   100	
   101	// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
   102	// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
   103	func FullRune(p []byte) bool {
   104		n := len(p)
   105		if n == 0 {
   106			return false
   107		}
   108		x := first[p[0]]
   109		if n >= int(x&7) {
   110			return true // ASCII, invalid or valid.
   111		}
   112		// Must be short or invalid.
   113		accept := acceptRanges[x>>4]
   114		if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
   115			return true
   116		} else if n > 2 && (p[2] < locb || hicb < p[2]) {
   117			return true
   118		}
   119		return false
   120	}
   121	
   122	// FullRuneInString is like FullRune but its input is a string.
   123	func FullRuneInString(s string) bool {
   124		n := len(s)
   125		if n == 0 {
   126			return false
   127		}
   128		x := first[s[0]]
   129		if n >= int(x&7) {
   130			return true // ASCII, invalid, or valid.
   131		}
   132		// Must be short or invalid.
   133		accept := acceptRanges[x>>4]
   134		if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
   135			return true
   136		} else if n > 2 && (s[2] < locb || hicb < s[2]) {
   137			return true
   138		}
   139		return false
   140	}
   141	
   142	// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and
   143	// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   144	// the encoding is invalid, it returns (RuneError, 1). Both are impossible
   145	// results for correct, non-empty UTF-8.
   146	//
   147	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   148	// out of range, or is not the shortest possible UTF-8 encoding for the
   149	// value. No other validation is performed.
   150	func DecodeRune(p []byte) (r rune, size int) {
   151		n := len(p)
   152		if n < 1 {
   153			return RuneError, 0
   154		}
   155		p0 := p[0]
   156		x := first[p0]
   157		if x >= as {
   158			// The following code simulates an additional check for x == xx and
   159			// handling the ASCII and invalid cases accordingly. This mask-and-or
   160			// approach prevents an additional branch.
   161			mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   162			return rune(p[0])&^mask | RuneError&mask, 1
   163		}
   164		sz := int(x & 7)
   165		accept := acceptRanges[x>>4]
   166		if n < sz {
   167			return RuneError, 1
   168		}
   169		b1 := p[1]
   170		if b1 < accept.lo || accept.hi < b1 {
   171			return RuneError, 1
   172		}
   173		if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
   174			return rune(p0&mask2)<<6 | rune(b1&maskx), 2
   175		}
   176		b2 := p[2]
   177		if b2 < locb || hicb < b2 {
   178			return RuneError, 1
   179		}
   180		if sz <= 3 {
   181			return rune(p0&mask3)<<12 | rune(b1&maskx)<<6 | rune(b2&maskx), 3
   182		}
   183		b3 := p[3]
   184		if b3 < locb || hicb < b3 {
   185			return RuneError, 1
   186		}
   187		return rune(p0&mask4)<<18 | rune(b1&maskx)<<12 | rune(b2&maskx)<<6 | rune(b3&maskx), 4
   188	}
   189	
   190	// DecodeRuneInString is like DecodeRune but its input is a string. If s is
   191	// empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it
   192	// returns (RuneError, 1). Both are impossible results for correct, non-empty
   193	// UTF-8.
   194	//
   195	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   196	// out of range, or is not the shortest possible UTF-8 encoding for the
   197	// value. No other validation is performed.
   198	func DecodeRuneInString(s string) (r rune, size int) {
   199		n := len(s)
   200		if n < 1 {
   201			return RuneError, 0
   202		}
   203		s0 := s[0]
   204		x := first[s0]
   205		if x >= as {
   206			// The following code simulates an additional check for x == xx and
   207			// handling the ASCII and invalid cases accordingly. This mask-and-or
   208			// approach prevents an additional branch.
   209			mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
   210			return rune(s[0])&^mask | RuneError&mask, 1
   211		}
   212		sz := int(x & 7)
   213		accept := acceptRanges[x>>4]
   214		if n < sz {
   215			return RuneError, 1
   216		}
   217		s1 := s[1]
   218		if s1 < accept.lo || accept.hi < s1 {
   219			return RuneError, 1
   220		}
   221		if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
   222			return rune(s0&mask2)<<6 | rune(s1&maskx), 2
   223		}
   224		s2 := s[2]
   225		if s2 < locb || hicb < s2 {
   226			return RuneError, 1
   227		}
   228		if sz <= 3 {
   229			return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
   230		}
   231		s3 := s[3]
   232		if s3 < locb || hicb < s3 {
   233			return RuneError, 1
   234		}
   235		return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
   236	}
   237	
   238	// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
   239	// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if
   240	// the encoding is invalid, it returns (RuneError, 1). Both are impossible
   241	// results for correct, non-empty UTF-8.
   242	//
   243	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   244	// out of range, or is not the shortest possible UTF-8 encoding for the
   245	// value. No other validation is performed.
   246	func DecodeLastRune(p []byte) (r rune, size int) {
   247		end := len(p)
   248		if end == 0 {
   249			return RuneError, 0
   250		}
   251		start := end - 1
   252		r = rune(p[start])
   253		if r < RuneSelf {
   254			return r, 1
   255		}
   256		// guard against O(n^2) behavior when traversing
   257		// backwards through strings with long sequences of
   258		// invalid UTF-8.
   259		lim := end - UTFMax
   260		if lim < 0 {
   261			lim = 0
   262		}
   263		for start--; start >= lim; start-- {
   264			if RuneStart(p[start]) {
   265				break
   266			}
   267		}
   268		if start < 0 {
   269			start = 0
   270		}
   271		r, size = DecodeRune(p[start:end])
   272		if start+size != end {
   273			return RuneError, 1
   274		}
   275		return r, size
   276	}
   277	
   278	// DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
   279	// s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,
   280	// it returns (RuneError, 1). Both are impossible results for correct,
   281	// non-empty UTF-8.
   282	//
   283	// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
   284	// out of range, or is not the shortest possible UTF-8 encoding for the
   285	// value. No other validation is performed.
   286	func DecodeLastRuneInString(s string) (r rune, size int) {
   287		end := len(s)
   288		if end == 0 {
   289			return RuneError, 0
   290		}
   291		start := end - 1
   292		r = rune(s[start])
   293		if r < RuneSelf {
   294			return r, 1
   295		}
   296		// guard against O(n^2) behavior when traversing
   297		// backwards through strings with long sequences of
   298		// invalid UTF-8.
   299		lim := end - UTFMax
   300		if lim < 0 {
   301			lim = 0
   302		}
   303		for start--; start >= lim; start-- {
   304			if RuneStart(s[start]) {
   305				break
   306			}
   307		}
   308		if start < 0 {
   309			start = 0
   310		}
   311		r, size = DecodeRuneInString(s[start:end])
   312		if start+size != end {
   313			return RuneError, 1
   314		}
   315		return r, size
   316	}
   317	
   318	// RuneLen returns the number of bytes required to encode the rune.
   319	// It returns -1 if the rune is not a valid value to encode in UTF-8.
   320	func RuneLen(r rune) int {
   321		switch {
   322		case r < 0:
   323			return -1
   324		case r <= rune1Max:
   325			return 1
   326		case r <= rune2Max:
   327			return 2
   328		case surrogateMin <= r && r <= surrogateMax:
   329			return -1
   330		case r <= rune3Max:
   331			return 3
   332		case r <= MaxRune:
   333			return 4
   334		}
   335		return -1
   336	}
   337	
   338	// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
   339	// It returns the number of bytes written.
   340	func EncodeRune(p []byte, r rune) int {
   341		// Negative values are erroneous. Making it unsigned addresses the problem.
   342		switch i := uint32(r); {
   343		case i <= rune1Max:
   344			p[0] = byte(r)
   345			return 1
   346		case i <= rune2Max:
   347			_ = p[1] // eliminate bounds checks
   348			p[0] = t2 | byte(r>>6)
   349			p[1] = tx | byte(r)&maskx
   350			return 2
   351		case i > MaxRune, surrogateMin <= i && i <= surrogateMax:
   352			r = RuneError
   353			fallthrough
   354		case i <= rune3Max:
   355			_ = p[2] // eliminate bounds checks
   356			p[0] = t3 | byte(r>>12)
   357			p[1] = tx | byte(r>>6)&maskx
   358			p[2] = tx | byte(r)&maskx
   359			return 3
   360		default:
   361			_ = p[3] // eliminate bounds checks
   362			p[0] = t4 | byte(r>>18)
   363			p[1] = tx | byte(r>>12)&maskx
   364			p[2] = tx | byte(r>>6)&maskx
   365			p[3] = tx | byte(r)&maskx
   366			return 4
   367		}
   368	}
   369	
   370	// RuneCount returns the number of runes in p. Erroneous and short
   371	// encodings are treated as single runes of width 1 byte.
   372	func RuneCount(p []byte) int {
   373		np := len(p)
   374		var n int
   375		for i := 0; i < np; {
   376			n++
   377			c := p[i]
   378			if c < RuneSelf {
   379				// ASCII fast path
   380				i++
   381				continue
   382			}
   383			x := first[c]
   384			if x == xx {
   385				i++ // invalid.
   386				continue
   387			}
   388			size := int(x & 7)
   389			if i+size > np {
   390				i++ // Short or invalid.
   391				continue
   392			}
   393			accept := acceptRanges[x>>4]
   394			if c := p[i+1]; c < accept.lo || accept.hi < c {
   395				size = 1
   396			} else if size == 2 {
   397			} else if c := p[i+2]; c < locb || hicb < c {
   398				size = 1
   399			} else if size == 3 {
   400			} else if c := p[i+3]; c < locb || hicb < c {
   401				size = 1
   402			}
   403			i += size
   404		}
   405		return n
   406	}
   407	
   408	// RuneCountInString is like RuneCount but its input is a string.
   409	func RuneCountInString(s string) (n int) {
   410		ns := len(s)
   411		for i := 0; i < ns; n++ {
   412			c := s[i]
   413			if c < RuneSelf {
   414				// ASCII fast path
   415				i++
   416				continue
   417			}
   418			x := first[c]
   419			if x == xx {
   420				i++ // invalid.
   421				continue
   422			}
   423			size := int(x & 7)
   424			if i+size > ns {
   425				i++ // Short or invalid.
   426				continue
   427			}
   428			accept := acceptRanges[x>>4]
   429			if c := s[i+1]; c < accept.lo || accept.hi < c {
   430				size = 1
   431			} else if size == 2 {
   432			} else if c := s[i+2]; c < locb || hicb < c {
   433				size = 1
   434			} else if size == 3 {
   435			} else if c := s[i+3]; c < locb || hicb < c {
   436				size = 1
   437			}
   438			i += size
   439		}
   440		return n
   441	}
   442	
   443	// RuneStart reports whether the byte could be the first byte of an encoded,
   444	// possibly invalid rune. Second and subsequent bytes always have the top two
   445	// bits set to 10.
   446	func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
   447	
   448	// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
   449	func Valid(p []byte) bool {
   450		n := len(p)
   451		for i := 0; i < n; {
   452			pi := p[i]
   453			if pi < RuneSelf {
   454				i++
   455				continue
   456			}
   457			x := first[pi]
   458			if x == xx {
   459				return false // Illegal starter byte.
   460			}
   461			size := int(x & 7)
   462			if i+size > n {
   463				return false // Short or invalid.
   464			}
   465			accept := acceptRanges[x>>4]
   466			if c := p[i+1]; c < accept.lo || accept.hi < c {
   467				return false
   468			} else if size == 2 {
   469			} else if c := p[i+2]; c < locb || hicb < c {
   470				return false
   471			} else if size == 3 {
   472			} else if c := p[i+3]; c < locb || hicb < c {
   473				return false
   474			}
   475			i += size
   476		}
   477		return true
   478	}
   479	
   480	// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
   481	func ValidString(s string) bool {
   482		n := len(s)
   483		for i := 0; i < n; {
   484			si := s[i]
   485			if si < RuneSelf {
   486				i++
   487				continue
   488			}
   489			x := first[si]
   490			if x == xx {
   491				return false // Illegal starter byte.
   492			}
   493			size := int(x & 7)
   494			if i+size > n {
   495				return false // Short or invalid.
   496			}
   497			accept := acceptRanges[x>>4]
   498			if c := s[i+1]; c < accept.lo || accept.hi < c {
   499				return false
   500			} else if size == 2 {
   501			} else if c := s[i+2]; c < locb || hicb < c {
   502				return false
   503			} else if size == 3 {
   504			} else if c := s[i+3]; c < locb || hicb < c {
   505				return false
   506			}
   507			i += size
   508		}
   509		return true
   510	}
   511	
   512	// ValidRune reports whether r can be legally encoded as UTF-8.
   513	// Code points that are out of range or a surrogate half are illegal.
   514	func ValidRune(r rune) bool {
   515		switch {
   516		case 0 <= r && r < surrogateMin:
   517			return true
   518		case surrogateMax < r && r <= MaxRune:
   519			return true
   520		}
   521		return false
   522	}
   523
View as plain text