...

Source file src/pkg/vendor/golang.org/x/text/unicode/norm/normalize.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Note: the file data_test.go that is generated should not be checked in.
     6	//go:generate go run maketables.go triegen.go
     7	//go:generate go test -tags test
     8	
     9	// Package norm contains types and functions for normalizing Unicode strings.
    10	package norm // import "golang.org/x/text/unicode/norm"
    11	
    12	import (
    13		"unicode/utf8"
    14	
    15		"golang.org/x/text/transform"
    16	)
    17	
    18	// A Form denotes a canonical representation of Unicode code points.
    19	// The Unicode-defined normalization and equivalence forms are:
    20	//
    21	//   NFC   Unicode Normalization Form C
    22	//   NFD   Unicode Normalization Form D
    23	//   NFKC  Unicode Normalization Form KC
    24	//   NFKD  Unicode Normalization Form KD
    25	//
    26	// For a Form f, this documentation uses the notation f(x) to mean
    27	// the bytes or string x converted to the given form.
    28	// A position n in x is called a boundary if conversion to the form can
    29	// proceed independently on both sides:
    30	//   f(x) == append(f(x[0:n]), f(x[n:])...)
    31	//
    32	// References: https://unicode.org/reports/tr15/ and
    33	// https://unicode.org/notes/tn5/.
    34	type Form int
    35	
    36	const (
    37		NFC Form = iota
    38		NFD
    39		NFKC
    40		NFKD
    41	)
    42	
    43	// Bytes returns f(b). May return b if f(b) = b.
    44	func (f Form) Bytes(b []byte) []byte {
    45		src := inputBytes(b)
    46		ft := formTable[f]
    47		n, ok := ft.quickSpan(src, 0, len(b), true)
    48		if ok {
    49			return b
    50		}
    51		out := make([]byte, n, len(b))
    52		copy(out, b[0:n])
    53		rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
    54		return doAppendInner(&rb, n)
    55	}
    56	
    57	// String returns f(s).
    58	func (f Form) String(s string) string {
    59		src := inputString(s)
    60		ft := formTable[f]
    61		n, ok := ft.quickSpan(src, 0, len(s), true)
    62		if ok {
    63			return s
    64		}
    65		out := make([]byte, n, len(s))
    66		copy(out, s[0:n])
    67		rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
    68		return string(doAppendInner(&rb, n))
    69	}
    70	
    71	// IsNormal returns true if b == f(b).
    72	func (f Form) IsNormal(b []byte) bool {
    73		src := inputBytes(b)
    74		ft := formTable[f]
    75		bp, ok := ft.quickSpan(src, 0, len(b), true)
    76		if ok {
    77			return true
    78		}
    79		rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
    80		rb.setFlusher(nil, cmpNormalBytes)
    81		for bp < len(b) {
    82			rb.out = b[bp:]
    83			if bp = decomposeSegment(&rb, bp, true); bp < 0 {
    84				return false
    85			}
    86			bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
    87		}
    88		return true
    89	}
    90	
    91	func cmpNormalBytes(rb *reorderBuffer) bool {
    92		b := rb.out
    93		for i := 0; i < rb.nrune; i++ {
    94			info := rb.rune[i]
    95			if int(info.size) > len(b) {
    96				return false
    97			}
    98			p := info.pos
    99			pe := p + info.size
   100			for ; p < pe; p++ {
   101				if b[0] != rb.byte[p] {
   102					return false
   103				}
   104				b = b[1:]
   105			}
   106		}
   107		return true
   108	}
   109	
   110	// IsNormalString returns true if s == f(s).
   111	func (f Form) IsNormalString(s string) bool {
   112		src := inputString(s)
   113		ft := formTable[f]
   114		bp, ok := ft.quickSpan(src, 0, len(s), true)
   115		if ok {
   116			return true
   117		}
   118		rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
   119		rb.setFlusher(nil, func(rb *reorderBuffer) bool {
   120			for i := 0; i < rb.nrune; i++ {
   121				info := rb.rune[i]
   122				if bp+int(info.size) > len(s) {
   123					return false
   124				}
   125				p := info.pos
   126				pe := p + info.size
   127				for ; p < pe; p++ {
   128					if s[bp] != rb.byte[p] {
   129						return false
   130					}
   131					bp++
   132				}
   133			}
   134			return true
   135		})
   136		for bp < len(s) {
   137			if bp = decomposeSegment(&rb, bp, true); bp < 0 {
   138				return false
   139			}
   140			bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
   141		}
   142		return true
   143	}
   144	
   145	// patchTail fixes a case where a rune may be incorrectly normalized
   146	// if it is followed by illegal continuation bytes. It returns the
   147	// patched buffer and whether the decomposition is still in progress.
   148	func patchTail(rb *reorderBuffer) bool {
   149		info, p := lastRuneStart(&rb.f, rb.out)
   150		if p == -1 || info.size == 0 {
   151			return true
   152		}
   153		end := p + int(info.size)
   154		extra := len(rb.out) - end
   155		if extra > 0 {
   156			// Potentially allocating memory. However, this only
   157			// happens with ill-formed UTF-8.
   158			x := make([]byte, 0)
   159			x = append(x, rb.out[len(rb.out)-extra:]...)
   160			rb.out = rb.out[:end]
   161			decomposeToLastBoundary(rb)
   162			rb.doFlush()
   163			rb.out = append(rb.out, x...)
   164			return false
   165		}
   166		buf := rb.out[p:]
   167		rb.out = rb.out[:p]
   168		decomposeToLastBoundary(rb)
   169		if s := rb.ss.next(info); s == ssStarter {
   170			rb.doFlush()
   171			rb.ss.first(info)
   172		} else if s == ssOverflow {
   173			rb.doFlush()
   174			rb.insertCGJ()
   175			rb.ss = 0
   176		}
   177		rb.insertUnsafe(inputBytes(buf), 0, info)
   178		return true
   179	}
   180	
   181	func appendQuick(rb *reorderBuffer, i int) int {
   182		if rb.nsrc == i {
   183			return i
   184		}
   185		end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
   186		rb.out = rb.src.appendSlice(rb.out, i, end)
   187		return end
   188	}
   189	
   190	// Append returns f(append(out, b...)).
   191	// The buffer out must be nil, empty, or equal to f(out).
   192	func (f Form) Append(out []byte, src ...byte) []byte {
   193		return f.doAppend(out, inputBytes(src), len(src))
   194	}
   195	
   196	func (f Form) doAppend(out []byte, src input, n int) []byte {
   197		if n == 0 {
   198			return out
   199		}
   200		ft := formTable[f]
   201		// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
   202		if len(out) == 0 {
   203			p, _ := ft.quickSpan(src, 0, n, true)
   204			out = src.appendSlice(out, 0, p)
   205			if p == n {
   206				return out
   207			}
   208			rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
   209			return doAppendInner(&rb, p)
   210		}
   211		rb := reorderBuffer{f: *ft, src: src, nsrc: n}
   212		return doAppend(&rb, out, 0)
   213	}
   214	
   215	func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
   216		rb.setFlusher(out, appendFlush)
   217		src, n := rb.src, rb.nsrc
   218		doMerge := len(out) > 0
   219		if q := src.skipContinuationBytes(p); q > p {
   220			// Move leading non-starters to destination.
   221			rb.out = src.appendSlice(rb.out, p, q)
   222			p = q
   223			doMerge = patchTail(rb)
   224		}
   225		fd := &rb.f
   226		if doMerge {
   227			var info Properties
   228			if p < n {
   229				info = fd.info(src, p)
   230				if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
   231					if p == 0 {
   232						decomposeToLastBoundary(rb)
   233					}
   234					p = decomposeSegment(rb, p, true)
   235				}
   236			}
   237			if info.size == 0 {
   238				rb.doFlush()
   239				// Append incomplete UTF-8 encoding.
   240				return src.appendSlice(rb.out, p, n)
   241			}
   242			if rb.nrune > 0 {
   243				return doAppendInner(rb, p)
   244			}
   245		}
   246		p = appendQuick(rb, p)
   247		return doAppendInner(rb, p)
   248	}
   249	
   250	func doAppendInner(rb *reorderBuffer, p int) []byte {
   251		for n := rb.nsrc; p < n; {
   252			p = decomposeSegment(rb, p, true)
   253			p = appendQuick(rb, p)
   254		}
   255		return rb.out
   256	}
   257	
   258	// AppendString returns f(append(out, []byte(s))).
   259	// The buffer out must be nil, empty, or equal to f(out).
   260	func (f Form) AppendString(out []byte, src string) []byte {
   261		return f.doAppend(out, inputString(src), len(src))
   262	}
   263	
   264	// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
   265	// It is not guaranteed to return the largest such n.
   266	func (f Form) QuickSpan(b []byte) int {
   267		n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
   268		return n
   269	}
   270	
   271	// Span implements transform.SpanningTransformer. It returns a boundary n such
   272	// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
   273	func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
   274		n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
   275		if n < len(b) {
   276			if !ok {
   277				err = transform.ErrEndOfSpan
   278			} else {
   279				err = transform.ErrShortSrc
   280			}
   281		}
   282		return n, err
   283	}
   284	
   285	// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
   286	// It is not guaranteed to return the largest such n.
   287	func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
   288		n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
   289		if n < len(s) {
   290			if !ok {
   291				err = transform.ErrEndOfSpan
   292			} else {
   293				err = transform.ErrShortSrc
   294			}
   295		}
   296		return n, err
   297	}
   298	
   299	// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
   300	// whether any non-normalized parts were found. If atEOF is false, n will
   301	// not point past the last segment if this segment might be become
   302	// non-normalized by appending other runes.
   303	func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
   304		var lastCC uint8
   305		ss := streamSafe(0)
   306		lastSegStart := i
   307		for n = end; i < n; {
   308			if j := src.skipASCII(i, n); i != j {
   309				i = j
   310				lastSegStart = i - 1
   311				lastCC = 0
   312				ss = 0
   313				continue
   314			}
   315			info := f.info(src, i)
   316			if info.size == 0 {
   317				if atEOF {
   318					// include incomplete runes
   319					return n, true
   320				}
   321				return lastSegStart, true
   322			}
   323			// This block needs to be before the next, because it is possible to
   324			// have an overflow for runes that are starters (e.g. with U+FF9E).
   325			switch ss.next(info) {
   326			case ssStarter:
   327				lastSegStart = i
   328			case ssOverflow:
   329				return lastSegStart, false
   330			case ssSuccess:
   331				if lastCC > info.ccc {
   332					return lastSegStart, false
   333				}
   334			}
   335			if f.composing {
   336				if !info.isYesC() {
   337					break
   338				}
   339			} else {
   340				if !info.isYesD() {
   341					break
   342				}
   343			}
   344			lastCC = info.ccc
   345			i += int(info.size)
   346		}
   347		if i == n {
   348			if !atEOF {
   349				n = lastSegStart
   350			}
   351			return n, true
   352		}
   353		return lastSegStart, false
   354	}
   355	
   356	// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
   357	// It is not guaranteed to return the largest such n.
   358	func (f Form) QuickSpanString(s string) int {
   359		n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
   360		return n
   361	}
   362	
   363	// FirstBoundary returns the position i of the first boundary in b
   364	// or -1 if b contains no boundary.
   365	func (f Form) FirstBoundary(b []byte) int {
   366		return f.firstBoundary(inputBytes(b), len(b))
   367	}
   368	
   369	func (f Form) firstBoundary(src input, nsrc int) int {
   370		i := src.skipContinuationBytes(0)
   371		if i >= nsrc {
   372			return -1
   373		}
   374		fd := formTable[f]
   375		ss := streamSafe(0)
   376		// We should call ss.first here, but we can't as the first rune is
   377		// skipped already. This means FirstBoundary can't really determine
   378		// CGJ insertion points correctly. Luckily it doesn't have to.
   379		for {
   380			info := fd.info(src, i)
   381			if info.size == 0 {
   382				return -1
   383			}
   384			if s := ss.next(info); s != ssSuccess {
   385				return i
   386			}
   387			i += int(info.size)
   388			if i >= nsrc {
   389				if !info.BoundaryAfter() && !ss.isMax() {
   390					return -1
   391				}
   392				return nsrc
   393			}
   394		}
   395	}
   396	
   397	// FirstBoundaryInString returns the position i of the first boundary in s
   398	// or -1 if s contains no boundary.
   399	func (f Form) FirstBoundaryInString(s string) int {
   400		return f.firstBoundary(inputString(s), len(s))
   401	}
   402	
   403	// NextBoundary reports the index of the boundary between the first and next
   404	// segment in b or -1 if atEOF is false and there are not enough bytes to
   405	// determine this boundary.
   406	func (f Form) NextBoundary(b []byte, atEOF bool) int {
   407		return f.nextBoundary(inputBytes(b), len(b), atEOF)
   408	}
   409	
   410	// NextBoundaryInString reports the index of the boundary between the first and
   411	// next segment in b or -1 if atEOF is false and there are not enough bytes to
   412	// determine this boundary.
   413	func (f Form) NextBoundaryInString(s string, atEOF bool) int {
   414		return f.nextBoundary(inputString(s), len(s), atEOF)
   415	}
   416	
   417	func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
   418		if nsrc == 0 {
   419			if atEOF {
   420				return 0
   421			}
   422			return -1
   423		}
   424		fd := formTable[f]
   425		info := fd.info(src, 0)
   426		if info.size == 0 {
   427			if atEOF {
   428				return 1
   429			}
   430			return -1
   431		}
   432		ss := streamSafe(0)
   433		ss.first(info)
   434	
   435		for i := int(info.size); i < nsrc; i += int(info.size) {
   436			info = fd.info(src, i)
   437			if info.size == 0 {
   438				if atEOF {
   439					return i
   440				}
   441				return -1
   442			}
   443			// TODO: Using streamSafe to determine the boundary isn't the same as
   444			// using BoundaryBefore. Determine which should be used.
   445			if s := ss.next(info); s != ssSuccess {
   446				return i
   447			}
   448		}
   449		if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
   450			return -1
   451		}
   452		return nsrc
   453	}
   454	
   455	// LastBoundary returns the position i of the last boundary in b
   456	// or -1 if b contains no boundary.
   457	func (f Form) LastBoundary(b []byte) int {
   458		return lastBoundary(formTable[f], b)
   459	}
   460	
   461	func lastBoundary(fd *formInfo, b []byte) int {
   462		i := len(b)
   463		info, p := lastRuneStart(fd, b)
   464		if p == -1 {
   465			return -1
   466		}
   467		if info.size == 0 { // ends with incomplete rune
   468			if p == 0 { // starts with incomplete rune
   469				return -1
   470			}
   471			i = p
   472			info, p = lastRuneStart(fd, b[:i])
   473			if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
   474				return i
   475			}
   476		}
   477		if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
   478			return i
   479		}
   480		if info.BoundaryAfter() {
   481			return i
   482		}
   483		ss := streamSafe(0)
   484		v := ss.backwards(info)
   485		for i = p; i >= 0 && v != ssStarter; i = p {
   486			info, p = lastRuneStart(fd, b[:i])
   487			if v = ss.backwards(info); v == ssOverflow {
   488				break
   489			}
   490			if p+int(info.size) != i {
   491				if p == -1 { // no boundary found
   492					return -1
   493				}
   494				return i // boundary after an illegal UTF-8 encoding
   495			}
   496		}
   497		return i
   498	}
   499	
   500	// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
   501	// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
   502	// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
   503	func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
   504		// Force one character to be consumed.
   505		info := rb.f.info(rb.src, sp)
   506		if info.size == 0 {
   507			return 0
   508		}
   509		if s := rb.ss.next(info); s == ssStarter {
   510			// TODO: this could be removed if we don't support merging.
   511			if rb.nrune > 0 {
   512				goto end
   513			}
   514		} else if s == ssOverflow {
   515			rb.insertCGJ()
   516			goto end
   517		}
   518		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   519			return int(err)
   520		}
   521		for {
   522			sp += int(info.size)
   523			if sp >= rb.nsrc {
   524				if !atEOF && !info.BoundaryAfter() {
   525					return int(iShortSrc)
   526				}
   527				break
   528			}
   529			info = rb.f.info(rb.src, sp)
   530			if info.size == 0 {
   531				if !atEOF {
   532					return int(iShortSrc)
   533				}
   534				break
   535			}
   536			if s := rb.ss.next(info); s == ssStarter {
   537				break
   538			} else if s == ssOverflow {
   539				rb.insertCGJ()
   540				break
   541			}
   542			if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
   543				return int(err)
   544			}
   545		}
   546	end:
   547		if !rb.doFlush() {
   548			return int(iShortDst)
   549		}
   550		return sp
   551	}
   552	
   553	// lastRuneStart returns the runeInfo and position of the last
   554	// rune in buf or the zero runeInfo and -1 if no rune was found.
   555	func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
   556		p := len(buf) - 1
   557		for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
   558		}
   559		if p < 0 {
   560			return Properties{}, -1
   561		}
   562		return fd.info(inputBytes(buf), p), p
   563	}
   564	
   565	// decomposeToLastBoundary finds an open segment at the end of the buffer
   566	// and scans it into rb. Returns the buffer minus the last segment.
   567	func decomposeToLastBoundary(rb *reorderBuffer) {
   568		fd := &rb.f
   569		info, i := lastRuneStart(fd, rb.out)
   570		if int(info.size) != len(rb.out)-i {
   571			// illegal trailing continuation bytes
   572			return
   573		}
   574		if info.BoundaryAfter() {
   575			return
   576		}
   577		var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
   578		padd := 0
   579		ss := streamSafe(0)
   580		p := len(rb.out)
   581		for {
   582			add[padd] = info
   583			v := ss.backwards(info)
   584			if v == ssOverflow {
   585				// Note that if we have an overflow, it the string we are appending to
   586				// is not correctly normalized. In this case the behavior is undefined.
   587				break
   588			}
   589			padd++
   590			p -= int(info.size)
   591			if v == ssStarter || p < 0 {
   592				break
   593			}
   594			info, i = lastRuneStart(fd, rb.out[:p])
   595			if int(info.size) != p-i {
   596				break
   597			}
   598		}
   599		rb.ss = ss
   600		// Copy bytes for insertion as we may need to overwrite rb.out.
   601		var buf [maxBufferSize * utf8.UTFMax]byte
   602		cp := buf[:copy(buf[:], rb.out[p:])]
   603		rb.out = rb.out[:p]
   604		for padd--; padd >= 0; padd-- {
   605			info = add[padd]
   606			rb.insertUnsafe(inputBytes(cp), 0, info)
   607			cp = cp[info.size:]
   608		}
   609	}
   610	

View as plain text