...

Source file src/pkg/vendor/golang.org/x/text/unicode/norm/iter.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package norm
     6	
     7	import (
     8		"fmt"
     9		"unicode/utf8"
    10	)
    11	
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It equals the size of the scratch buffer embedded in every Iter.
const MaxSegmentSize = maxByteBufferSize
    15	
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be set up with Init or InitString before Next is called;
// Next dispatches through a function value that those methods install.
type Iter struct {
	// rb holds the input source (rb.src), its length (rb.nsrc), the form
	// (rb.f) and the stream-safe state (rb.ss); it is also used to reorder
	// and compose runes when a segment cannot be copied verbatim.
	rb     reorderBuffer
	// buf is scratch space for segments that are not returned as a direct
	// slice of the input.
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	// asciiF is the single-byte fast path matching the input type:
	// nextASCIIBytes after Init, nextASCIIString after InitString.
	asciiF iterFunc

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
    28	
// iterFunc is one state of the iterator: it produces the next segment and
// may replace i.next to select the state used by the following call.
type iterFunc func(*Iter) []byte
    30	
    31	// Init initializes i to iterate over src after normalizing it to Form f.
    32	func (i *Iter) Init(f Form, src []byte) {
    33		i.p = 0
    34		if len(src) == 0 {
    35			i.setDone()
    36			i.rb.nsrc = 0
    37			return
    38		}
    39		i.multiSeg = nil
    40		i.rb.init(f, src)
    41		i.next = i.rb.f.nextMain
    42		i.asciiF = nextASCIIBytes
    43		i.info = i.rb.f.info(i.rb.src, i.p)
    44		i.rb.ss.first(i.info)
    45	}
    46	
    47	// InitString initializes i to iterate over src after normalizing it to Form f.
    48	func (i *Iter) InitString(f Form, src string) {
    49		i.p = 0
    50		if len(src) == 0 {
    51			i.setDone()
    52			i.rb.nsrc = 0
    53			return
    54		}
    55		i.multiSeg = nil
    56		i.rb.initString(f, src)
    57		i.next = i.rb.f.nextMain
    58		i.asciiF = nextASCIIString
    59		i.info = i.rb.f.info(i.rb.src, i.p)
    60		i.rb.ss.first(i.info)
    61	}
    62	
    63	// Seek sets the segment to be returned by the next call to Next to start
    64	// at position p.  It is the responsibility of the caller to set p to the
    65	// start of a segment.
    66	func (i *Iter) Seek(offset int64, whence int) (int64, error) {
    67		var abs int64
    68		switch whence {
    69		case 0:
    70			abs = offset
    71		case 1:
    72			abs = int64(i.p) + offset
    73		case 2:
    74			abs = int64(i.rb.nsrc) + offset
    75		default:
    76			return 0, fmt.Errorf("norm: invalid whence")
    77		}
    78		if abs < 0 {
    79			return 0, fmt.Errorf("norm: negative position")
    80		}
    81		if int(abs) >= i.rb.nsrc {
    82			i.setDone()
    83			return int64(i.p), nil
    84		}
    85		i.p = int(abs)
    86		i.multiSeg = nil
    87		i.next = i.rb.f.nextMain
    88		i.info = i.rb.f.info(i.rb.src, i.p)
    89		i.rb.ss.first(i.info)
    90		return abs, nil
    91	}
    92	
    93	// returnSlice returns a slice of the underlying input type as a byte slice.
    94	// If the underlying is of type []byte, it will simply return a slice.
    95	// If the underlying is of type string, it will copy the slice to the buffer
    96	// and return that.
    97	func (i *Iter) returnSlice(a, b int) []byte {
    98		if i.rb.src.bytes == nil {
    99			return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
   100		}
   101		return i.rb.src.bytes[a:b]
   102	}
   103	
// Pos returns the byte position at which the next call to Next will commence processing.
// Once the iterator is done, this is at or beyond the end of the input.
func (i *Iter) Pos() int {
	return i.p
}
   108	
   109	func (i *Iter) setDone() {
   110		i.next = nextDone
   111		i.p = i.rb.nsrc
   112	}
   113	
   114	// Done returns true if there is no more input to process.
   115	func (i *Iter) Done() bool {
   116		return i.p >= i.rb.nsrc
   117	}
   118	
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
//
// Next dispatches to the current state function, which Init, InitString and
// Seek install; the state functions in this file return slices of either
// the input or the iterator's internal buffer.
func (i *Iter) Next() []byte {
	return i.next(i)
}
   127	
   128	func nextASCIIBytes(i *Iter) []byte {
   129		p := i.p + 1
   130		if p >= i.rb.nsrc {
   131			p0 := i.p
   132			i.setDone()
   133			return i.rb.src.bytes[p0:p]
   134		}
   135		if i.rb.src.bytes[p] < utf8.RuneSelf {
   136			p0 := i.p
   137			i.p = p
   138			return i.rb.src.bytes[p0:p]
   139		}
   140		i.info = i.rb.f.info(i.rb.src, i.p)
   141		i.next = i.rb.f.nextMain
   142		return i.next(i)
   143	}
   144	
   145	func nextASCIIString(i *Iter) []byte {
   146		p := i.p + 1
   147		if p >= i.rb.nsrc {
   148			i.buf[0] = i.rb.src.str[i.p]
   149			i.setDone()
   150			return i.buf[:1]
   151		}
   152		if i.rb.src.str[p] < utf8.RuneSelf {
   153			i.buf[0] = i.rb.src.str[i.p]
   154			i.p = p
   155			return i.buf[:1]
   156		}
   157		i.info = i.rb.f.info(i.rb.src, i.p)
   158		i.next = i.rb.f.nextMain
   159		return i.next(i)
   160	}
   161	
   162	func nextHangul(i *Iter) []byte {
   163		p := i.p
   164		next := p + hangulUTF8Size
   165		if next >= i.rb.nsrc {
   166			i.setDone()
   167		} else if i.rb.src.hangul(next) == 0 {
   168			i.rb.ss.next(i.info)
   169			i.info = i.rb.f.info(i.rb.src, i.p)
   170			i.next = i.rb.f.nextMain
   171			return i.next(i)
   172		}
   173		i.p = next
   174		return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
   175	}
   176	
// nextDone is the terminal state installed by setDone: it always returns nil
// and leaves the iterator unchanged.
func nextDone(i *Iter) []byte {
	return nil
}
   180	
   181	// nextMulti is used for iterating over multi-segment decompositions
   182	// for decomposing normal forms.
   183	func nextMulti(i *Iter) []byte {
   184		j := 0
   185		d := i.multiSeg
   186		// skip first rune
   187		for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
   188		}
   189		for j < len(d) {
   190			info := i.rb.f.info(input{bytes: d}, j)
   191			if info.BoundaryBefore() {
   192				i.multiSeg = d[j:]
   193				return d[:j]
   194			}
   195			j += int(info.size)
   196		}
   197		// treat last segment as normal decomposition
   198		i.next = i.rb.f.nextMain
   199		return i.next(i)
   200	}
   201	
// nextMultiNorm is used for iterating over multi-segment decompositions
// for composing normal forms.
//
// It feeds the runes of i.multiSeg into the reorder buffer until it reaches
// a rune that has a boundary before it. At that point the buffered runes
// are composed and flushed into i.buf as the segment to return, and the
// boundary rune is inserted as the start of the next segment. When the
// decomposition is exhausted, iteration continues in the source through
// doNormComposed.
func nextMultiNorm(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			// Emit what has been collected so far, then seed the buffer with
			// the rune that begins the following segment.
			i.rb.compose()
			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
			i.rb.insertUnsafe(input{bytes: d}, j, info)
			i.multiSeg = d[j+int(info.size):]
			return seg
		}
		i.rb.insertUnsafe(input{bytes: d}, j, info)
		j += int(info.size)
	}
	// The decomposition is used up; continue with the runes that follow in
	// the input.
	i.multiSeg = nil
	i.next = nextComposed
	return doNormComposed(i)
}
   223	
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It scans runes starting at i.p until it reaches the start of the next
// segment. Output is only materialised in i.buf when needed: while runes
// can be passed through unchanged, the function merely tracks the pending
// input range and, if nothing was ever written to the buffer, returns a
// slice of the input. If combining classes turn out to be out of order, it
// falls back to the reorder buffer via doNormDecomposed.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the length of the output produced so far.
	outp := 0
	// Input bytes [inCopyStart, i.p) still have to be copied to
	// i.buf[outCopyStart:] before the buffer can be returned.
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is ASCII too: switch to the single-byte fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// The segment ends here. Return d directly when nothing
				// precedes it in the buffer, avoiding a copy.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			// The segment continues: append d and restart the pending input
			// range after the rune just consumed.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul syllable follows: use the dedicated state.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: the rune passes through unchanged, so only
			// the bookkeeping advances; stop if it would not fit in i.buf.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		// A combining class lower than the preceding one means the runes are
		// out of order and must go through the reorder buffer.
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf: the segment is a verbatim range of
		// the input.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
   340	
   341	func doNormDecomposed(i *Iter) []byte {
   342		for {
   343			i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   344			if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   345				i.setDone()
   346				break
   347			}
   348			i.info = i.rb.f.info(i.rb.src, i.p)
   349			if i.info.ccc == 0 {
   350				break
   351			}
   352			if s := i.rb.ss.next(i.info); s == ssOverflow {
   353				i.next = nextCGJDecompose
   354				break
   355			}
   356		}
   357		// new segment or too many combining characters: exit normalization
   358		return i.buf[:i.rb.flushCopy(i.buf[:])]
   359	}
   360	
   361	func nextCGJDecompose(i *Iter) []byte {
   362		i.rb.ss = 0
   363		i.rb.insertCGJ()
   364		i.next = nextDecomposed
   365		i.rb.ss.first(i.info)
   366		buf := doNormDecomposed(i)
   367		return buf
   368	}
   369	
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// While every rune reports isYesC and combining classes do not decrease,
// the segment is a verbatim range of the input and is returned through
// returnSlice. Otherwise the scan restarts from the beginning of the
// segment and goes through the reorder buffer.
func nextComposed(i *Iter) []byte {
	// outp counts the bytes accepted so far; startp remembers where the
	// segment began so the scan can be restarted.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			// The segment would not fit in i.buf: end it here.
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// Next byte is ASCII: end the segment and switch to the
			// single-byte fast path.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		// A combining class lower than the preceding one means the runes are
		// out of order and must go through the reorder buffer.
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Seed the reorder buffer with the first rune of the decomposition
		// and let nextMultiNorm walk the remainder.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	// NOTE(review): ss.first was already called with the same i.info just
	// above; this second call looks redundant. Confirm before removing.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
   426	
   427	func doNormComposed(i *Iter) []byte {
   428		// First rune should already be inserted.
   429		for {
   430			if i.p += int(i.info.size); i.p >= i.rb.nsrc {
   431				i.setDone()
   432				break
   433			}
   434			i.info = i.rb.f.info(i.rb.src, i.p)
   435			if s := i.rb.ss.next(i.info); s == ssStarter {
   436				break
   437			} else if s == ssOverflow {
   438				i.next = nextCGJCompose
   439				break
   440			}
   441			i.rb.insertUnsafe(i.rb.src, i.p, i.info)
   442		}
   443		i.rb.compose()
   444		seg := i.buf[:i.rb.flushCopy(i.buf[:])]
   445		return seg
   446	}
   447	
// nextCGJCompose is the state entered after a stream-safe overflow while
// composing. It resets the stream-safe state, inserts a CGJ into the
// reorder buffer, restores nextComposed as the following state, inserts the
// rune described by i.info and continues through doNormComposed.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and U+FF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
   459	

View as plain text