...

Source file src/pkg/vendor/golang.org/x/text/unicode/norm/forminfo.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package norm
     6	
     7	import "encoding/binary"
     8	
     9	// This file contains Form-specific logic and wrappers for data in tables.go.
    10	
    11	// Rune info is stored in a separate trie per composing form. A composing form
    12	// and its corresponding decomposing form share the same trie.  Each trie maps
    13	// a rune to a uint16. The values take two forms.  For v >= 0x8000:
    14	//   bits
    15	//   15:    1 (inverse of NFD_QC bit of qcInfo)
    16	//   13..7: qcInfo (see below). isYesD is always true (no decomposition).
    17	//    6..0: ccc (compressed CCC value).
    18	// For v < 0x8000, the respective rune has a decomposition and v is an index
    19	// into a byte array of UTF-8 decomposition sequences and additional info and
    20	// has the form:
    21	//    <header> <decomp_byte>* [<tccc> [<lccc>]]
    22	// The header contains the number of bytes in the decomposition (excluding this
    23	// length byte). The two most significant bits of this length byte correspond
    24	// to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
    25	// The byte sequence is followed by a trailing and leading CCC if the values
    26	// for these are not zero.  The value of v determines which ccc are appended
    27	// to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
    28	// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
    29	// there is an additional leading ccc. The value of tccc itself is the
    30	// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
    31	// are the number of trailing non-starters.
    32	
// Masks used when unpacking qcInfo values and the header byte that precedes
// each decomposition sequence in decomps.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
    38	
// Properties provides access to normalization properties of a rune.
// Values of this type are built by compInfo from a trie entry and the
// size of the rune's encoding.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // offset of the decomposition header in decomps; 0 means no decomposition
}
    49	
// functions dispatchable per form

// lookupFunc is the signature shared by the per-form lookup functions
// (lookupInfoNFC and lookupInfoNFKC): given an input and an index into it,
// it returns the Properties found at that index.
type lookupFunc func(b input, i int) Properties
    52	
// formInfo holds Form-specific functions and tables.
// One instance per normalization form is stored in formTable.
type formInfo struct {
	form                     Form       // the normalization form this entry describes
	composing, compatibility bool       // form type
	info                     lookupFunc // property lookup used for this form
	nextMain                 iterFunc   // per-form iteration function (iterFunc is declared outside this file)
}
    60	
// formTable lists the formInfo for each normalization form, in the order
// NFC, NFD, NFKC, NFKD. A composing form and its decomposing counterpart
// use the same lookup function (lookupInfoNFC for NFC/NFD, lookupInfoNFKC
// for NFKC/NFKD) and differ only in the composing flag and in nextMain.
// NOTE(review): presumably indexed by Form value; confirm against callers.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
    86	
    87	// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
    88	// unexpected behavior for the user.  For example, in NFD, there is a boundary
    89	// after 'a'.  However, 'a' might combine with modifiers, so from the application's
    90	// perspective it is not a good boundary. We will therefore always use the
    91	// boundaries for the combining variants.
    92	
    93	// BoundaryBefore returns true if this rune starts a new segment and
    94	// cannot combine with any rune on the left.
    95	func (p Properties) BoundaryBefore() bool {
    96		if p.ccc == 0 && !p.combinesBackward() {
    97			return true
    98		}
    99		// We assume that the CCC of the first character in a decomposition
   100		// is always non-zero if different from info.ccc and that we can return
   101		// false at this point. This is verified by maketables.
   102		return false
   103	}
   104	
   105	// BoundaryAfter returns true if runes cannot combine with or otherwise
   106	// interact with this or previous runes.
   107	func (p Properties) BoundaryAfter() bool {
   108		// TODO: loosen these conditions.
   109		return p.isInert()
   110	}
   111	
// We pack quick check data in 6 bits:
//   5:    Combines forward  (0 == false, 1 == true)
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization (isInert additionally requires a zero ccc).
type qcInfo uint8
   121	
   122	func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
   123	func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
   124	
   125	func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
   126	func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
   127	func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
   128	
   129	func (p Properties) isInert() bool {
   130		return p.flags&qcInfoMask == 0 && p.ccc == 0
   131	}
   132	
   133	func (p Properties) multiSegment() bool {
   134		return p.index >= firstMulti && p.index < endMulti
   135	}
   136	
   137	func (p Properties) nLeadingNonStarters() uint8 {
   138		return p.nLead
   139	}
   140	
   141	func (p Properties) nTrailingNonStarters() uint8 {
   142		return uint8(p.flags & 0x03)
   143	}
   144	
   145	// Decomposition returns the decomposition for the underlying rune
   146	// or nil if there is none.
   147	func (p Properties) Decomposition() []byte {
   148		// TODO: create the decomposition for Hangul?
   149		if p.index == 0 {
   150			return nil
   151		}
   152		i := p.index
   153		n := decomps[i] & headerLenMask
   154		i++
   155		return decomps[i : i+uint16(n)]
   156	}
   157	
   158	// Size returns the length of UTF-8 encoding of the rune.
   159	func (p Properties) Size() int {
   160		return int(p.size)
   161	}
   162	
   163	// CCC returns the canonical combining class of the underlying rune.
   164	func (p Properties) CCC() uint8 {
   165		if p.index >= firstCCCZeroExcept {
   166			return 0
   167		}
   168		return ccc[p.ccc]
   169	}
   170	
   171	// LeadCCC returns the CCC of the first rune in the decomposition.
   172	// If there is no decomposition, LeadCCC equals CCC.
   173	func (p Properties) LeadCCC() uint8 {
   174		return ccc[p.ccc]
   175	}
   176	
   177	// TrailCCC returns the CCC of the last rune in the decomposition.
   178	// If there is no decomposition, TrailCCC equals CCC.
   179	func (p Properties) TrailCCC() uint8 {
   180		return ccc[p.tccc]
   181	}
   182	
   183	func buildRecompMap() {
   184		recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
   185		var buf [8]byte
   186		for i := 0; i < len(recompMapPacked); i += 8 {
   187			copy(buf[:], recompMapPacked[i:i+8])
   188			key := binary.BigEndian.Uint32(buf[:4])
   189			val := binary.BigEndian.Uint32(buf[4:])
   190			recompMap[key] = rune(val)
   191		}
   192	}
   193	
   194	// Recomposition
   195	// We use 32-bit keys instead of 64-bit for the two codepoint keys.
   196	// This clips off the bits of three entries, but we know this will not
   197	// result in a collision. In the unlikely event that changes to
   198	// UnicodeData.txt introduce collisions, the compiler will catch it.
   199	// Note that the recomposition map for NFC and NFKC are identical.
   200	
   201	// combine returns the combined rune or 0 if it doesn't exist.
   202	//
   203	// The caller is responsible for calling
   204	// recompMapOnce.Do(buildRecompMap) sometime before this is called.
   205	func combine(a, b rune) rune {
   206		key := uint32(uint16(a))<<16 + uint32(uint16(b))
   207		if recompMap == nil {
   208			panic("caller error") // see func comment
   209		}
   210		return recompMap[key]
   211	}
   212	
   213	func lookupInfoNFC(b input, i int) Properties {
   214		v, sz := b.charinfoNFC(i)
   215		return compInfo(v, sz)
   216	}
   217	
   218	func lookupInfoNFKC(b input, i int) Properties {
   219		v, sz := b.charinfoNFKC(i)
   220		return compInfo(v, sz)
   221	}
   222	
   223	// Properties returns properties for the first rune in s.
   224	func (f Form) Properties(s []byte) Properties {
   225		if f == NFC || f == NFD {
   226			return compInfo(nfcData.lookup(s))
   227		}
   228		return compInfo(nfkcData.lookup(s))
   229	}
   230	
   231	// PropertiesString returns properties for the first rune in s.
   232	func (f Form) PropertiesString(s string) Properties {
   233		if f == NFC || f == NFD {
   234			return compInfo(nfcData.lookupString(s))
   235		}
   236		return compInfo(nfkcData.lookupString(s))
   237	}
   238	
   239	// compInfo converts the information contained in v and sz
   240	// to a Properties.  See the comment at the top of the file
   241	// for more information on the format.
   242	func compInfo(v uint16, sz int) Properties {
   243		if v == 0 {
   244			return Properties{size: uint8(sz)}
   245		} else if v >= 0x8000 {
   246			p := Properties{
   247				size:  uint8(sz),
   248				ccc:   uint8(v),
   249				tccc:  uint8(v),
   250				flags: qcInfo(v >> 8),
   251			}
   252			if p.ccc > 0 || p.combinesBackward() {
   253				p.nLead = uint8(p.flags & 0x3)
   254			}
   255			return p
   256		}
   257		// has decomposition
   258		h := decomps[v]
   259		f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
   260		p := Properties{size: uint8(sz), flags: f, index: v}
   261		if v >= firstCCC {
   262			v += uint16(h&headerLenMask) + 1
   263			c := decomps[v]
   264			p.tccc = c >> 2
   265			p.flags |= qcInfo(c & 0x3)
   266			if v >= firstLeadingCCC {
   267				p.nLead = c & 0x3
   268				if v >= firstStarterWithNLead {
   269					// We were tricked. Remove the decomposition.
   270					p.flags &= 0x03
   271					p.index = 0
   272					return p
   273				}
   274				p.ccc = decomps[v+1]
   275			}
   276		}
   277		return p
   278	}
   279	

View as plain text