...

Source file src/unicode/letter.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package unicode provides data and functions to test some properties of
     6	// Unicode code points.
     7	package unicode
     8	
     9	const (
    10		MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
    11		ReplacementChar = '\uFFFD'     // Represents invalid code points.
    12		MaxASCII        = '\u007F'     // maximum ASCII value.
    13		MaxLatin1       = '\u00FF'     // maximum Latin-1 value.
    14	)
    15	
    16	// RangeTable defines a set of Unicode code points by listing the ranges of
    17	// code points within the set. The ranges are listed in two slices
    18	// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
    19	// The two slices must be in sorted order and non-overlapping.
    20	// Also, R32 should contain only values >= 0x10000 (1<<16).
    21	type RangeTable struct {
    22		R16         []Range16
    23		R32         []Range32
    24		LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
    25	}
    26	
    27	// Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi
    28	// inclusive and has the specified stride.
    29	type Range16 struct {
    30		Lo     uint16
    31		Hi     uint16
    32		Stride uint16
    33	}
    34	
    35	// Range32 represents of a range of Unicode code points and is used when one or
    36	// more of the values will not fit in 16 bits. The range runs from Lo to Hi
    37	// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
    38	type Range32 struct {
    39		Lo     uint32
    40		Hi     uint32
    41		Stride uint32
    42	}
    43	
    44	// CaseRange represents a range of Unicode code points for simple (one
    45	// code point to one code point) case conversion.
    46	// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
    47	// are the number to add to the code point to reach the code point for a
    48	// different case for that character. They may be negative. If zero, it
    49	// means the character is in the corresponding case. There is a special
    50	// case representing sequences of alternating corresponding Upper and Lower
    51	// pairs. It appears with a fixed Delta of
    52	//	{UpperLower, UpperLower, UpperLower}
    53	// The constant UpperLower has an otherwise impossible delta value.
    54	type CaseRange struct {
    55		Lo    uint32
    56		Hi    uint32
    57		Delta d
    58	}
    59	
    60	// SpecialCase represents language-specific case mappings such as Turkish.
    61	// Methods of SpecialCase customize (by overriding) the standard mappings.
    62	type SpecialCase []CaseRange
    63	
    64	// BUG(r): There is no mechanism for full case folding, that is, for
    65	// characters that involve multiple runes in the input or output.
    66	
    67	// Indices into the Delta arrays inside CaseRanges for case mapping.
    68	const (
    69		UpperCase = iota
    70		LowerCase
    71		TitleCase
    72		MaxCase
    73	)
    74	
    75	type d [MaxCase]rune // to make the CaseRanges text shorter
    76	
    77	// If the Delta field of a CaseRange is UpperLower, it means
    78	// this CaseRange represents a sequence of the form (say)
    79	// Upper Lower Upper Lower.
    80	const (
    81		UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
    82	)
    83	
    84	// linearMax is the maximum size table for linear search for non-Latin1 rune.
    85	// Derived by running 'go test -calibrate'.
    86	const linearMax = 18
    87	
    88	// is16 reports whether r is in the sorted slice of 16-bit ranges.
    89	func is16(ranges []Range16, r uint16) bool {
    90		if len(ranges) <= linearMax || r <= MaxLatin1 {
    91			for i := range ranges {
    92				range_ := &ranges[i]
    93				if r < range_.Lo {
    94					return false
    95				}
    96				if r <= range_.Hi {
    97					return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
    98				}
    99			}
   100			return false
   101		}
   102	
   103		// binary search over ranges
   104		lo := 0
   105		hi := len(ranges)
   106		for lo < hi {
   107			m := lo + (hi-lo)/2
   108			range_ := &ranges[m]
   109			if range_.Lo <= r && r <= range_.Hi {
   110				return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
   111			}
   112			if r < range_.Lo {
   113				hi = m
   114			} else {
   115				lo = m + 1
   116			}
   117		}
   118		return false
   119	}
   120	
   121	// is32 reports whether r is in the sorted slice of 32-bit ranges.
   122	func is32(ranges []Range32, r uint32) bool {
   123		if len(ranges) <= linearMax {
   124			for i := range ranges {
   125				range_ := &ranges[i]
   126				if r < range_.Lo {
   127					return false
   128				}
   129				if r <= range_.Hi {
   130					return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
   131				}
   132			}
   133			return false
   134		}
   135	
   136		// binary search over ranges
   137		lo := 0
   138		hi := len(ranges)
   139		for lo < hi {
   140			m := lo + (hi-lo)/2
   141			range_ := ranges[m]
   142			if range_.Lo <= r && r <= range_.Hi {
   143				return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
   144			}
   145			if r < range_.Lo {
   146				hi = m
   147			} else {
   148				lo = m + 1
   149			}
   150		}
   151		return false
   152	}
   153	
   154	// Is reports whether the rune is in the specified table of ranges.
   155	func Is(rangeTab *RangeTable, r rune) bool {
   156		r16 := rangeTab.R16
   157		if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
   158			return is16(r16, uint16(r))
   159		}
   160		r32 := rangeTab.R32
   161		if len(r32) > 0 && r >= rune(r32[0].Lo) {
   162			return is32(r32, uint32(r))
   163		}
   164		return false
   165	}
   166	
   167	func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
   168		r16 := rangeTab.R16
   169		if off := rangeTab.LatinOffset; len(r16) > off && r <= rune(r16[len(r16)-1].Hi) {
   170			return is16(r16[off:], uint16(r))
   171		}
   172		r32 := rangeTab.R32
   173		if len(r32) > 0 && r >= rune(r32[0].Lo) {
   174			return is32(r32, uint32(r))
   175		}
   176		return false
   177	}
   178	
   179	// IsUpper reports whether the rune is an upper case letter.
   180	func IsUpper(r rune) bool {
   181		// See comment in IsGraphic.
   182		if uint32(r) <= MaxLatin1 {
   183			return properties[uint8(r)]&pLmask == pLu
   184		}
   185		return isExcludingLatin(Upper, r)
   186	}
   187	
   188	// IsLower reports whether the rune is a lower case letter.
   189	func IsLower(r rune) bool {
   190		// See comment in IsGraphic.
   191		if uint32(r) <= MaxLatin1 {
   192			return properties[uint8(r)]&pLmask == pLl
   193		}
   194		return isExcludingLatin(Lower, r)
   195	}
   196	
   197	// IsTitle reports whether the rune is a title case letter.
   198	func IsTitle(r rune) bool {
   199		if r <= MaxLatin1 {
   200			return false
   201		}
   202		return isExcludingLatin(Title, r)
   203	}
   204	
   205	// to maps the rune using the specified case mapping.
   206	// It additionally reports whether caseRange contained a mapping for r.
   207	func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) {
   208		if _case < 0 || MaxCase <= _case {
   209			return ReplacementChar, false // as reasonable an error as any
   210		}
   211		// binary search over ranges
   212		lo := 0
   213		hi := len(caseRange)
   214		for lo < hi {
   215			m := lo + (hi-lo)/2
   216			cr := caseRange[m]
   217			if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
   218				delta := cr.Delta[_case]
   219				if delta > MaxRune {
   220					// In an Upper-Lower sequence, which always starts with
   221					// an UpperCase letter, the real deltas always look like:
   222					//	{0, 1, 0}    UpperCase (Lower is next)
   223					//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
   224					// The characters at even offsets from the beginning of the
   225					// sequence are upper case; the ones at odd offsets are lower.
   226					// The correct mapping can be done by clearing or setting the low
   227					// bit in the sequence offset.
   228					// The constants UpperCase and TitleCase are even while LowerCase
   229					// is odd so we take the low bit from _case.
   230					return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true
   231				}
   232				return r + delta, true
   233			}
   234			if r < rune(cr.Lo) {
   235				hi = m
   236			} else {
   237				lo = m + 1
   238			}
   239		}
   240		return r, false
   241	}
   242	
   243	// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
   244	func To(_case int, r rune) rune {
   245		r, _ = to(_case, r, CaseRanges)
   246		return r
   247	}
   248	
   249	// ToUpper maps the rune to upper case.
   250	func ToUpper(r rune) rune {
   251		if r <= MaxASCII {
   252			if 'a' <= r && r <= 'z' {
   253				r -= 'a' - 'A'
   254			}
   255			return r
   256		}
   257		return To(UpperCase, r)
   258	}
   259	
   260	// ToLower maps the rune to lower case.
   261	func ToLower(r rune) rune {
   262		if r <= MaxASCII {
   263			if 'A' <= r && r <= 'Z' {
   264				r += 'a' - 'A'
   265			}
   266			return r
   267		}
   268		return To(LowerCase, r)
   269	}
   270	
   271	// ToTitle maps the rune to title case.
   272	func ToTitle(r rune) rune {
   273		if r <= MaxASCII {
   274			if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
   275				r -= 'a' - 'A'
   276			}
   277			return r
   278		}
   279		return To(TitleCase, r)
   280	}
   281	
   282	// ToUpper maps the rune to upper case giving priority to the special mapping.
   283	func (special SpecialCase) ToUpper(r rune) rune {
   284		r1, hadMapping := to(UpperCase, r, []CaseRange(special))
   285		if r1 == r && !hadMapping {
   286			r1 = ToUpper(r)
   287		}
   288		return r1
   289	}
   290	
   291	// ToTitle maps the rune to title case giving priority to the special mapping.
   292	func (special SpecialCase) ToTitle(r rune) rune {
   293		r1, hadMapping := to(TitleCase, r, []CaseRange(special))
   294		if r1 == r && !hadMapping {
   295			r1 = ToTitle(r)
   296		}
   297		return r1
   298	}
   299	
   300	// ToLower maps the rune to lower case giving priority to the special mapping.
   301	func (special SpecialCase) ToLower(r rune) rune {
   302		r1, hadMapping := to(LowerCase, r, []CaseRange(special))
   303		if r1 == r && !hadMapping {
   304			r1 = ToLower(r)
   305		}
   306		return r1
   307	}
   308	
   309	// caseOrbit is defined in tables.go as []foldPair. Right now all the
   310	// entries fit in uint16, so use uint16. If that changes, compilation
   311	// will fail (the constants in the composite literal will not fit in uint16)
   312	// and the types here can change to uint32.
   313	type foldPair struct {
   314		From uint16
   315		To   uint16
   316	}
   317	
   318	// SimpleFold iterates over Unicode code points equivalent under
   319	// the Unicode-defined simple case folding. Among the code points
   320	// equivalent to rune (including rune itself), SimpleFold returns the
   321	// smallest rune > r if one exists, or else the smallest rune >= 0.
   322	// If r is not a valid Unicode code point, SimpleFold(r) returns r.
   323	//
   324	// For example:
   325	//	SimpleFold('A') = 'a'
   326	//	SimpleFold('a') = 'A'
   327	//
   328	//	SimpleFold('K') = 'k'
   329	//	SimpleFold('k') = '\u212A' (Kelvin symbol, K)
   330	//	SimpleFold('\u212A') = 'K'
   331	//
   332	//	SimpleFold('1') = '1'
   333	//
   334	//	SimpleFold(-2) = -2
   335	//
   336	func SimpleFold(r rune) rune {
   337		if r < 0 || r > MaxRune {
   338			return r
   339		}
   340	
   341		if int(r) < len(asciiFold) {
   342			return rune(asciiFold[r])
   343		}
   344	
   345		// Consult caseOrbit table for special cases.
   346		lo := 0
   347		hi := len(caseOrbit)
   348		for lo < hi {
   349			m := lo + (hi-lo)/2
   350			if rune(caseOrbit[m].From) < r {
   351				lo = m + 1
   352			} else {
   353				hi = m
   354			}
   355		}
   356		if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
   357			return rune(caseOrbit[lo].To)
   358		}
   359	
   360		// No folding specified. This is a one- or two-element
   361		// equivalence class containing rune and ToLower(rune)
   362		// and ToUpper(rune) if they are different from rune.
   363		if l := ToLower(r); l != r {
   364			return l
   365		}
   366		return ToUpper(r)
   367	}
   368	

View as plain text