...

Source file src/pkg/vendor/golang.org/x/text/secure/bidirule/bidirule.go

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bidirule implements the Bidi Rule defined by RFC 5893.
     6	//
     7	// This package is under development. The API may change without notice and
     8	// without preserving backward compatibility.
     9	package bidirule
    10	
    11	import (
    12		"errors"
    13		"unicode/utf8"
    14	
    15		"golang.org/x/text/transform"
    16		"golang.org/x/text/unicode/bidi"
    17	)
    18	
    19	// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
    20	// Internationalized Domain Names for Applications (IDNA)
    21	//
    22	// A label is an individual component of a domain name.  Labels are usually
    23	// shown separated by dots; for example, the domain name "www.example.com" is
    24	// composed of three labels: "www", "example", and "com".
    25	//
    26	// An RTL label is a label that contains at least one character of class R, AL,
    27	// or AN. An LTR label is any label that is not an RTL label.
    28	//
    29	// A "Bidi domain name" is a domain name that contains at least one RTL label.
    30	//
    31	//  The following guarantees can be made based on the above:
    32	//
    33	//  o  In a domain name consisting of only labels that satisfy the rule,
    34	//     the requirements of Section 3 are satisfied.  Note that even LTR
    35	//     labels and pure ASCII labels have to be tested.
    36	//
    37	//  o  In a domain name consisting of only LDH labels (as defined in the
    38	//     Definitions document [RFC5890]) and labels that satisfy the rule,
    39	//     the requirements of Section 3 are satisfied as long as a label
    40	//     that starts with an ASCII digit does not come after a
    41	//     right-to-left label.
    42	//
    43	//  No guarantee is given for other combinations.
    44	
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
// It is the error Span reports (and Transform passes through) when the input
// does not conform to the rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
    47	
// ruleState identifies a state of the Bidi Rule state machine. The machine is
// driven by advance and advanceString using the transitions table.
type ruleState uint8

const (
	// ruleInitial is the zero value: no character has been processed yet.
	ruleInitial ruleState = iota
	// ruleLTR is an LTR label whose most recent character was one of the
	// classes that the transitions table routes away from ruleLTRFinal.
	ruleLTR
	// ruleLTRFinal is an LTR label whose most recent characters satisfy the
	// end-of-label condition [2.6] as encoded in the transitions table.
	ruleLTRFinal
	// ruleRTL is an RTL label whose most recent character was one of the
	// classes that the transitions table routes away from ruleRTLFinal.
	ruleRTL
	// ruleRTLFinal is an RTL label whose most recent characters satisfy the
	// end-of-label condition [2.3] as encoded in the transitions table.
	ruleRTLFinal
	// ruleInvalid is entered when no transition matches the current
	// character, or when both EN and AN have been seen.
	ruleInvalid
)
    58	
// ruleTransition is one candidate edge of the state machine: if the class bit
// of the current character (1 << class) is set in mask, the machine moves to
// next.
type ruleTransition struct {
	next ruleState // state entered when the transition is taken
	mask uint16    // set of Bidi classes, one bit per class, that trigger it
}
    63	
// transitions is indexed by the current ruleState and holds two candidate
// transitions per state. advance and advanceString try them in order and take
// the first whose mask contains the class bit of the current character; if
// neither matches, the state becomes ruleInvalid.
var transitions = [...][2]ruleTransition{
	// [2.1] The first character must be a character with Bidi property L, R, or
	// AL. If it has the R or AL property, it is an RTL label; if it has the L
	// property, it is an LTR label.
	ruleInitial: {
		{ruleLTRFinal, 1 << bidi.L},
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
	},
	ruleRTL: {
		// [2.3] In an RTL label, the end of the label must be a character with
		// Bidi property R, AL, EN, or AN, followed by zero or more characters
		// with Bidi property NSM.
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},

		// [2.2] In an RTL label, only characters with the Bidi properties R,
		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.3]
		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
	},
	ruleRTLFinal: {
		// [2.3] In an RTL label, the end of the label must be a character with
		// Bidi property R, AL, EN, or AN, followed by zero or more characters
		// with Bidi property NSM.
		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},

		// [2.2] In an RTL label, only characters with the Bidi properties R,
		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.3] and NSM.
		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
	},
	ruleLTR: {
		// [2.6] In an LTR label, the end of the label must be a character with
		// Bidi property L or EN, followed by zero or more characters with Bidi
		// property NSM.
		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},

		// [2.5] In an LTR label, only characters with the Bidi properties L,
		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.6].
		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
	},
	ruleLTRFinal: {
		// [2.6] In an LTR label, the end of the label must be a character with
		// Bidi property L or EN, followed by zero or more characters with Bidi
		// property NSM.
		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},

		// [2.5] In an LTR label, only characters with the Bidi properties L,
		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
		// We exclude the entries from [2.6].
		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
	},
	// Both masks are empty, so ruleInvalid can never be left via this table.
	ruleInvalid: {
		{ruleInvalid, 0},
		{ruleInvalid, 0},
	},
}
   121	
// exclusiveRTL holds the class bits for EN and AN. advance and advanceString
// reject the input as soon as both bits are present in Transformer.seen.
//
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
// vice versa.
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
   125	
   126	// From RFC 5893
   127	// An RTL label is a label that contains at least one character of type
   128	// R, AL, or AN.
   129	//
   130	// An LTR label is any label that is not an RTL label.
   131	
   132	// Direction reports the direction of the given label as defined by RFC 5893.
   133	// The Bidi Rule does not have to be applied to labels of the category
   134	// LeftToRight.
   135	func Direction(b []byte) bidi.Direction {
   136		for i := 0; i < len(b); {
   137			e, sz := bidi.Lookup(b[i:])
   138			if sz == 0 {
   139				i++
   140			}
   141			c := e.Class()
   142			if c == bidi.R || c == bidi.AL || c == bidi.AN {
   143				return bidi.RightToLeft
   144			}
   145			i += sz
   146		}
   147		return bidi.LeftToRight
   148	}
   149	
   150	// DirectionString reports the direction of the given label as defined by RFC
   151	// 5893. The Bidi Rule does not have to be applied to labels of the category
   152	// LeftToRight.
   153	func DirectionString(s string) bidi.Direction {
   154		for i := 0; i < len(s); {
   155			e, sz := bidi.LookupString(s[i:])
   156			if sz == 0 {
   157				i++
   158				continue
   159			}
   160			c := e.Class()
   161			if c == bidi.R || c == bidi.AL || c == bidi.AN {
   162				return bidi.RightToLeft
   163			}
   164			i += sz
   165		}
   166		return bidi.LeftToRight
   167	}
   168	
   169	// Valid reports whether b conforms to the BiDi rule.
   170	func Valid(b []byte) bool {
   171		var t Transformer
   172		if n, ok := t.advance(b); !ok || n < len(b) {
   173			return false
   174		}
   175		return t.isFinal()
   176	}
   177	
   178	// ValidString reports whether s conforms to the BiDi rule.
   179	func ValidString(s string) bool {
   180		var t Transformer
   181		if n, ok := t.advanceString(s); !ok || n < len(s) {
   182			return false
   183		}
   184		return t.isFinal()
   185	}
   186	
   187	// New returns a Transformer that verifies that input adheres to the Bidi Rule.
   188	func New() *Transformer {
   189		return &Transformer{}
   190	}
   191	
// Transformer implements transform.Transformer. It also provides Span. Its
// zero value is ready to use; it carries state between calls and must be
// Reset between uses.
type Transformer struct {
	state  ruleState // current state of the Bidi Rule state machine
	hasRTL bool      // NOTE(review): not referenced by the code visible in this file
	seen   uint16    // union of the class bits (1 << class) of all characters processed
}
   198	
   199	// A rule can only be violated for "Bidi Domain names", meaning if one of the
   200	// following categories has been observed.
   201	func (t *Transformer) isRTL() bool {
   202		const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
   203		return t.seen&isRTL != 0
   204	}
   205	
   206	// Reset implements transform.Transformer.
   207	func (t *Transformer) Reset() { *t = Transformer{} }
   208	
   209	// Transform implements transform.Transformer. This Transformer has state and
   210	// needs to be reset between uses.
   211	func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
   212		if len(dst) < len(src) {
   213			src = src[:len(dst)]
   214			atEOF = false
   215			err = transform.ErrShortDst
   216		}
   217		n, err1 := t.Span(src, atEOF)
   218		copy(dst, src[:n])
   219		if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
   220			err = err1
   221		}
   222		return n, n, err
   223	}
   224	
   225	// Span returns the first n bytes of src that conform to the Bidi rule.
   226	func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
   227		if t.state == ruleInvalid && t.isRTL() {
   228			return 0, ErrInvalid
   229		}
   230		n, ok := t.advance(src)
   231		switch {
   232		case !ok:
   233			err = ErrInvalid
   234		case n < len(src):
   235			if !atEOF {
   236				err = transform.ErrShortSrc
   237				break
   238			}
   239			err = ErrInvalid
   240		case !t.isFinal():
   241			err = ErrInvalid
   242		}
   243		return n, err
   244	}
   245	
   246	// Precomputing the ASCII values decreases running time for the ASCII fast path
   247	// by about 30%.
   248	var asciiTable [128]bidi.Properties
   249	
   250	func init() {
   251		for i := range asciiTable {
   252			p, _ := bidi.LookupRune(rune(i))
   253			asciiTable[i] = p
   254		}
   255	}
   256	
   257	func (t *Transformer) advance(s []byte) (n int, ok bool) {
   258		var e bidi.Properties
   259		var sz int
   260		for n < len(s) {
   261			if s[n] < utf8.RuneSelf {
   262				e, sz = asciiTable[s[n]], 1
   263			} else {
   264				e, sz = bidi.Lookup(s[n:])
   265				if sz <= 1 {
   266					if sz == 1 {
   267						// We always consider invalid UTF-8 to be invalid, even if
   268						// the string has not yet been determined to be RTL.
   269						// TODO: is this correct?
   270						return n, false
   271					}
   272					return n, true // incomplete UTF-8 encoding
   273				}
   274			}
   275			// TODO: using CompactClass would result in noticeable speedup.
   276			// See unicode/bidi/prop.go:Properties.CompactClass.
   277			c := uint16(1 << e.Class())
   278			t.seen |= c
   279			if t.seen&exclusiveRTL == exclusiveRTL {
   280				t.state = ruleInvalid
   281				return n, false
   282			}
   283			switch tr := transitions[t.state]; {
   284			case tr[0].mask&c != 0:
   285				t.state = tr[0].next
   286			case tr[1].mask&c != 0:
   287				t.state = tr[1].next
   288			default:
   289				t.state = ruleInvalid
   290				if t.isRTL() {
   291					return n, false
   292				}
   293			}
   294			n += sz
   295		}
   296		return n, true
   297	}
   298	
   299	func (t *Transformer) advanceString(s string) (n int, ok bool) {
   300		var e bidi.Properties
   301		var sz int
   302		for n < len(s) {
   303			if s[n] < utf8.RuneSelf {
   304				e, sz = asciiTable[s[n]], 1
   305			} else {
   306				e, sz = bidi.LookupString(s[n:])
   307				if sz <= 1 {
   308					if sz == 1 {
   309						return n, false // invalid UTF-8
   310					}
   311					return n, true // incomplete UTF-8 encoding
   312				}
   313			}
   314			// TODO: using CompactClass results in noticeable speedup.
   315			// See unicode/bidi/prop.go:Properties.CompactClass.
   316			c := uint16(1 << e.Class())
   317			t.seen |= c
   318			if t.seen&exclusiveRTL == exclusiveRTL {
   319				t.state = ruleInvalid
   320				return n, false
   321			}
   322			switch tr := transitions[t.state]; {
   323			case tr[0].mask&c != 0:
   324				t.state = tr[0].next
   325			case tr[1].mask&c != 0:
   326				t.state = tr[1].next
   327			default:
   328				t.state = ruleInvalid
   329				if t.isRTL() {
   330					return n, false
   331				}
   332			}
   333			n += sz
   334		}
   335		return n, true
   336	}
   337	

View as plain text