Source file src/pkg/html/template/transition.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package template
     6	
     7	import (
     8		"bytes"
     9		"strings"
    10	)
    11	
// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
//
// The array is indexed by state constant. Several states share one function:
// the function inspects c.state itself to decide which delimiters apply.
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateSrcset:      tURL, // srcset values are scanned with the URL function
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited, // JS strings and regexps share one scanner
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt, // comment scanners are shared by JS and CSS
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr, // CSS strings and url(...) bodies share one scanner
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}
    43	
    44	var commentStart = []byte("<!--")
    45	var commentEnd = []byte("-->")
    46	
    47	// tText is the context transition function for the text state.
    48	func tText(c context, s []byte) (context, int) {
    49		k := 0
    50		for {
    51			i := k + bytes.IndexByte(s[k:], '<')
    52			if i < k || i+1 == len(s) {
    53				return c, len(s)
    54			} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
    55				return context{state: stateHTMLCmt}, i + 4
    56			}
    57			i++
    58			end := false
    59			if s[i] == '/' {
    60				if i+1 == len(s) {
    61					return c, len(s)
    62				}
    63				end, i = true, i+1
    64			}
    65			j, e := eatTagName(s, i)
    66			if j != i {
    67				if end {
    68					e = elementNone
    69				}
    70				// We've found an HTML tag.
    71				return context{state: stateTag, element: e}, j
    72			}
    73			k = j
    74		}
    75	}
    76	
// elementContentType maps an element to the state that tTag enters once the
// closing '>' of that element's tag has been consumed, i.e. the state used to
// scan the element's content.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}
    84	
    85	// tTag is the context transition function for the tag state.
    86	func tTag(c context, s []byte) (context, int) {
    87		// Find the attribute name.
    88		i := eatWhiteSpace(s, 0)
    89		if i == len(s) {
    90			return c, len(s)
    91		}
    92		if s[i] == '>' {
    93			return context{
    94				state:   elementContentType[c.element],
    95				element: c.element,
    96			}, i + 1
    97		}
    98		j, err := eatAttrName(s, i)
    99		if err != nil {
   100			return context{state: stateError, err: err}, len(s)
   101		}
   102		state, attr := stateTag, attrNone
   103		if i == j {
   104			return context{
   105				state: stateError,
   106				err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
   107			}, len(s)
   108		}
   109	
   110		attrName := strings.ToLower(string(s[i:j]))
   111		if c.element == elementScript && attrName == "type" {
   112			attr = attrScriptType
   113		} else {
   114			switch attrType(attrName) {
   115			case contentTypeURL:
   116				attr = attrURL
   117			case contentTypeCSS:
   118				attr = attrStyle
   119			case contentTypeJS:
   120				attr = attrScript
   121			case contentTypeSrcset:
   122				attr = attrSrcset
   123			}
   124		}
   125	
   126		if j == len(s) {
   127			state = stateAttrName
   128		} else {
   129			state = stateAfterName
   130		}
   131		return context{state: state, element: c.element, attr: attr}, j
   132	}
   133	
   134	// tAttrName is the context transition function for stateAttrName.
   135	func tAttrName(c context, s []byte) (context, int) {
   136		i, err := eatAttrName(s, 0)
   137		if err != nil {
   138			return context{state: stateError, err: err}, len(s)
   139		} else if i != len(s) {
   140			c.state = stateAfterName
   141		}
   142		return c, i
   143	}
   144	
   145	// tAfterName is the context transition function for stateAfterName.
   146	func tAfterName(c context, s []byte) (context, int) {
   147		// Look for the start of the value.
   148		i := eatWhiteSpace(s, 0)
   149		if i == len(s) {
   150			return c, len(s)
   151		} else if s[i] != '=' {
   152			// Occurs due to tag ending '>', and valueless attribute.
   153			c.state = stateTag
   154			return c, i
   155		}
   156		c.state = stateBeforeValue
   157		// Consume the "=".
   158		return c, i + 1
   159	}
   160	
// attrStartStates maps an attribute kind to the state in which scanning of
// that attribute's value begins. tBeforeValue indexes it with c.attr.
var attrStartStates = [...]state{
	attrNone:       stateAttr,
	attrScript:     stateJS,
	attrScriptType: stateAttr,
	attrStyle:      stateCSS,
	attrURL:        stateURL,
	attrSrcset:     stateSrcset,
}
   169	
   170	// tBeforeValue is the context transition function for stateBeforeValue.
   171	func tBeforeValue(c context, s []byte) (context, int) {
   172		i := eatWhiteSpace(s, 0)
   173		if i == len(s) {
   174			return c, len(s)
   175		}
   176		// Find the attribute delimiter.
   177		delim := delimSpaceOrTagEnd
   178		switch s[i] {
   179		case '\'':
   180			delim, i = delimSingleQuote, i+1
   181		case '"':
   182			delim, i = delimDoubleQuote, i+1
   183		}
   184		c.state, c.delim = attrStartStates[c.attr], delim
   185		return c, i
   186	}
   187	
   188	// tHTMLCmt is the context transition function for stateHTMLCmt.
   189	func tHTMLCmt(c context, s []byte) (context, int) {
   190		if i := bytes.Index(s, commentEnd); i != -1 {
   191			return context{}, i + 3
   192		}
   193		return c, len(s)
   194	}
   195	
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
//
// The entries are the bare tag names; indexTagEnd looks for them after a
// "</" prefix. tSpecialTagEnd does not consult the table for elementNone.
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}
   204	
var (
	// specialTagEndPrefix is the start of an end tag; indexTagEnd searches
	// for it before trying to match the tag name.
	specialTagEndPrefix = []byte("</")
	// tagEndSeparators holds the bytes that indexTagEnd accepts directly
	// after the tag name of an end tag.
	tagEndSeparators    = []byte("> \t\n\f/")
)
   209	
   210	// tSpecialTagEnd is the context transition function for raw text and RCDATA
   211	// element states.
   212	func tSpecialTagEnd(c context, s []byte) (context, int) {
   213		if c.element != elementNone {
   214			if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
   215				return context{}, i
   216			}
   217		}
   218		return c, len(s)
   219	}
   220	
   221	// indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
   222	func indexTagEnd(s []byte, tag []byte) int {
   223		res := 0
   224		plen := len(specialTagEndPrefix)
   225		for len(s) > 0 {
   226			// Try to find the tag end prefix first
   227			i := bytes.Index(s, specialTagEndPrefix)
   228			if i == -1 {
   229				return i
   230			}
   231			s = s[i+plen:]
   232			// Try to match the actual tag if there is still space for it
   233			if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
   234				s = s[len(tag):]
   235				// Check the tag is followed by a proper separator
   236				if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
   237					return res + i
   238				}
   239				res += len(tag)
   240			}
   241			res += i + plen
   242		}
   243		return -1
   244	}
   245	
// tAttr is the context transition function for the attribute state.
// It consumes all of s and returns the context unchanged.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}
   250	
   251	// tURL is the context transition function for the URL state.
   252	func tURL(c context, s []byte) (context, int) {
   253		if bytes.ContainsAny(s, "#?") {
   254			c.urlPart = urlPartQueryOrFrag
   255		} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
   256			// HTML5 uses "Valid URL potentially surrounded by spaces" for
   257			// attrs: https://www.w3.org/TR/html5/index.html#attributes-1
   258			c.urlPart = urlPartPreQuery
   259		}
   260		return c, len(s)
   261	}
   262	
   263	// tJS is the context transition function for the JS state.
   264	func tJS(c context, s []byte) (context, int) {
   265		i := bytes.IndexAny(s, `"'/`)
   266		if i == -1 {
   267			// Entire input is non string, comment, regexp tokens.
   268			c.jsCtx = nextJSCtx(s, c.jsCtx)
   269			return c, len(s)
   270		}
   271		c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
   272		switch s[i] {
   273		case '"':
   274			c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
   275		case '\'':
   276			c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
   277		case '/':
   278			switch {
   279			case i+1 < len(s) && s[i+1] == '/':
   280				c.state, i = stateJSLineCmt, i+1
   281			case i+1 < len(s) && s[i+1] == '*':
   282				c.state, i = stateJSBlockCmt, i+1
   283			case c.jsCtx == jsCtxRegexp:
   284				c.state = stateJSRegexp
   285			case c.jsCtx == jsCtxDivOp:
   286				c.jsCtx = jsCtxRegexp
   287			default:
   288				return context{
   289					state: stateError,
   290					err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
   291				}, len(s)
   292			}
   293		default:
   294			panic("unreachable")
   295		}
   296		return c, i + 1
   297	}
   298	
   299	// tJSDelimited is the context transition function for the JS string and regexp
   300	// states.
   301	func tJSDelimited(c context, s []byte) (context, int) {
   302		specials := `\"`
   303		switch c.state {
   304		case stateJSSqStr:
   305			specials = `\'`
   306		case stateJSRegexp:
   307			specials = `\/[]`
   308		}
   309	
   310		k, inCharset := 0, false
   311		for {
   312			i := k + bytes.IndexAny(s[k:], specials)
   313			if i < k {
   314				break
   315			}
   316			switch s[i] {
   317			case '\\':
   318				i++
   319				if i == len(s) {
   320					return context{
   321						state: stateError,
   322						err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
   323					}, len(s)
   324				}
   325			case '[':
   326				inCharset = true
   327			case ']':
   328				inCharset = false
   329			default:
   330				// end delimiter
   331				if !inCharset {
   332					c.state, c.jsCtx = stateJS, jsCtxDivOp
   333					return c, i + 1
   334				}
   335			}
   336			k = i + 1
   337		}
   338	
   339		if inCharset {
   340			// This can be fixed by making context richer if interpolation
   341			// into charsets is desired.
   342			return context{
   343				state: stateError,
   344				err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
   345			}, len(s)
   346		}
   347	
   348		return c, len(s)
   349	}
   350	
// blockCommentEnd is the byte sequence that tBlockCmt searches for to find
// the end of a /*comment*/.
var blockCommentEnd = []byte("*/")
   352	
   353	// tBlockCmt is the context transition function for /*comment*/ states.
   354	func tBlockCmt(c context, s []byte) (context, int) {
   355		i := bytes.Index(s, blockCommentEnd)
   356		if i == -1 {
   357			return c, len(s)
   358		}
   359		switch c.state {
   360		case stateJSBlockCmt:
   361			c.state = stateJS
   362		case stateCSSBlockCmt:
   363			c.state = stateCSS
   364		default:
   365			panic(c.state.String())
   366		}
   367		return c, i + 2
   368	}
   369	
   370	// tLineCmt is the context transition function for //comment states.
   371	func tLineCmt(c context, s []byte) (context, int) {
   372		var lineTerminators string
   373		var endState state
   374		switch c.state {
   375		case stateJSLineCmt:
   376			lineTerminators, endState = "\n\r\u2028\u2029", stateJS
   377		case stateCSSLineCmt:
   378			lineTerminators, endState = "\n\f\r", stateCSS
   379			// Line comments are not part of any published CSS standard but
   380			// are supported by the 4 major browsers.
   381			// This defines line comments as
   382			//     LINECOMMENT ::= "//" [^\n\f\d]*
   383			// since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
   384			// newlines:
   385			//     nl ::= #xA | #xD #xA | #xD | #xC
   386		default:
   387			panic(c.state.String())
   388		}
   389	
   390		i := bytes.IndexAny(s, lineTerminators)
   391		if i == -1 {
   392			return c, len(s)
   393		}
   394		c.state = endState
   395		// Per section 7.4 of EcmaScript 5 : https://es5.github.com/#x7.4
   396		// "However, the LineTerminator at the end of the line is not
   397		// considered to be part of the single-line comment; it is
   398		// recognized separately by the lexical grammar and becomes part
   399		// of the stream of input elements for the syntactic grammar."
   400		return c, i
   401	}
   402	
   403	// tCSS is the context transition function for the CSS state.
   404	func tCSS(c context, s []byte) (context, int) {
   405		// CSS quoted strings are almost never used except for:
   406		// (1) URLs as in background: "/foo.png"
   407		// (2) Multiword font-names as in font-family: "Times New Roman"
   408		// (3) List separators in content values as in inline-lists:
   409		//    <style>
   410		//    ul.inlineList { list-style: none; padding:0 }
   411		//    ul.inlineList > li { display: inline }
   412		//    ul.inlineList > li:before { content: ", " }
   413		//    ul.inlineList > li:first-child:before { content: "" }
   414		//    </style>
   415		//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
   416		// (4) Attribute value selectors as in a[href="http://example.com/"]
   417		//
   418		// We conservatively treat all strings as URLs, but make some
   419		// allowances to avoid confusion.
   420		//
   421		// In (1), our conservative assumption is justified.
   422		// In (2), valid font names do not contain ':', '?', or '#', so our
   423		// conservative assumption is fine since we will never transition past
   424		// urlPartPreQuery.
   425		// In (3), our protocol heuristic should not be tripped, and there
   426		// should not be non-space content after a '?' or '#', so as long as
   427		// we only %-encode RFC 3986 reserved characters we are ok.
   428		// In (4), we should URL escape for URL attributes, and for others we
   429		// have the attribute name available if our conservative assumption
   430		// proves problematic for real code.
   431	
   432		k := 0
   433		for {
   434			i := k + bytes.IndexAny(s[k:], `("'/`)
   435			if i < k {
   436				return c, len(s)
   437			}
   438			switch s[i] {
   439			case '(':
   440				// Look for url to the left.
   441				p := bytes.TrimRight(s[:i], "\t\n\f\r ")
   442				if endsWithCSSKeyword(p, "url") {
   443					j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
   444					switch {
   445					case j != len(s) && s[j] == '"':
   446						c.state, j = stateCSSDqURL, j+1
   447					case j != len(s) && s[j] == '\'':
   448						c.state, j = stateCSSSqURL, j+1
   449					default:
   450						c.state = stateCSSURL
   451					}
   452					return c, j
   453				}
   454			case '/':
   455				if i+1 < len(s) {
   456					switch s[i+1] {
   457					case '/':
   458						c.state = stateCSSLineCmt
   459						return c, i + 2
   460					case '*':
   461						c.state = stateCSSBlockCmt
   462						return c, i + 2
   463					}
   464				}
   465			case '"':
   466				c.state = stateCSSDqStr
   467				return c, i + 1
   468			case '\'':
   469				c.state = stateCSSSqStr
   470				return c, i + 1
   471			}
   472			k = i + 1
   473		}
   474	}
   475	
   476	// tCSSStr is the context transition function for the CSS string and URL states.
   477	func tCSSStr(c context, s []byte) (context, int) {
   478		var endAndEsc string
   479		switch c.state {
   480		case stateCSSDqStr, stateCSSDqURL:
   481			endAndEsc = `\"`
   482		case stateCSSSqStr, stateCSSSqURL:
   483			endAndEsc = `\'`
   484		case stateCSSURL:
   485			// Unquoted URLs end with a newline or close parenthesis.
   486			// The below includes the wc (whitespace character) and nl.
   487			endAndEsc = "\\\t\n\f\r )"
   488		default:
   489			panic(c.state.String())
   490		}
   491	
   492		k := 0
   493		for {
   494			i := k + bytes.IndexAny(s[k:], endAndEsc)
   495			if i < k {
   496				c, nread := tURL(c, decodeCSS(s[k:]))
   497				return c, k + nread
   498			}
   499			if s[i] == '\\' {
   500				i++
   501				if i == len(s) {
   502					return context{
   503						state: stateError,
   504						err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
   505					}, len(s)
   506				}
   507			} else {
   508				c.state = stateCSS
   509				return c, i + 1
   510			}
   511			c, _ = tURL(c, decodeCSS(s[:i+1]))
   512			k = i + 1
   513		}
   514	}
   515	
// tError is the context transition function for the error state.
// It consumes all of s and returns the context unchanged, so the context
// stays in the error state for the rest of the input.
func tError(c context, s []byte) (context, int) {
	return c, len(s)
}
   520	
   521	// eatAttrName returns the largest j such that s[i:j] is an attribute name.
   522	// It returns an error if s[i:] does not look like it begins with an
   523	// attribute name, such as encountering a quote mark without a preceding
   524	// equals sign.
   525	func eatAttrName(s []byte, i int) (int, *Error) {
   526		for j := i; j < len(s); j++ {
   527			switch s[j] {
   528			case ' ', '\t', '\n', '\f', '\r', '=', '>':
   529				return j, nil
   530			case '\'', '"', '<':
   531				// These result in a parse warning in HTML5 and are
   532				// indicative of serious problems if seen in an attr
   533				// name in a template.
   534				return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
   535			default:
   536				// No-op.
   537			}
   538		}
   539		return len(s), nil
   540	}
   541	
// elementNameMap maps lower-case tag names to the element constants for the
// elements whose content gets special treatment. eatTagName looks names up
// here after lower-casing them; a name that is absent yields the zero
// element value (presumably elementNone -- confirm against its declaration).
var elementNameMap = map[string]element{
	"script":   elementScript,
	"style":    elementStyle,
	"textarea": elementTextarea,
	"title":    elementTitle,
}
   548	
   549	// asciiAlpha reports whether c is an ASCII letter.
   550	func asciiAlpha(c byte) bool {
   551		return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
   552	}
   553	
   554	// asciiAlphaNum reports whether c is an ASCII letter or digit.
   555	func asciiAlphaNum(c byte) bool {
   556		return asciiAlpha(c) || '0' <= c && c <= '9'
   557	}
   558	
   559	// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
   560	func eatTagName(s []byte, i int) (int, element) {
   561		if i == len(s) || !asciiAlpha(s[i]) {
   562			return i, elementNone
   563		}
   564		j := i + 1
   565		for j < len(s) {
   566			x := s[j]
   567			if asciiAlphaNum(x) {
   568				j++
   569				continue
   570			}
   571			// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
   572			if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
   573				j += 2
   574				continue
   575			}
   576			break
   577		}
   578		return j, elementNameMap[strings.ToLower(string(s[i:j]))]
   579	}
   580	
   581	// eatWhiteSpace returns the largest j such that s[i:j] is white space.
   582	func eatWhiteSpace(s []byte, i int) int {
   583		for j := i; j < len(s); j++ {
   584			switch s[j] {
   585			case ' ', '\t', '\n', '\f', '\r':
   586				// No-op.
   587			default:
   588				return j
   589			}
   590		}
   591		return len(s)
   592	}
   593
View as plain text