Source file src/pkg/html/escape.go

     1	// Copyright 2010 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package html provides functions for escaping and unescaping HTML text.
     6	package html
     7	
     8	import (
     9		"strings"
    10		"unicode/utf8"
    11	)
    12	
    13	// These replacements permit compatibility with old numeric entities that
    14	// assumed Windows-1252 encoding.
    15	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    16	var replacementTable = [...]rune{
    17		'\u20AC', // First entry is what 0x80 should be replaced with.
    18		'\u0081',
    19		'\u201A',
    20		'\u0192',
    21		'\u201E',
    22		'\u2026',
    23		'\u2020',
    24		'\u2021',
    25		'\u02C6',
    26		'\u2030',
    27		'\u0160',
    28		'\u2039',
    29		'\u0152',
    30		'\u008D',
    31		'\u017D',
    32		'\u008F',
    33		'\u0090',
    34		'\u2018',
    35		'\u2019',
    36		'\u201C',
    37		'\u201D',
    38		'\u2022',
    39		'\u2013',
    40		'\u2014',
    41		'\u02DC',
    42		'\u2122',
    43		'\u0161',
    44		'\u203A',
    45		'\u0153',
    46		'\u009D',
    47		'\u017E',
    48		'\u0178', // Last entry is 0x9F.
    49		// 0x00->'\uFFFD' is handled programmatically.
    50		// 0x0D->'\u000D' is a no-op.
    51	}
    52	
    53	// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
    54	// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    55	// Precondition: b[src] == '&' && dst <= src.
    56	func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
    57		const attribute = false
    58	
    59		// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    60	
    61		// i starts at 1 because we already know that s[0] == '&'.
    62		i, s := 1, b[src:]
    63	
    64		if len(s) <= 1 {
    65			b[dst] = b[src]
    66			return dst + 1, src + 1
    67		}
    68	
    69		if s[i] == '#' {
    70			if len(s) <= 3 { // We need to have at least "&#.".
    71				b[dst] = b[src]
    72				return dst + 1, src + 1
    73			}
    74			i++
    75			c := s[i]
    76			hex := false
    77			if c == 'x' || c == 'X' {
    78				hex = true
    79				i++
    80			}
    81	
    82			x := '\x00'
    83			for i < len(s) {
    84				c = s[i]
    85				i++
    86				if hex {
    87					if '0' <= c && c <= '9' {
    88						x = 16*x + rune(c) - '0'
    89						continue
    90					} else if 'a' <= c && c <= 'f' {
    91						x = 16*x + rune(c) - 'a' + 10
    92						continue
    93					} else if 'A' <= c && c <= 'F' {
    94						x = 16*x + rune(c) - 'A' + 10
    95						continue
    96					}
    97				} else if '0' <= c && c <= '9' {
    98					x = 10*x + rune(c) - '0'
    99					continue
   100				}
   101				if c != ';' {
   102					i--
   103				}
   104				break
   105			}
   106	
   107			if i <= 3 { // No characters matched.
   108				b[dst] = b[src]
   109				return dst + 1, src + 1
   110			}
   111	
   112			if 0x80 <= x && x <= 0x9F {
   113				// Replace characters from Windows-1252 with UTF-8 equivalents.
   114				x = replacementTable[x-0x80]
   115			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   116				// Replace invalid characters with the replacement character.
   117				x = '\uFFFD'
   118			}
   119	
   120			return dst + utf8.EncodeRune(b[dst:], x), src + i
   121		}
   122	
   123		// Consume the maximum number of characters possible, with the
   124		// consumed characters matching one of the named references.
   125	
   126		for i < len(s) {
   127			c := s[i]
   128			i++
   129			// Lower-cased characters are more common in entities, so we check for them first.
   130			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   131				continue
   132			}
   133			if c != ';' {
   134				i--
   135			}
   136			break
   137		}
   138	
   139		entityName := s[1:i]
   140		if len(entityName) == 0 {
   141			// No-op.
   142		} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   143			// No-op.
   144		} else if x := entity[string(entityName)]; x != 0 {
   145			return dst + utf8.EncodeRune(b[dst:], x), src + i
   146		} else if x := entity2[string(entityName)]; x[0] != 0 {
   147			dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   148			return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   149		} else if !attribute {
   150			maxLen := len(entityName) - 1
   151			if maxLen > longestEntityWithoutSemicolon {
   152				maxLen = longestEntityWithoutSemicolon
   153			}
   154			for j := maxLen; j > 1; j-- {
   155				if x := entity[string(entityName[:j])]; x != 0 {
   156					return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   157				}
   158			}
   159		}
   160	
   161		dst1, src1 = dst+i, src+i
   162		copy(b[dst:dst1], b[src:src1])
   163		return dst1, src1
   164	}
   165	
   166	var htmlEscaper = strings.NewReplacer(
   167		`&`, "&amp;",
   168		`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   169		`<`, "&lt;",
   170		`>`, "&gt;",
   171		`"`, "&#34;", // "&#34;" is shorter than "&quot;".
   172	)
   173	
   174	// EscapeString escapes special characters like "<" to become "&lt;". It
   175	// escapes only five such characters: <, >, &, ' and ".
   176	// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   177	// always true.
   178	func EscapeString(s string) string {
   179		return htmlEscaper.Replace(s)
   180	}
   181	
   182	// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   183	// larger range of entities than EscapeString escapes. For example, "&aacute;"
   184	// unescapes to "á", as does "&#225;" and "&#xE1;".
   185	// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   186	// always true.
   187	func UnescapeString(s string) string {
   188		populateMapsOnce.Do(populateMaps)
   189		i := strings.IndexByte(s, '&')
   190	
   191		if i < 0 {
   192			return s
   193		}
   194	
   195		b := []byte(s)
   196		dst, src := unescapeEntity(b, i, i)
   197		for len(s[src:]) > 0 {
   198			if s[src] == '&' {
   199				i = 0
   200			} else {
   201				i = strings.IndexByte(s[src:], '&')
   202			}
   203			if i < 0 {
   204				dst += copy(b[dst:], s[src:])
   205				break
   206			}
   207	
   208			if i > 0 {
   209				copy(b[dst:], s[src:src+i])
   210			}
   211			dst, src = unescapeEntity(b, dst+i, src+i)
   212		}
   213		return string(b[:dst])
   214	}
   215
View as plain text