Source file src/pkg/mime/encodedword.go

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package mime
     6	
     7	import (
     8		"bytes"
     9		"encoding/base64"
    10		"errors"
    11		"fmt"
    12		"io"
    13		"strings"
    14		"unicode"
    15		"unicode/utf8"
    16	)
    17	
// A WordEncoder is an RFC 2047 encoded-word encoder.
// Its underlying byte value is the encoding letter that is written into the
// header of every encoded-word it produces.
type WordEncoder byte
    20	
const (
	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
	// Encoded-words produced with it carry the 'b' encoding letter.
	BEncoding = WordEncoder('b')
	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
	// Encoded-words produced with it carry the 'q' encoding letter.
	QEncoding = WordEncoder('q')
)
    27	
var (
	// errInvalidWord is returned when an encoded-word is not correctly
	// framed, names an unknown encoding, or contains malformed Q-encoded
	// text.
	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
)
    31	
    32	// Encode returns the encoded-word form of s. If s is ASCII without special
    33	// characters, it is returned unchanged. The provided charset is the IANA
    34	// charset name of s. It is case insensitive.
    35	func (e WordEncoder) Encode(charset, s string) string {
    36		if !needsEncoding(s) {
    37			return s
    38		}
    39		return e.encodeWord(charset, s)
    40	}
    41	
    42	func needsEncoding(s string) bool {
    43		for _, b := range s {
    44			if (b < ' ' || b > '~') && b != '\t' {
    45				return true
    46			}
    47		}
    48		return false
    49	}
    50	
    51	// encodeWord encodes a string into an encoded-word.
    52	func (e WordEncoder) encodeWord(charset, s string) string {
    53		var buf strings.Builder
    54		// Could use a hint like len(s)*3, but that's not enough for cases
    55		// with word splits and too much for simpler inputs.
    56		// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
    57		buf.Grow(48)
    58	
    59		e.openWord(&buf, charset)
    60		if e == BEncoding {
    61			e.bEncode(&buf, charset, s)
    62		} else {
    63			e.qEncode(&buf, charset, s)
    64		}
    65		closeWord(&buf)
    66	
    67		return buf.String()
    68	}
    69	
const (
	// The maximum length of an encoded-word is 75 characters.
	// See RFC 2047, section 2.
	maxEncodedWordLen = 75
	// maxContentLen is how much content can be encoded, ignoring the header and
	// 2-byte footer. The header length is computed for the "UTF-8" charset,
	// which is the only charset for which the encoders split encoded-words.
	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
)
    78	
// maxBase64Len is the decoded size, in bytes, that corresponds to
// maxContentLen characters of base64 text. bEncode uses it as the limit on
// the number of raw input bytes placed in a single encoded-word.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
    80	
    81	// bEncode encodes s using base64 encoding and writes it to buf.
    82	func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
    83		w := base64.NewEncoder(base64.StdEncoding, buf)
    84		// If the charset is not UTF-8 or if the content is short, do not bother
    85		// splitting the encoded-word.
    86		if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
    87			io.WriteString(w, s)
    88			w.Close()
    89			return
    90		}
    91	
    92		var currentLen, last, runeLen int
    93		for i := 0; i < len(s); i += runeLen {
    94			// Multi-byte characters must not be split across encoded-words.
    95			// See RFC 2047, section 5.3.
    96			_, runeLen = utf8.DecodeRuneInString(s[i:])
    97	
    98			if currentLen+runeLen <= maxBase64Len {
    99				currentLen += runeLen
   100			} else {
   101				io.WriteString(w, s[last:i])
   102				w.Close()
   103				e.splitWord(buf, charset)
   104				last = i
   105				currentLen = runeLen
   106			}
   107		}
   108		io.WriteString(w, s[last:])
   109		w.Close()
   110	}
   111	
   112	// qEncode encodes s using Q encoding and writes it to buf. It splits the
   113	// encoded-words when necessary.
   114	func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
   115		// We only split encoded-words when the charset is UTF-8.
   116		if !isUTF8(charset) {
   117			writeQString(buf, s)
   118			return
   119		}
   120	
   121		var currentLen, runeLen int
   122		for i := 0; i < len(s); i += runeLen {
   123			b := s[i]
   124			// Multi-byte characters must not be split across encoded-words.
   125			// See RFC 2047, section 5.3.
   126			var encLen int
   127			if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
   128				runeLen, encLen = 1, 1
   129			} else {
   130				_, runeLen = utf8.DecodeRuneInString(s[i:])
   131				encLen = 3 * runeLen
   132			}
   133	
   134			if currentLen+encLen > maxContentLen {
   135				e.splitWord(buf, charset)
   136				currentLen = 0
   137			}
   138			writeQString(buf, s[i:i+runeLen])
   139			currentLen += encLen
   140		}
   141	}
   142	
   143	// writeQString encodes s using Q encoding and writes it to buf.
   144	func writeQString(buf *strings.Builder, s string) {
   145		for i := 0; i < len(s); i++ {
   146			switch b := s[i]; {
   147			case b == ' ':
   148				buf.WriteByte('_')
   149			case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
   150				buf.WriteByte(b)
   151			default:
   152				buf.WriteByte('=')
   153				buf.WriteByte(upperhex[b>>4])
   154				buf.WriteByte(upperhex[b&0x0f])
   155			}
   156		}
   157	}
   158	
   159	// openWord writes the beginning of an encoded-word into buf.
   160	func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
   161		buf.WriteString("=?")
   162		buf.WriteString(charset)
   163		buf.WriteByte('?')
   164		buf.WriteByte(byte(e))
   165		buf.WriteByte('?')
   166	}
   167	
   168	// closeWord writes the end of an encoded-word into buf.
   169	func closeWord(buf *strings.Builder) {
   170		buf.WriteString("?=")
   171	}
   172	
   173	// splitWord closes the current encoded-word and opens a new one.
   174	func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
   175		closeWord(buf)
   176		buf.WriteByte(' ')
   177		e.openWord(buf, charset)
   178	}
   179	
   180	func isUTF8(charset string) bool {
   181		return strings.EqualFold(charset, "UTF-8")
   182	}
   183	
// upperhex maps a 4-bit value to its upper-case hexadecimal digit.
const upperhex = "0123456789ABCDEF"
   185	
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name is lower-cased before it is passed to CharsetReader.
	// The utf-8, iso-8859-1 and us-ascii charsets are handled by default
	// and are not passed to CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
   196	
   197	// Decode decodes an RFC 2047 encoded-word.
   198	func (d *WordDecoder) Decode(word string) (string, error) {
   199		// See https://tools.ietf.org/html/rfc2047#section-2 for details.
   200		// Our decoder is permissive, we accept empty encoded-text.
   201		if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
   202			return "", errInvalidWord
   203		}
   204		word = word[2 : len(word)-2]
   205	
   206		// split delimits the first 2 fields
   207		split := strings.IndexByte(word, '?')
   208	
   209		// split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
   210		charset := word[:split]
   211		if len(charset) == 0 {
   212			return "", errInvalidWord
   213		}
   214		if len(word) < split+3 {
   215			return "", errInvalidWord
   216		}
   217		encoding := word[split+1]
   218		// the field after split must only be one byte
   219		if word[split+2] != '?' {
   220			return "", errInvalidWord
   221		}
   222		text := word[split+3:]
   223	
   224		content, err := decode(encoding, text)
   225		if err != nil {
   226			return "", err
   227		}
   228	
   229		var buf strings.Builder
   230	
   231		if err := d.convert(&buf, charset, content); err != nil {
   232			return "", err
   233		}
   234	
   235		return buf.String(), nil
   236	}
   237	
   238	// DecodeHeader decodes all encoded-words of the given string. It returns an
   239	// error if and only if CharsetReader of d returns an error.
   240	func (d *WordDecoder) DecodeHeader(header string) (string, error) {
   241		// If there is no encoded-word, returns before creating a buffer.
   242		i := strings.Index(header, "=?")
   243		if i == -1 {
   244			return header, nil
   245		}
   246	
   247		var buf strings.Builder
   248	
   249		buf.WriteString(header[:i])
   250		header = header[i:]
   251	
   252		betweenWords := false
   253		for {
   254			start := strings.Index(header, "=?")
   255			if start == -1 {
   256				break
   257			}
   258			cur := start + len("=?")
   259	
   260			i := strings.Index(header[cur:], "?")
   261			if i == -1 {
   262				break
   263			}
   264			charset := header[cur : cur+i]
   265			cur += i + len("?")
   266	
   267			if len(header) < cur+len("Q??=") {
   268				break
   269			}
   270			encoding := header[cur]
   271			cur++
   272	
   273			if header[cur] != '?' {
   274				break
   275			}
   276			cur++
   277	
   278			j := strings.Index(header[cur:], "?=")
   279			if j == -1 {
   280				break
   281			}
   282			text := header[cur : cur+j]
   283			end := cur + j + len("?=")
   284	
   285			content, err := decode(encoding, text)
   286			if err != nil {
   287				betweenWords = false
   288				buf.WriteString(header[:start+2])
   289				header = header[start+2:]
   290				continue
   291			}
   292	
   293			// Write characters before the encoded-word. White-space and newline
   294			// characters separating two encoded-words must be deleted.
   295			if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
   296				buf.WriteString(header[:start])
   297			}
   298	
   299			if err := d.convert(&buf, charset, content); err != nil {
   300				return "", err
   301			}
   302	
   303			header = header[end:]
   304			betweenWords = true
   305		}
   306	
   307		if len(header) > 0 {
   308			buf.WriteString(header)
   309		}
   310	
   311		return buf.String(), nil
   312	}
   313	
   314	func decode(encoding byte, text string) ([]byte, error) {
   315		switch encoding {
   316		case 'B', 'b':
   317			return base64.StdEncoding.DecodeString(text)
   318		case 'Q', 'q':
   319			return qDecode(text)
   320		default:
   321			return nil, errInvalidWord
   322		}
   323	}
   324	
   325	func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
   326		switch {
   327		case strings.EqualFold("utf-8", charset):
   328			buf.Write(content)
   329		case strings.EqualFold("iso-8859-1", charset):
   330			for _, c := range content {
   331				buf.WriteRune(rune(c))
   332			}
   333		case strings.EqualFold("us-ascii", charset):
   334			for _, c := range content {
   335				if c >= utf8.RuneSelf {
   336					buf.WriteRune(unicode.ReplacementChar)
   337				} else {
   338					buf.WriteByte(c)
   339				}
   340			}
   341		default:
   342			if d.CharsetReader == nil {
   343				return fmt.Errorf("mime: unhandled charset %q", charset)
   344			}
   345			r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
   346			if err != nil {
   347				return err
   348			}
   349			if _, err = io.Copy(buf, r); err != nil {
   350				return err
   351			}
   352		}
   353		return nil
   354	}
   355	
   356	// hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
   357	// one byte of non-whitespace.
   358	func hasNonWhitespace(s string) bool {
   359		for _, b := range s {
   360			switch b {
   361			// Encoded-words can only be separated by linear white spaces which does
   362			// not include vertical tabs (\v).
   363			case ' ', '\t', '\n', '\r':
   364			default:
   365				return true
   366			}
   367		}
   368		return false
   369	}
   370	
   371	// qDecode decodes a Q encoded string.
   372	func qDecode(s string) ([]byte, error) {
   373		dec := make([]byte, len(s))
   374		n := 0
   375		for i := 0; i < len(s); i++ {
   376			switch c := s[i]; {
   377			case c == '_':
   378				dec[n] = ' '
   379			case c == '=':
   380				if i+2 >= len(s) {
   381					return nil, errInvalidWord
   382				}
   383				b, err := readHexByte(s[i+1], s[i+2])
   384				if err != nil {
   385					return nil, err
   386				}
   387				dec[n] = b
   388				i += 2
   389			case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
   390				dec[n] = c
   391			default:
   392				return nil, errInvalidWord
   393			}
   394			n++
   395		}
   396	
   397		return dec[:n], nil
   398	}
   399	
   400	// readHexByte returns the byte from its quoted-printable representation.
   401	func readHexByte(a, b byte) (byte, error) {
   402		var hb, lb byte
   403		var err error
   404		if hb, err = fromHex(a); err != nil {
   405			return 0, err
   406		}
   407		if lb, err = fromHex(b); err != nil {
   408			return 0, err
   409		}
   410		return hb<<4 | lb, nil
   411	}
   412	
   413	func fromHex(b byte) (byte, error) {
   414		switch {
   415		case b >= '0' && b <= '9':
   416			return b - '0', nil
   417		case b >= 'A' && b <= 'F':
   418			return b - 'A' + 10, nil
   419		// Accept badly encoded bytes.
   420		case b >= 'a' && b <= 'f':
   421			return b - 'a' + 10, nil
   422		}
   423		return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
   424	}
   425
View as plain text