...

Source file src/pkg/net/http/sniff.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package http
     6	
     7	import (
     8		"bytes"
     9		"encoding/binary"
    10	)
    11	
// sniffLen is the maximum number of leading bytes the sniffing algorithm
// uses to make its decision; DetectContentType truncates longer input to
// this length before trying any signature.
const sniffLen = 512
    14	
    15	// DetectContentType implements the algorithm described
    16	// at https://mimesniff.spec.whatwg.org/ to determine the
    17	// Content-Type of the given data. It considers at most the
    18	// first 512 bytes of data. DetectContentType always returns
    19	// a valid MIME type: if it cannot determine a more specific one, it
    20	// returns "application/octet-stream".
    21	func DetectContentType(data []byte) string {
    22		if len(data) > sniffLen {
    23			data = data[:sniffLen]
    24		}
    25	
    26		// Index of the first non-whitespace byte in data.
    27		firstNonWS := 0
    28		for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
    29		}
    30	
    31		for _, sig := range sniffSignatures {
    32			if ct := sig.match(data, firstNonWS); ct != "" {
    33				return ct
    34			}
    35		}
    36	
    37		return "application/octet-stream" // fallback
    38	}
    39	
    40	// isWS reports whether the provided byte is a whitespace byte (0xWS)
    41	// as defined in https://mimesniff.spec.whatwg.org/#terminology.
    42	func isWS(b byte) bool {
    43		switch b {
    44		case '\t', '\n', '\x0c', '\r', ' ':
    45			return true
    46		}
    47		return false
    48	}
    49	
    50	// isTT reports whether the provided byte is a tag-terminating byte (0xTT)
    51	// as defined in https://mimesniff.spec.whatwg.org/#terminology.
    52	func isTT(b byte) bool {
    53		switch b {
    54		case ' ', '>':
    55			return true
    56		}
    57		return false
    58	}
    59	
// sniffSig is a single content-type signature. DetectContentType tries
// each entry of sniffSignatures in order and returns the first non-empty
// result.
type sniffSig interface {
	// match returns the MIME type of the data, or "" if unknown.
	// firstNonWS is the index of the first non-whitespace byte in data
	// as computed by DetectContentType; it equals len(data) when data
	// contains no non-whitespace byte.
	match(data []byte, firstNonWS int) string
}
    64	
// sniffSignatures holds the signatures tried by DetectContentType,
// with data matching the table in section 6 of the MIME sniffing spec.
// Order is significant: the first entry whose match method returns a
// non-empty content type wins.
var sniffSignatures = []sniffSig{
	// HTML tags. htmlSig skips leading whitespace, compares the letters
	// case-insensitively and requires a tag-terminating byte afterwards.
	htmlSig("<!DOCTYPE HTML"),
	htmlSig("<HTML"),
	htmlSig("<HEAD"),
	htmlSig("<SCRIPT"),
	htmlSig("<IFRAME"),
	htmlSig("<H1"),
	htmlSig("<DIV"),
	htmlSig("<FONT"),
	htmlSig("<TABLE"),
	htmlSig("<A"),
	htmlSig("<STYLE"),
	htmlSig("<TITLE"),
	htmlSig("<B"),
	htmlSig("<BODY"),
	htmlSig("<BR"),
	htmlSig("<P"),
	htmlSig("<!--"),
	// XML declaration, allowed to follow leading whitespace.
	&maskedSig{
		mask:   []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:    []byte("<?xml"),
		skipWS: true,
		ct:     "text/xml; charset=utf-8"},
	&exactSig{[]byte("%PDF-"), "application/pdf"},
	&exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},

	// UTF BOMs.
	// The zero mask bytes mean the bytes following the BOM may hold any
	// value, but they must be present for the signature to match.
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFE\xFF\x00\x00"),
		ct:   "text/plain; charset=utf-16be",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFF\xFE\x00\x00"),
		ct:   "text/plain; charset=utf-16le",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\x00"),
		pat:  []byte("\xEF\xBB\xBF\x00"),
		ct:   "text/plain; charset=utf-8",
	},

	// Image types
	// For posterity, we originally returned "image/vnd.microsoft.icon" from
	// https://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03#section-7
	// https://codereview.appspot.com/4746042
	// but that has since been replaced with "image/x-icon" in Section 6.2
	// of https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern
	&exactSig{[]byte("\x00\x00\x01\x00"), "image/x-icon"},
	&exactSig{[]byte("\x00\x00\x02\x00"), "image/x-icon"},
	&exactSig{[]byte("BM"), "image/bmp"},
	&exactSig{[]byte("GIF87a"), "image/gif"},
	&exactSig{[]byte("GIF89a"), "image/gif"},
	// "RIFF", four ignored bytes, then "WEBPVP".
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WEBPVP"),
		ct:   "image/webp",
	},
	&exactSig{[]byte("\x89PNG\x0D\x0A\x1A\x0A"), "image/png"},
	&exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},

	// Audio and Video types
	// Enforce the pattern match ordering as prescribed in
	// https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF"),
		pat:  []byte(".snd"),
		ct:   "audio/basic",
	},
	// "FORM", four ignored bytes, then "AIFF".
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
		ct:   "audio/aiff",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF"),
		pat:  []byte("ID3"),
		ct:   "audio/mpeg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("OggS\x00"),
		ct:   "application/ogg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("MThd\x00\x00\x00\x06"),
		ct:   "audio/midi",
	},
	// "RIFF", four ignored bytes, then "AVI ".
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00AVI "),
		ct:   "video/avi",
	},
	// "RIFF", four ignored bytes, then "WAVE".
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WAVE"),
		ct:   "audio/wave",
	},
	// 6.2.0.2. video/mp4
	mp4Sig{},
	// 6.2.0.3. video/webm
	&exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},

	// Font types
	&maskedSig{
		// 34 NULL bytes followed by the string "LP"
		pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"),
		// 34 NULL bytes followed by \xFF\xFF, so only the trailing "LP"
		// is actually compared.
		mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
		ct:   "application/vnd.ms-fontobject",
	},
	&exactSig{[]byte("\x00\x01\x00\x00"), "font/ttf"},
	&exactSig{[]byte("OTTO"), "font/otf"},
	&exactSig{[]byte("ttcf"), "font/collection"},
	&exactSig{[]byte("wOFF"), "font/woff"},
	&exactSig{[]byte("wOF2"), "font/woff2"},

	// Archive types
	&exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
	&exactSig{[]byte("PK\x03\x04"), "application/zip"},
	// RAR's signatures are incorrectly defined by the MIME spec as per
	//    https://github.com/whatwg/mimesniff/issues/63
	// However, RAR Labs correctly defines it at:
	//    https://www.rarlab.com/technote.htm#rarsign
	// so we use the definition from RAR Labs.
	// TODO: do whatever the spec ends up doing.
	&exactSig{[]byte("Rar!\x1A\x07\x00"), "application/x-rar-compressed"},     // RAR v1.5-v4.0
	&exactSig{[]byte("Rar!\x1A\x07\x01\x00"), "application/x-rar-compressed"}, // RAR v5+

	// WebAssembly binary: a NUL byte followed by "asm".
	&exactSig{[]byte("\x00\x61\x73\x6D"), "application/wasm"},

	// textSig accepts any data that contains no binary byte after the
	// leading whitespace, so entries placed after it would be shadowed
	// for such data.
	textSig{}, // should be last
}
   201	
   202	type exactSig struct {
   203		sig []byte
   204		ct  string
   205	}
   206	
   207	func (e *exactSig) match(data []byte, firstNonWS int) string {
   208		if bytes.HasPrefix(data, e.sig) {
   209			return e.ct
   210		}
   211		return ""
   212	}
   213	
   214	type maskedSig struct {
   215		mask, pat []byte
   216		skipWS    bool
   217		ct        string
   218	}
   219	
   220	func (m *maskedSig) match(data []byte, firstNonWS int) string {
   221		// pattern matching algorithm section 6
   222		// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
   223	
   224		if m.skipWS {
   225			data = data[firstNonWS:]
   226		}
   227		if len(m.pat) != len(m.mask) {
   228			return ""
   229		}
   230		if len(data) < len(m.pat) {
   231			return ""
   232		}
   233		for i, pb := range m.pat {
   234			maskedData := data[i] & m.mask[i]
   235			if maskedData != pb {
   236				return ""
   237			}
   238		}
   239		return m.ct
   240	}
   241	
   242	type htmlSig []byte
   243	
   244	func (h htmlSig) match(data []byte, firstNonWS int) string {
   245		data = data[firstNonWS:]
   246		if len(data) < len(h)+1 {
   247			return ""
   248		}
   249		for i, b := range h {
   250			db := data[i]
   251			if 'A' <= b && b <= 'Z' {
   252				db &= 0xDF
   253			}
   254			if b != db {
   255				return ""
   256			}
   257		}
   258		// Next byte must be a tag-terminating byte(0xTT).
   259		if !isTT(data[len(h)]) {
   260			return ""
   261		}
   262		return "text/html; charset=utf-8"
   263	}
   264	
   265	var mp4ftype = []byte("ftyp")
   266	var mp4 = []byte("mp4")
   267	
   268	type mp4Sig struct{}
   269	
   270	func (mp4Sig) match(data []byte, firstNonWS int) string {
   271		// https://mimesniff.spec.whatwg.org/#signature-for-mp4
   272		// c.f. section 6.2.1
   273		if len(data) < 12 {
   274			return ""
   275		}
   276		boxSize := int(binary.BigEndian.Uint32(data[:4]))
   277		if len(data) < boxSize || boxSize%4 != 0 {
   278			return ""
   279		}
   280		if !bytes.Equal(data[4:8], mp4ftype) {
   281			return ""
   282		}
   283		for st := 8; st < boxSize; st += 4 {
   284			if st == 12 {
   285				// Ignores the four bytes that correspond to the version number of the "major brand".
   286				continue
   287			}
   288			if bytes.Equal(data[st:st+3], mp4) {
   289				return "video/mp4"
   290			}
   291		}
   292		return ""
   293	}
   294	
   295	type textSig struct{}
   296	
   297	func (textSig) match(data []byte, firstNonWS int) string {
   298		// c.f. section 5, step 4.
   299		for _, b := range data[firstNonWS:] {
   300			switch {
   301			case b <= 0x08,
   302				b == 0x0B,
   303				0x0E <= b && b <= 0x1A,
   304				0x1C <= b && b <= 0x1F:
   305				return ""
   306			}
   307		}
   308		return "text/plain; charset=utf-8"
   309	}
   310	

View as plain text