Source file src/go/doc/comment.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Godoc comment extraction and comment -> HTML formatting.
     6	
     7	package doc
     8	
     9	import (
    10		"bytes"
    11		"internal/lazyregexp"
    12		"io"
    13		"strings"
    14		"text/template" // for HTMLEscape
    15		"unicode"
    16		"unicode/utf8"
    17	)
    18	
// Typographic double quotes, as HTML entities and as the corresponding
// Unicode characters.
const (
	ldquo = "&ldquo;" // HTML entity that replaces ulquo in HTML output
	rdquo = "&rdquo;" // HTML entity that replaces urquo in HTML output
	ulquo = "“"       // Unicode left double quote; replaces `` in comment text
	urquo = "”"       // Unicode right double quote; replaces '' in comment text
)
    25	
// Replacers used for quote conversion:
// htmlQuoteReplacer turns the Unicode quotes ulquo and urquo into the
// HTML entities ldquo and rdquo; unicodeQuoteReplacer turns `` and ''
// into the Unicode quotes ulquo and urquo.
var (
	htmlQuoteReplacer    = strings.NewReplacer(ulquo, ldquo, urquo, rdquo)
	unicodeQuoteReplacer = strings.NewReplacer("``", ulquo, "''", urquo)
)
    30	
    31	// Escape comment text for HTML. If nice is set,
    32	// also turn `` into &ldquo; and '' into &rdquo;.
    33	func commentEscape(w io.Writer, text string, nice bool) {
    34		if nice {
    35			// In the first pass, we convert `` and '' into their unicode equivalents.
    36			// This prevents them from being escaped in HTMLEscape.
    37			text = convertQuotes(text)
    38			var buf bytes.Buffer
    39			template.HTMLEscape(&buf, []byte(text))
    40			// Now we convert the unicode quotes to their HTML escaped entities to maintain old behavior.
    41			// We need to use a temp buffer to read the string back and do the conversion,
    42			// otherwise HTMLEscape will escape & to &amp;
    43			htmlQuoteReplacer.WriteString(w, buf.String())
    44			return
    45		}
    46		template.HTMLEscape(w, []byte(text))
    47	}
    48	
    49	func convertQuotes(text string) string {
    50		return unicodeQuoteReplacer.Replace(text)
    51	}
    52	
// Regular expression sources used to find identifiers and URLs in
// comment text. They are combined into matchRx.
const (
	// Regexp for Go identifiers
	identRx = `[\pL_][\pL_0-9]*`

	// Regexp for URLs
	// Match parens, and check later for balance - see #5043, #22285
	// Match .,:;?! within path, but not at end - see #18139, #16565
	// This excludes some rare yet valid urls ending in common punctuation
	// in order to allow sentences ending in URLs.

	// protocol (required) e.g. http
	protoPart = `(https?|ftp|file|gopher|mailto|nntp)`
	// host (required) e.g. www.example.com or [::1]:8080
	hostPart = `([a-zA-Z0-9_@\-.\[\]:]+)`
	// path+query+fragment (optional) e.g. /path/index.html?q=foo#bar
	pathPart = `([.,:;?!]*[a-zA-Z0-9$'()*+&#=@~_/\-\[\]%])*`

	// urlRx is the complete URL pattern: protocol, "://", host, and
	// optional path.
	urlRx = protoPart + `://` + hostPart + pathPart
)
    72	
// matchRx matches either a URL or a Go identifier. The URL alternative is
// the first parenthesized group, so a submatch start index >= 0 for that
// group means the match is a URL; the identifier alternative is the last
// group.
var matchRx = lazyregexp.New(`(` + urlRx + `)|(` + identRx + `)`)
    74	
// HTML fragments written by emphasize and ToHTML. Each opening fragment
// has a matching closing fragment; html_aq and html_hq close the quoted
// attribute value begun by html_a and html_h respectively.
var (
	html_a      = []byte(`<a href="`)
	html_aq     = []byte(`">`)
	html_enda   = []byte("</a>")
	html_i      = []byte("<i>")
	html_endi   = []byte("</i>")
	html_p      = []byte("<p>\n")
	html_endp   = []byte("</p>\n")
	html_pre    = []byte("<pre>")
	html_endpre = []byte("</pre>\n")
	html_h      = []byte(`<h3 id="`)
	html_hq     = []byte(`">`)
	html_endh   = []byte("</h3>\n")
)
    89	
    90	// Emphasize and escape a line of text for HTML. URLs are converted into links;
    91	// if the URL also appears in the words map, the link is taken from the map (if
    92	// the corresponding map value is the empty string, the URL is not converted
    93	// into a link). Go identifiers that appear in the words map are italicized; if
    94	// the corresponding map value is not the empty string, it is considered a URL
    95	// and the word is converted into a link. If nice is set, the remaining text's
    96	// appearance is improved where it makes sense (e.g., `` is turned into &ldquo;
    97	// and '' into &rdquo;).
    98	func emphasize(w io.Writer, line string, words map[string]string, nice bool) {
    99		for {
   100			m := matchRx.FindStringSubmatchIndex(line)
   101			if m == nil {
   102				break
   103			}
   104			// m >= 6 (two parenthesized sub-regexps in matchRx, 1st one is urlRx)
   105	
   106			// write text before match
   107			commentEscape(w, line[0:m[0]], nice)
   108	
   109			// adjust match for URLs
   110			match := line[m[0]:m[1]]
   111			if strings.Contains(match, "://") {
   112				m0, m1 := m[0], m[1]
   113				for _, s := range []string{"()", "{}", "[]"} {
   114					open, close := s[:1], s[1:] // E.g., "(" and ")"
   115					// require opening parentheses before closing parentheses (#22285)
   116					if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) {
   117						m1 = m0 + i
   118						match = line[m0:m1]
   119					}
   120					// require balanced pairs of parentheses (#5043)
   121					for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ {
   122						m1 = strings.LastIndexAny(line[:m1], s)
   123						match = line[m0:m1]
   124					}
   125				}
   126				if m1 != m[1] {
   127					// redo matching with shortened line for correct indices
   128					m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)])
   129				}
   130			}
   131	
   132			// analyze match
   133			url := ""
   134			italics := false
   135			if words != nil {
   136				url, italics = words[match]
   137			}
   138			if m[2] >= 0 {
   139				// match against first parenthesized sub-regexp; must be match against urlRx
   140				if !italics {
   141					// no alternative URL in words list, use match instead
   142					url = match
   143				}
   144				italics = false // don't italicize URLs
   145			}
   146	
   147			// write match
   148			if len(url) > 0 {
   149				w.Write(html_a)
   150				template.HTMLEscape(w, []byte(url))
   151				w.Write(html_aq)
   152			}
   153			if italics {
   154				w.Write(html_i)
   155			}
   156			commentEscape(w, match, nice)
   157			if italics {
   158				w.Write(html_endi)
   159			}
   160			if len(url) > 0 {
   161				w.Write(html_enda)
   162			}
   163	
   164			// advance
   165			line = line[m[1]:]
   166		}
   167		commentEscape(w, line, nice)
   168	}
   169	
   170	func indentLen(s string) int {
   171		i := 0
   172		for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   173			i++
   174		}
   175		return i
   176	}
   177	
   178	func isBlank(s string) bool {
   179		return len(s) == 0 || (len(s) == 1 && s[0] == '\n')
   180	}
   181	
   182	func commonPrefix(a, b string) string {
   183		i := 0
   184		for i < len(a) && i < len(b) && a[i] == b[i] {
   185			i++
   186		}
   187		return a[0:i]
   188	}
   189	
   190	func unindent(block []string) {
   191		if len(block) == 0 {
   192			return
   193		}
   194	
   195		// compute maximum common white prefix
   196		prefix := block[0][0:indentLen(block[0])]
   197		for _, line := range block {
   198			if !isBlank(line) {
   199				prefix = commonPrefix(prefix, line[0:indentLen(line)])
   200			}
   201		}
   202		n := len(prefix)
   203	
   204		// remove
   205		for i, line := range block {
   206			if !isBlank(line) {
   207				block[i] = line[n:]
   208			}
   209		}
   210	}
   211	
   212	// heading returns the trimmed line if it passes as a section heading;
   213	// otherwise it returns the empty string.
   214	func heading(line string) string {
   215		line = strings.TrimSpace(line)
   216		if len(line) == 0 {
   217			return ""
   218		}
   219	
   220		// a heading must start with an uppercase letter
   221		r, _ := utf8.DecodeRuneInString(line)
   222		if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
   223			return ""
   224		}
   225	
   226		// it must end in a letter or digit:
   227		r, _ = utf8.DecodeLastRuneInString(line)
   228		if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
   229			return ""
   230		}
   231	
   232		// exclude lines with illegal characters. we allow "(),"
   233		if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
   234			return ""
   235		}
   236	
   237		// allow "'" for possessive "'s" only
   238		for b := line; ; {
   239			i := strings.IndexRune(b, '\'')
   240			if i < 0 {
   241				break
   242			}
   243			if i+1 >= len(b) || b[i+1] != 's' || (i+2 < len(b) && b[i+2] != ' ') {
   244				return "" // not followed by "s "
   245			}
   246			b = b[i+2:]
   247		}
   248	
   249		// allow "." when followed by non-space
   250		for b := line; ; {
   251			i := strings.IndexRune(b, '.')
   252			if i < 0 {
   253				break
   254			}
   255			if i+1 >= len(b) || b[i+1] == ' ' {
   256				return "" // not followed by non-space
   257			}
   258			b = b[i+1:]
   259		}
   260	
   261		return line
   262	}
   263	
// An op identifies the kind of a block.
type op int

const (
	opPara op = iota // paragraph; ToHTML wraps it in <p>...</p>
	opHead           // heading; ToHTML wraps it in <h3>...</h3>
	opPre            // preformatted lines; ToHTML wraps them in <pre>...</pre>
)
   271	
// A block is one unit of a comment as produced by blocks:
// a kind together with the lines that belong to it.
type block struct {
	op    op       // kind of block
	lines []string // lines making up the block
}
   276	
// nonAlphaNumRx matches any single character that is not an ASCII letter
// or digit. anchorID uses it to sanitize heading text.
var nonAlphaNumRx = lazyregexp.New(`[^a-zA-Z0-9]`)
   278	
   279	func anchorID(line string) string {
   280		// Add a "hdr-" prefix to avoid conflicting with IDs used for package symbols.
   281		return "hdr-" + nonAlphaNumRx.ReplaceAllString(line, "_")
   282	}
   283	
   284	// ToHTML converts comment text to formatted HTML.
   285	// The comment was prepared by DocReader,
   286	// so it is known not to have leading, trailing blank lines
   287	// nor to have trailing spaces at the end of lines.
   288	// The comment markers have already been removed.
   289	//
   290	// Each span of unindented non-blank lines is converted into
   291	// a single paragraph. There is one exception to the rule: a span that
   292	// consists of a single line, is followed by another paragraph span,
   293	// begins with a capital letter, and contains no punctuation
   294	// other than parentheses and commas is formatted as a heading.
   295	//
   296	// A span of indented lines is converted into a <pre> block,
   297	// with the common indent prefix removed.
   298	//
   299	// URLs in the comment text are converted into links; if the URL also appears
   300	// in the words map, the link is taken from the map (if the corresponding map
   301	// value is the empty string, the URL is not converted into a link).
   302	//
   303	// Go identifiers that appear in the words map are italicized; if the corresponding
   304	// map value is not the empty string, it is considered a URL and the word is converted
   305	// into a link.
   306	func ToHTML(w io.Writer, text string, words map[string]string) {
   307		for _, b := range blocks(text) {
   308			switch b.op {
   309			case opPara:
   310				w.Write(html_p)
   311				for _, line := range b.lines {
   312					emphasize(w, line, words, true)
   313				}
   314				w.Write(html_endp)
   315			case opHead:
   316				w.Write(html_h)
   317				id := ""
   318				for _, line := range b.lines {
   319					if id == "" {
   320						id = anchorID(line)
   321						w.Write([]byte(id))
   322						w.Write(html_hq)
   323					}
   324					commentEscape(w, line, true)
   325				}
   326				if id == "" {
   327					w.Write(html_hq)
   328				}
   329				w.Write(html_endh)
   330			case opPre:
   331				w.Write(html_pre)
   332				for _, line := range b.lines {
   333					emphasize(w, line, nil, false)
   334				}
   335				w.Write(html_endpre)
   336			}
   337		}
   338	}
   339	
// blocks splits text into a sequence of paragraph, heading, and
// preformatted blocks. The text is split into lines (each line keeps its
// trailing newline) and the indentation common to all non-blank lines is
// removed before the lines are classified.
func blocks(text string) []block {
	var (
		// out collects the finished blocks.
		out []block
		// para holds the lines of the paragraph being accumulated;
		// it is nil while no paragraph is open.
		para []string

		// lastWasBlank is set when a blank line is seen and cleared
		// only when a line is added to a paragraph.
		lastWasBlank = false
		// lastWasHeading is set when a heading block is emitted and
		// cleared by a preformatted block or a paragraph line.
		lastWasHeading = false
	)

	// close emits the open paragraph, if any, as an opPara block.
	close := func() {
		if para != nil {
			out = append(out, block{opPara, para})
			para = nil
		}
	}

	lines := strings.SplitAfter(text, "\n")
	unindent(lines)
	for i := 0; i < len(lines); {
		line := lines[i]
		if isBlank(line) {
			// close paragraph
			close()
			i++
			lastWasBlank = true
			continue
		}
		if indentLen(line) > 0 {
			// close paragraph
			close()

			// count indented or blank lines
			j := i + 1
			for j < len(lines) && (isBlank(lines[j]) || indentLen(lines[j]) > 0) {
				j++
			}
			// but not trailing blank lines
			for j > i && isBlank(lines[j-1]) {
				j--
			}
			pre := lines[i:j]
			i = j

			// remove the indentation common to the preformatted lines
			unindent(pre)

			// put those lines in a pre block
			out = append(out, block{opPre, pre})
			lastWasHeading = false
			continue
		}

		if lastWasBlank && !lastWasHeading && i+2 < len(lines) &&
			isBlank(lines[i+1]) && !isBlank(lines[i+2]) && indentLen(lines[i+2]) == 0 {
			// current line is non-blank, surrounded by blank lines
			// and the next non-blank line is not indented: this
			// might be a heading.
			if head := heading(line); head != "" {
				close()
				out = append(out, block{opHead, []string{head}})
				// skip the heading line and the blank line after it
				i += 2
				lastWasHeading = true
				continue
			}
		}

		// open paragraph
		lastWasBlank = false
		lastWasHeading = false
		para = append(para, lines[i])
		i++
	}
	// emit a paragraph still open at the end of the text
	close()

	return out
}
   415	
   416	// ToText prepares comment text for presentation in textual output.
   417	// It wraps paragraphs of text to width or fewer Unicode code points
   418	// and then prefixes each line with the indent. In preformatted sections
   419	// (such as program text), it prefixes each non-blank line with preIndent.
   420	func ToText(w io.Writer, text string, indent, preIndent string, width int) {
   421		l := lineWrapper{
   422			out:    w,
   423			width:  width,
   424			indent: indent,
   425		}
   426		for _, b := range blocks(text) {
   427			switch b.op {
   428			case opPara:
   429				// l.write will add leading newline if required
   430				for _, line := range b.lines {
   431					line = convertQuotes(line)
   432					l.write(line)
   433				}
   434				l.flush()
   435			case opHead:
   436				w.Write(nl)
   437				for _, line := range b.lines {
   438					line = convertQuotes(line)
   439					l.write(line + "\n")
   440				}
   441				l.flush()
   442			case opPre:
   443				w.Write(nl)
   444				for _, line := range b.lines {
   445					if isBlank(line) {
   446						w.Write([]byte("\n"))
   447					} else {
   448						w.Write([]byte(preIndent))
   449						w.Write([]byte(line))
   450					}
   451				}
   452			}
   453		}
   454	}
   455	
   456	type lineWrapper struct {
   457		out       io.Writer
   458		printed   bool
   459		width     int
   460		indent    string
   461		n         int
   462		pendSpace int
   463	}
   464	
   465	var nl = []byte("\n")
   466	var space = []byte(" ")
   467	var prefix = []byte("// ")
   468	
   469	func (l *lineWrapper) write(text string) {
   470		if l.n == 0 && l.printed {
   471			l.out.Write(nl) // blank line before new paragraph
   472		}
   473		l.printed = true
   474	
   475		needsPrefix := false
   476		isComment := strings.HasPrefix(text, "//")
   477		for _, f := range strings.Fields(text) {
   478			w := utf8.RuneCountInString(f)
   479			// wrap if line is too long
   480			if l.n > 0 && l.n+l.pendSpace+w > l.width {
   481				l.out.Write(nl)
   482				l.n = 0
   483				l.pendSpace = 0
   484				needsPrefix = isComment
   485			}
   486			if l.n == 0 {
   487				l.out.Write([]byte(l.indent))
   488			}
   489			if needsPrefix {
   490				l.out.Write(prefix)
   491				needsPrefix = false
   492			}
   493			l.out.Write(space[:l.pendSpace])
   494			l.out.Write([]byte(f))
   495			l.n += l.pendSpace + w
   496			l.pendSpace = 1
   497		}
   498	}
   499	
   500	func (l *lineWrapper) flush() {
   501		if l.n == 0 {
   502			return
   503		}
   504		l.out.Write(nl)
   505		l.pendSpace = 0
   506		l.n = 0
   507	}
   508
View as plain text