...

Source file src/strings/search.go

     1	// Copyright 2012 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package strings
     6	
     7	// stringFinder efficiently finds strings in a source text. It's implemented
     8	// using the Boyer-Moore string search algorithm:
     9	// https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
    10	// https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
    11	// document uses 1-based indexing)
    12	type stringFinder struct {
    13		// pattern is the string that we are searching for in the text.
    14		pattern string
    15	
    16		// badCharSkip[b] contains the distance between the last byte of pattern
    17		// and the rightmost occurrence of b in pattern. If b is not in pattern,
    18		// badCharSkip[b] is len(pattern).
    19		//
    20		// Whenever a mismatch is found with byte b in the text, we can safely
    21		// shift the matching frame at least badCharSkip[b] until the next time
    22		// the matching char could be in alignment.
    23		badCharSkip [256]int
    24	
    25		// goodSuffixSkip[i] defines how far we can shift the matching frame given
    26		// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
    27		// not. There are two cases to consider:
    28		//
    29		// 1. The matched suffix occurs elsewhere in pattern (with a different
    30		// byte preceding it that we might possibly match). In this case, we can
    31		// shift the matching frame to align with the next suffix chunk. For
    32		// example, the pattern "mississi" has the suffix "issi" next occurring
    33		// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
    34		// shift+len(suffix) == 3+4 == 7.
    35		//
    36		// 2. If the matched suffix does not occur elsewhere in pattern, then the
    37		// matching frame may share part of its prefix with the end of the
    38		// matching suffix. In this case, goodSuffixSkip[i] will contain how far
    39		// to shift the frame to align this portion of the prefix to the
    40		// suffix. For example, in the pattern "abcxxxabc", when the first
    41		// mismatch from the back is found to be in position 3, the matching
    42		// suffix "xxabc" is not found elsewhere in the pattern. However, its
    43		// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
    44		// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
    45		goodSuffixSkip []int
    46	}
    47	
    48	func makeStringFinder(pattern string) *stringFinder {
    49		f := &stringFinder{
    50			pattern:        pattern,
    51			goodSuffixSkip: make([]int, len(pattern)),
    52		}
    53		// last is the index of the last character in the pattern.
    54		last := len(pattern) - 1
    55	
    56		// Build bad character table.
    57		// Bytes not in the pattern can skip one pattern's length.
    58		for i := range f.badCharSkip {
    59			f.badCharSkip[i] = len(pattern)
    60		}
    61		// The loop condition is < instead of <= so that the last byte does not
    62		// have a zero distance to itself. Finding this byte out of place implies
    63		// that it is not in the last position.
    64		for i := 0; i < last; i++ {
    65			f.badCharSkip[pattern[i]] = last - i
    66		}
    67	
    68		// Build good suffix table.
    69		// First pass: set each value to the next index which starts a prefix of
    70		// pattern.
    71		lastPrefix := last
    72		for i := last; i >= 0; i-- {
    73			if HasPrefix(pattern, pattern[i+1:]) {
    74				lastPrefix = i + 1
    75			}
    76			// lastPrefix is the shift, and (last-i) is len(suffix).
    77			f.goodSuffixSkip[i] = lastPrefix + last - i
    78		}
    79		// Second pass: find repeats of pattern's suffix starting from the front.
    80		for i := 0; i < last; i++ {
    81			lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
    82			if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
    83				// (last-i) is the shift, and lenSuffix is len(suffix).
    84				f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
    85			}
    86		}
    87	
    88		return f
    89	}
    90	
    91	func longestCommonSuffix(a, b string) (i int) {
    92		for ; i < len(a) && i < len(b); i++ {
    93			if a[len(a)-1-i] != b[len(b)-1-i] {
    94				break
    95			}
    96		}
    97		return
    98	}
    99	
   100	// next returns the index in text of the first occurrence of the pattern. If
   101	// the pattern is not found, it returns -1.
   102	func (f *stringFinder) next(text string) int {
   103		i := len(f.pattern) - 1
   104		for i < len(text) {
   105			// Compare backwards from the end until the first unmatching character.
   106			j := len(f.pattern) - 1
   107			for j >= 0 && text[i] == f.pattern[j] {
   108				i--
   109				j--
   110			}
   111			if j < 0 {
   112				return i + 1 // match
   113			}
   114			i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
   115		}
   116		return -1
   117	}
   118	
   119	func max(a, b int) int {
   120		if a > b {
   121			return a
   122		}
   123		return b
   124	}
   125	

View as plain text