...

Source file src/index/suffixarray/sais.go

     1	// Copyright 2019 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Suffix array construction by induced sorting (SAIS).
     6	// See Ge Nong, Sen Zhang, and Wai Hong Chen,
     7	// "Two Efficient Algorithms for Linear Time Suffix Array Construction",
     8	// especially section 3 (https://ieeexplore.ieee.org/document/5582081).
     9	// See also http://zork.net/~st/jottings/sais.html.
    10	//
    11	// With optimizations inspired by Yuta Mori's sais-lite
    12	// (https://sites.google.com/site/yuta256/sais).
    13	//
    14	// And with other new optimizations.
    15	
    16	// Many of these functions are parameterized by the sizes of
    17	// the types they operate on. The generator gen.go makes
    18	// copies of these functions for use with other sizes.
    19	// Specifically:
    20	//
    21	// - A function with a name ending in _8_32 takes []byte and []int32 arguments
    22	//   and is duplicated into _32_32, _8_64, and _64_64 forms.
    23	//   The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64.
    24	//   Any lines in the function body that contain the text "byte-only" or "256"
    25	//   are stripped when creating _32_32 and _64_64 forms.
    26	//   (Those lines are typically 8-bit-specific optimizations.)
    27	//
    28	// - A function with a name ending only in _32 operates on []int32
    29	//   and is duplicated into a _64 form. (Note that it may still take a []byte,
    30	//   but there is no need for a version of the function in which the []byte
    31	//   is widened to a full integer array.)
    32	
    33	// The overall runtime of this code is linear in the input size:
    34	// it runs a sequence of linear passes to reduce the problem to
    35	// a subproblem at most half as big, invokes itself recursively,
    36	// and then runs a sequence of linear passes to turn the answer
    37	// for the subproblem into the answer for the original problem.
    38	// This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N).
    39	//
    40	// The outline of the code, with the forward and backward scans
    41	// through O(N)-sized arrays called out, is:
    42	//
    43	// sais_I_N
    44	//	placeLMS_I_B
    45	//		bucketMax_I_B
    46	//			freq_I_B
    47	//				<scan +text> (1)
    48	//			<scan +freq> (2)
    49	//		<scan -text, random bucket> (3)
    50	//	induceSubL_I_B
    51	//		bucketMin_I_B
    52	//			freq_I_B
    53	//				<scan +text, often optimized away> (4)
    54	//			<scan +freq> (5)
    55	//		<scan +sa, random text, random bucket> (6)
    56	//	induceSubS_I_B
    57	//		bucketMax_I_B
    58	//			freq_I_B
    59	//				<scan +text, often optimized away> (7)
    60	//			<scan +freq> (8)
    61	//		<scan -sa, random text, random bucket> (9)
    62	//	assignID_I_B
    63	//		<scan +sa, random text substrings> (10)
    64	//	map_B
    65	//		<scan -sa> (11)
    66	//	recurse_B
    67	//		(recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller)
    68	//	unmap_I_B
    69	//		<scan -text> (12)
    70	//		<scan +sa> (13)
    71	//	expand_I_B
    72	//		bucketMax_I_B
    73	//			freq_I_B
    74	//				<scan +text, often optimized away> (14)
    75	//			<scan +freq> (15)
    76	//		<scan -sa, random text, random bucket> (16)
    77	//	induceL_I_B
    78	//		bucketMin_I_B
    79	//			freq_I_B
    80	//				<scan +text, often optimized away> (17)
    81	//			<scan +freq> (18)
    82	//		<scan +sa, random text, random bucket> (19)
    83	//	induceS_I_B
    84	//		bucketMax_I_B
    85	//			freq_I_B
    86	//				<scan +text, often optimized away> (20)
    87	//			<scan +freq> (21)
    88	//		<scan -sa, random text, random bucket> (22)
    89	//
    90	// Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B).
    91	//
    92	// The outline shows there are in general 22 scans through
    93	// O(N)-sized arrays for a given level of the recursion.
    94	// In the top level, operating on 8-bit input text,
    95	// the six freq scans are fixed size (256) instead of potentially
    96	// input-sized. Also, the frequency is counted once and cached
    97	// whenever there is room to do so (there is nearly always room in general,
    98	// and always room at the top level), which eliminates all but
    99	// the first freq_I_B text scans (that is, 5 of the 6).
   100	// So the top level of the recursion only does 22 - 6 - 5 = 11
   101	// input-sized scans and a typical level does 16 scans.
   102	//
   103	// The linear scans do not cost anywhere near as much as
   104	// the random accesses to the text made during a few of
   105	// the scans (specifically #6, #9, #16, #19, #22 marked above).
   106	// In real texts, there is not much but some locality to
   107	// the accesses, due to the repetitive structure of the text
   108	// (the same reason Burrows-Wheeler compression is so effective).
   109	// For random inputs, there is no locality, which makes those
   110	// accesses even more expensive, especially once the text
   111	// no longer fits in cache.
   112	// For example, running on 50 MB of Go source code, induceSubL_8_32
   113	// (which runs only once, at the top level of the recursion)
   114	// takes 0.44s, while on 50 MB of random input, it takes 2.55s.
   115	// Nearly all the relative slowdown is explained by the text access:
   116	//
   117	//		c0, c1 := text[k-1], text[k]
   118	//
   119	// That line runs for 0.23s on the Go text and 2.02s on random text.
   120	
   121	//go:generate go run gen.go
   122	
   123	package suffixarray
   124	
   125	// text_32 returns the suffix array for the input text.
   126	// It requires that len(text) fit in an int32
   127	// and that the caller zero sa.
   128	func text_32(text []byte, sa []int32) {
   129		if int(int32(len(text))) != len(text) || len(text) != len(sa) {
   130			panic("suffixarray: misuse of text_32")
   131		}
   132		sais_8_32(text, 256, sa, make([]int32, 2*256))
   133	}
   134	
   135	// sais_8_32 computes the suffix array of text.
   136	// The text must contain only values in [0, textMax).
   137	// The suffix array is stored in sa, which the caller
   138	// must ensure is already zeroed.
   139	// The caller must also provide temporary space tmp
   140	// with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax
   141	// then the algorithm runs a little faster.
   142	// If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return.
   143	func sais_8_32(text []byte, textMax int, sa, tmp []int32) {
   144		if len(sa) != len(text) || len(tmp) < int(textMax) {
   145			panic("suffixarray: misuse of sais_8_32")
   146		}
   147	
   148		// Trivial base cases. Sorting 0 or 1 things is easy.
   149		if len(text) == 0 {
   150			return
   151		}
   152		if len(text) == 1 {
   153			sa[0] = 0
   154			return
   155		}
   156	
   157		// Establish slices indexed by text character
   158		// holding character frequency and bucket-sort offsets.
   159		// If there's only enough tmp for one slice,
   160		// we make it the bucket offsets and recompute
   161		// the character frequency each time we need it.
   162		var freq, bucket []int32
   163		if len(tmp) >= 2*textMax {
   164			freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   165			freq[0] = -1 // mark as uninitialized
   166		} else {
   167			freq, bucket = nil, tmp[:textMax]
   168		}
   169	
   170		// The SAIS algorithm.
   171		// Each of these calls makes one scan through sa.
   172		// See the individual functions for documentation
   173		// about each's role in the algorithm.
   174		numLMS := placeLMS_8_32(text, sa, freq, bucket)
   175		if numLMS <= 1 {
   176			// 0 or 1 items are already sorted. Do nothing.
   177		} else {
   178			induceSubL_8_32(text, sa, freq, bucket)
   179			induceSubS_8_32(text, sa, freq, bucket)
   180			length_8_32(text, sa, numLMS)
   181			maxID := assignID_8_32(text, sa, numLMS)
   182			if maxID < numLMS {
   183				map_32(sa, numLMS)
   184				recurse_32(sa, tmp, numLMS, maxID)
   185				unmap_8_32(text, sa, numLMS)
   186			} else {
   187				// If maxID == numLMS, then each LMS-substring
   188				// is unique, so the relative ordering of two LMS-suffixes
   189				// is determined by just the leading LMS-substring.
   190				// That is, the LMS-suffix sort order matches the
   191				// (simpler) LMS-substring sort order.
   192				// Copy the original LMS-substring order into the
   193				// suffix array destination.
   194				copy(sa, sa[len(sa)-numLMS:])
   195			}
   196			expand_8_32(text, freq, bucket, sa, numLMS)
   197		}
   198		induceL_8_32(text, sa, freq, bucket)
   199		induceS_8_32(text, sa, freq, bucket)
   200	
   201		// Mark for caller that we overwrote tmp.
   202		tmp[0] = -1
   203	}
   204	
   205	// freq_8_32 returns the character frequencies
   206	// for text, as a slice indexed by character value.
   207	// If freq is nil, freq_8_32 uses and returns bucket.
   208	// If freq is non-nil, freq_8_32 assumes that freq[0] >= 0
   209	// means the frequencies are already computed.
   210	// If the frequency data is overwritten or uninitialized,
   211	// the caller must set freq[0] = -1 to force recomputation
   212	// the next time it is needed.
   213	func freq_8_32(text []byte, freq, bucket []int32) []int32 {
   214		if freq != nil && freq[0] >= 0 {
   215			return freq // already computed
   216		}
   217		if freq == nil {
   218			freq = bucket
   219		}
   220	
   221		freq = freq[:256] // eliminate bounds check for freq[c] below
   222		for i := range freq {
   223			freq[i] = 0
   224		}
   225		for _, c := range text {
   226			freq[c]++
   227		}
   228		return freq
   229	}
   230	
   231	// bucketMin_8_32 stores into bucket[c] the minimum index
   232	// in the bucket for character c in a bucket-sort of text.
   233	func bucketMin_8_32(text []byte, freq, bucket []int32) {
   234		freq = freq_8_32(text, freq, bucket)
   235		freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   236		bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   237		total := int32(0)
   238		for i, n := range freq {
   239			bucket[i] = total
   240			total += n
   241		}
   242	}
   243	
   244	// bucketMax_8_32 stores into bucket[c] the maximum index
   245	// in the bucket for character c in a bucket-sort of text.
   246	// The bucket indexes for c are [min, max).
   247	// That is, max is one past the final index in that bucket.
   248	func bucketMax_8_32(text []byte, freq, bucket []int32) {
   249		freq = freq_8_32(text, freq, bucket)
   250		freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   251		bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   252		total := int32(0)
   253		for i, n := range freq {
   254			total += n
   255			bucket[i] = total
   256		}
   257	}
   258	
   259	// The SAIS algorithm proceeds in a sequence of scans through sa.
   260	// Each of the following functions implements one scan,
   261	// and the functions appear here in the order they execute in the algorithm.
   262	
   263	// placeLMS_8_32 places into sa the indexes of the
   264	// final characters of the LMS substrings of text,
   265	// sorted into the rightmost ends of their correct buckets
   266	// in the suffix array.
   267	//
   268	// The imaginary sentinel character at the end of the text
   269	// is the final character of the final LMS substring, but there
   270	// is no bucket for the imaginary sentinel character,
   271	// which has a smaller value than any real character.
   272	// The caller must therefore pretend that sa[-1] == len(text).
   273	//
   274	// The text indexes of LMS-substring characters are always ≥ 1
   275	// (the first LMS-substring must be preceded by one or more L-type
   276	// characters that are not part of any LMS-substring),
   277	// so using 0 as a “not present” suffix array entry is safe,
   278	// both in this function and in most later functions
   279	// (until induceL_8_32 below).
   280	func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int {
   281		bucketMax_8_32(text, freq, bucket)
   282	
   283		numLMS := 0
   284		lastB := int32(-1)
   285		bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
   286	
   287		// The next stanza of code (until the blank line) loop backward
   288		// over text, stopping to execute a code body at each position i
   289		// such that text[i] is an L-character and text[i+1] is an S-character.
   290		// That is, i+1 is the position of the start of an LMS-substring.
   291		// These could be hoisted out into a function with a callback,
   292		// but at a significant speed cost. Instead, we just write these
   293		// seven lines a few times in this source file. The copies below
   294		// refer back to the pattern established by this original as the
   295		// "LMS-substring iterator".
   296		//
   297		// In every scan through the text, c0, c1 are successive characters of text.
   298		// In this backward scan, c0 == text[i] and c1 == text[i+1].
   299		// By scanning backward, we can keep track of whether the current
   300		// position is type-S or type-L according to the usual definition:
   301		//
   302		//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   303		//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   304		//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   305		//
   306		// The backward scan lets us maintain the current type,
   307		// update it when we see c0 != c1, and otherwise leave it alone.
   308		// We want to identify all S positions with a preceding L.
   309		// Position len(text) is one such position by definition, but we have
   310		// nowhere to write it down, so we eliminate it by untruthfully
   311		// setting isTypeS = false at the start of the loop.
   312		c0, c1, isTypeS := byte(0), byte(0), false
   313		for i := len(text) - 1; i >= 0; i-- {
   314			c0, c1 = text[i], c0
   315			if c0 < c1 {
   316				isTypeS = true
   317			} else if c0 > c1 && isTypeS {
   318				isTypeS = false
   319	
   320				// Bucket the index i+1 for the start of an LMS-substring.
   321				b := bucket[c1] - 1
   322				bucket[c1] = b
   323				sa[b] = int32(i + 1)
   324				lastB = b
   325				numLMS++
   326			}
   327		}
   328	
   329		// We recorded the LMS-substring starts but really want the ends.
   330		// Luckily, with two differences, the start indexes and the end indexes are the same.
   331		// The first difference is that the rightmost LMS-substring's end index is len(text),
   332		// so the caller must pretend that sa[-1] == len(text), as noted above.
   333		// The second difference is that the first leftmost LMS-substring start index
   334		// does not end an earlier LMS-substring, so as an optimization we can omit
   335		// that leftmost LMS-substring start index (the last one we wrote).
   336		//
   337		// Exception: if numLMS <= 1, the caller is not going to bother with
   338		// the recursion at all and will treat the result as containing LMS-substring starts.
   339		// In that case, we don't remove the final entry.
   340		if numLMS > 1 {
   341			sa[lastB] = 0
   342		}
   343		return numLMS
   344	}
   345	
   346	// induceSubL_8_32 inserts the L-type text indexes of LMS-substrings
   347	// into sa, assuming that the final characters of the LMS-substrings
   348	// are already inserted into sa, sorted by final character, and at the
   349	// right (not left) end of the corresponding character bucket.
   350	// Each LMS-substring has the form (as a regexp) /S+L+S/:
   351	// one or more S-type, one or more L-type, final S-type.
   352	// induceSubL_8_32 leaves behind only the leftmost L-type text
   353	// index for each LMS-substring. That is, it removes the final S-type
   354	// indexes that are present on entry, and it inserts but then removes
   355	// the interior L-type indexes too.
   356	// (Only the leftmost L-type index is needed by induceSubS_8_32.)
   357	func induceSubL_8_32(text []byte, sa, freq, bucket []int32) {
   358		// Initialize positions for left side of character buckets.
   359		bucketMin_8_32(text, freq, bucket)
   360		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   361	
   362		// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   363		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   364		// Because j-1 is type L, inserting it into sa now will sort it correctly.
   365		// But we want to distinguish a j-1 with j-2 of type L from type S.
   366		// We can process the former but want to leave the latter for the caller.
   367		// We record the difference by negating j-1 if it is preceded by type S.
   368		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   369		// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   370		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   371		// and so on, in sorted but not necessarily adjacent order, until it finds
   372		// one preceded by an index of type S, at which point it must stop.
   373		//
   374		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   375		// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   376		// only the indexes of the leftmost L-type indexes for each LMS-substring.
   377		//
   378		// The suffix array sa therefore serves simultaneously as input, output,
   379		// and a miraculously well-tailored work queue.
   380	
   381		// placeLMS_8_32 left out the implicit entry sa[-1] == len(text),
   382		// corresponding to the identified type-L index len(text)-1.
   383		// Process it before the left-to-right scan of sa proper.
   384		// See body in loop for commentary.
   385		k := len(text) - 1
   386		c0, c1 := text[k-1], text[k]
   387		if c0 < c1 {
   388			k = -k
   389		}
   390	
   391		// Cache recently used bucket index:
   392		// we're processing suffixes in sorted order
   393		// and accessing buckets indexed by the
   394		// byte before the sorted order, which still
   395		// has very good locality.
   396		// Invariant: b is cached, possibly dirty copy of bucket[cB].
   397		cB := c1
   398		b := bucket[cB]
   399		sa[b] = int32(k)
   400		b++
   401	
   402		for i := 0; i < len(sa); i++ {
   403			j := int(sa[i])
   404			if j == 0 {
   405				// Skip empty entry.
   406				continue
   407			}
   408			if j < 0 {
   409				// Leave discovered type-S index for caller.
   410				sa[i] = int32(-j)
   411				continue
   412			}
   413			sa[i] = 0
   414	
   415			// Index j was on work queue, meaning k := j-1 is L-type,
   416			// so we can now place k correctly into sa.
   417			// If k-1 is L-type, queue k for processing later in this loop.
   418			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   419			k := j - 1
   420			c0, c1 := text[k-1], text[k]
   421			if c0 < c1 {
   422				k = -k
   423			}
   424	
   425			if cB != c1 {
   426				bucket[cB] = b
   427				cB = c1
   428				b = bucket[cB]
   429			}
   430			sa[b] = int32(k)
   431			b++
   432		}
   433	}
   434	
   435	// induceSubS_8_32 inserts the S-type text indexes of LMS-substrings
   436	// into sa, assuming that the leftmost L-type text indexes are already
   437	// inserted into sa, sorted by LMS-substring suffix, and at the
   438	// left end of the corresponding character bucket.
   439	// Each LMS-substring has the form (as a regexp) /S+L+S/:
   440	// one or more S-type, one or more L-type, final S-type.
   441	// induceSubS_8_32 leaves behind only the leftmost S-type text
   442	// index for each LMS-substring, in sorted order, at the right end of sa.
   443	// That is, it removes the L-type indexes that are present on entry,
   444	// and it inserts but then removes the interior S-type indexes too,
   445	// leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:].
   446	// (Only the LMS-substring start indexes are processed by the recursion.)
   447	func induceSubS_8_32(text []byte, sa, freq, bucket []int32) {
   448		// Initialize positions for right side of character buckets.
   449		bucketMax_8_32(text, freq, bucket)
   450		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   451	
   452		// Analogous to induceSubL_8_32 above,
   453		// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   454		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   455		// Because j-1 is type S, inserting it into sa now will sort it correctly.
   456		// But we want to distinguish a j-1 with j-2 of type S from type L.
   457		// We can process the former but want to leave the latter for the caller.
   458		// We record the difference by negating j-1 if it is preceded by type L.
   459		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   460		// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   461		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   462		// and so on, in sorted but not necessarily adjacent order, until it finds
   463		// one preceded by an index of type L, at which point it must stop.
   464		// That index (preceded by one of type L) is an LMS-substring start.
   465		//
   466		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   467		// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   468		// so that the loop finishes with the top of sa containing exactly
   469		// the LMS-substring start indexes, sorted by LMS-substring.
   470	
   471		// Cache recently used bucket index:
   472		cB := byte(0)
   473		b := bucket[cB]
   474	
   475		top := len(sa)
   476		for i := len(sa) - 1; i >= 0; i-- {
   477			j := int(sa[i])
   478			if j == 0 {
   479				// Skip empty entry.
   480				continue
   481			}
   482			sa[i] = 0
   483			if j < 0 {
   484				// Leave discovered LMS-substring start index for caller.
   485				top--
   486				sa[top] = int32(-j)
   487				continue
   488			}
   489	
   490			// Index j was on work queue, meaning k := j-1 is S-type,
   491			// so we can now place k correctly into sa.
   492			// If k-1 is S-type, queue k for processing later in this loop.
   493			// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   494			k := j - 1
   495			c1 := text[k]
   496			c0 := text[k-1]
   497			if c0 > c1 {
   498				k = -k
   499			}
   500	
   501			if cB != c1 {
   502				bucket[cB] = b
   503				cB = c1
   504				b = bucket[cB]
   505			}
   506			b--
   507			sa[b] = int32(k)
   508		}
   509	}
   510	
   511	// length_8_32 computes and records the length of each LMS-substring in text.
   512	// The length of the LMS-substring at index j is stored at sa[j/2],
   513	// avoiding the LMS-substring indexes already stored in the top half of sa.
   514	// (If index j is an LMS-substring start, then index j-1 is type L and cannot be.)
   515	// There are two exceptions, made for optimizations in name_8_32 below.
   516	//
   517	// First, the final LMS-substring is recorded as having length 0, which is otherwise
   518	// impossible, instead of giving it a length that includes the implicit sentinel.
   519	// This ensures the final LMS-substring has length unequal to all others
   520	// and therefore can be detected as different without text comparison
   521	// (it is unequal because it is the only one that ends in the implicit sentinel,
   522	// and the text comparison would be problematic since the implicit sentinel
   523	// is not actually present at text[len(text)]).
   524	//
   525	// Second, to avoid text comparison entirely, if an LMS-substring is very short,
   526	// sa[j/2] records its actual text instead of its length, so that if two such
   527	// substrings have matching “length,” the text need not be read at all.
   528	// The definition of “very short” is that the text bytes must pack into an uint32,
   529	// and the unsigned encoding e must be ≥ len(text), so that it can be
   530	// distinguished from a valid length.
   531	func length_8_32(text []byte, sa []int32, numLMS int) {
   532		end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   533	
   534		// The encoding of N text bytes into a “length” word
   535		// adds 1 to each byte, packs them into the bottom
   536		// N*8 bits of a word, and then bitwise inverts the result.
   537		// That is, the text sequence A B C (hex 41 42 43)
   538		// encodes as ^uint32(0x42_43_44).
   539		// LMS-substrings can never start or end with 0xFF.
   540		// Adding 1 ensures the encoded byte sequence never
   541		// starts or ends with 0x00, so that present bytes can be
   542		// distinguished from zero-padding in the top bits,
   543		// so the length need not be separately encoded.
   544		// Inverting the bytes increases the chance that a
   545		// 4-byte encoding will still be ≥ len(text).
   546		// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
   547		// then the high bit of the inversion will be set,
   548		// making it clearly not a valid length (it would be a negative one).
   549		//
   550		// cx holds the pre-inverted encoding (the packed incremented bytes).
   551		cx := uint32(0) // byte-only
   552	
   553		// This stanza (until the blank line) is the "LMS-substring iterator",
   554		// described in placeLMS_8_32 above, with one line added to maintain cx.
   555		c0, c1, isTypeS := byte(0), byte(0), false
   556		for i := len(text) - 1; i >= 0; i-- {
   557			c0, c1 = text[i], c0
   558			cx = cx<<8 | uint32(c1+1) // byte-only
   559			if c0 < c1 {
   560				isTypeS = true
   561			} else if c0 > c1 && isTypeS {
   562				isTypeS = false
   563	
   564				// Index j = i+1 is the start of an LMS-substring.
   565				// Compute length or encoded text to store in sa[j/2].
   566				j := i + 1
   567				var code int32
   568				if end == 0 {
   569					code = 0
   570				} else {
   571					code = int32(end - j)
   572					if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only
   573						code = int32(^cx) // byte-only
   574					} // byte-only
   575				}
   576				sa[j>>1] = code
   577				end = j + 1
   578				cx = uint32(c1 + 1) // byte-only
   579			}
   580		}
   581	}
   582	
   583	// assignID_8_32 assigns a dense ID numbering to the
   584	// set of LMS-substrings respecting string ordering and equality,
   585	// returning the maximum assigned ID.
   586	// For example given the input "ababab", the LMS-substrings
   587	// are "aba", "aba", and "ab", renumbered as 2 2 1.
   588	// sa[len(sa)-numLMS:] holds the LMS-substring indexes
   589	// sorted in string order, so to assign numbers we can
   590	// consider each in turn, removing adjacent duplicates.
   591	// The new ID for the LMS-substring at index j is written to sa[j/2],
   592	// overwriting the length previously stored there (by length_8_32 above).
   593	func assignID_8_32(text []byte, sa []int32, numLMS int) int {
   594		id := 0
   595		lastLen := int32(-1) // impossible
   596		lastPos := int32(0)
   597		for _, j := range sa[len(sa)-numLMS:] {
   598			// Is the LMS-substring at index j new, or is it the same as the last one we saw?
   599			n := sa[j/2]
   600			if n != lastLen {
   601				goto New
   602			}
   603			if uint32(n) >= uint32(len(text)) {
   604				// “Length” is really encoded full text, and they match.
   605				goto Same
   606			}
   607			{
   608				// Compare actual texts.
   609				n := int(n)
   610				this := text[j:][:n]
   611				last := text[lastPos:][:n]
   612				for i := 0; i < n; i++ {
   613					if this[i] != last[i] {
   614						goto New
   615					}
   616				}
   617				goto Same
   618			}
   619		New:
   620			id++
   621			lastPos = j
   622			lastLen = n
   623		Same:
   624			sa[j/2] = int32(id)
   625		}
   626		return id
   627	}
   628	
   629	// map_32 maps the LMS-substrings in text to their new IDs,
   630	// producing the subproblem for the recursion.
   631	// The mapping itself was mostly applied by assignID_8_32:
   632	// sa[i] is either 0, the ID for the LMS-substring at index 2*i,
   633	// or the ID for the LMS-substring at index 2*i+1.
   634	// To produce the subproblem we need only remove the zeros
   635	// and change ID into ID-1 (our IDs start at 1, but text chars start at 0).
   636	//
   637	// map_32 packs the result, which is the input to the recursion,
   638	// into the top of sa, so that the recursion result can be stored
   639	// in the bottom of sa, which sets up for expand_8_32 well.
   640	func map_32(sa []int32, numLMS int) {
   641		w := len(sa)
   642		for i := len(sa) / 2; i >= 0; i-- {
   643			j := sa[i]
   644			if j > 0 {
   645				w--
   646				sa[w] = j - 1
   647			}
   648		}
   649	}
   650	
   651	// recurse_32 calls sais_32 recursively to solve the subproblem we've built.
   652	// The subproblem is at the right end of sa, the suffix array result will be
   653	// written at the left end of sa, and the middle of sa is available for use as
   654	// temporary frequency and bucket storage.
   655	func recurse_32(sa, oldTmp []int32, numLMS, maxID int) {
   656		dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
   657	
   658		// Set up temporary space for recursive call.
   659		// We must pass sais_32 a tmp buffer wiith at least maxID entries.
   660		//
   661		// The subproblem is guaranteed to have length at most len(sa)/2,
   662		// so that sa can hold both the subproblem and its suffix array.
   663		// Nearly all the time, however, the subproblem has length < len(sa)/3,
   664		// in which case there is a subproblem-sized middle of sa that
   665		// we can reuse for temporary space (saTmp).
   666		// When recurse_32 is called from sais_8_32, oldTmp is length 512
   667		// (from text_32), and saTmp will typically be much larger, so we'll use saTmp.
   668		// When deeper recursions come back to recurse_32, now oldTmp is
   669		// the saTmp from the top-most recursion, it is typically larger than
   670		// the current saTmp (because the current sa gets smaller and smaller
   671		// as the recursion gets deeper), and we keep reusing that top-most
   672		// large saTmp instead of the offered smaller ones.
   673		//
   674		// Why is the subproblem length so often just under len(sa)/3?
   675		// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
   676		// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
   677		// in the input, perfect alternation of larger and smaller input bytes.
   678		// Real text doesn't do that. If each L-type index is randomly followed
   679		// by either an L-type or S-type index, then half the substrings will
   680		// be of the form SLS, but the other half will be longer. Of that half,
   681		// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
   682		// Not counting the final S in each (which overlaps the first S in the next),
   683		// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
   684		// The space we need is further reduced by the fact that many of the
   685		// short patterns like SLS will often be the same character sequences
   686		// repeated throughout the text, reducing maxID relative to numLMS.
   687		//
   688		// For short inputs, the averages may not run in our favor, but then we
   689		// can often fall back to using the length-512 tmp available in the
   690		// top-most call. (Also a short allocation would not be a big deal.)
   691		//
   692		// For pathological inputs, we fall back to allocating a new tmp of length
   693		// max(maxID, numLMS/2). This level of the recursion needs maxID,
   694		// and all deeper levels of the recursion will need no more than numLMS/2,
   695		// so this one allocation is guaranteed to suffice for the entire stack
   696		// of recursive calls.
   697		tmp := oldTmp
   698		if len(tmp) < len(saTmp) {
   699			tmp = saTmp
   700		}
   701		if len(tmp) < numLMS {
   702			// TestSAIS/forcealloc reaches this code.
   703			n := maxID
   704			if n < numLMS/2 {
   705				n = numLMS / 2
   706			}
   707			tmp = make([]int32, n)
   708		}
   709	
   710		// sais_32 requires that the caller arrange to clear dst,
   711		// because in general the caller may know dst is
   712		// freshly-allocated and already cleared. But this one is not.
   713		for i := range dst {
   714			dst[i] = 0
   715		}
   716		sais_32(text, maxID, dst, tmp)
   717	}
   718	
   719	// unmap_8_32 unmaps the subproblem back to the original.
   720	// sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore.
   721	// sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers.
   722	// The key part is that if the list says K that means the K'th substring.
   723	// We can replace sa[:numLMS] with the indexes of the LMS-substrings.
   724	// Then if the list says K it really means sa[K].
   725	// Having mapped the list back to LMS-substring indexes,
   726	// we can place those into the right buckets.
   727	func unmap_8_32(text []byte, sa []int32, numLMS int) {
   728		unmap := sa[len(sa)-numLMS:]
   729		j := len(unmap)
   730	
   731		// "LMS-substring iterator" (see placeLMS_8_32 above).
   732		c0, c1, isTypeS := byte(0), byte(0), false
   733		for i := len(text) - 1; i >= 0; i-- {
   734			c0, c1 = text[i], c0
   735			if c0 < c1 {
   736				isTypeS = true
   737			} else if c0 > c1 && isTypeS {
   738				isTypeS = false
   739	
   740				// Populate inverse map.
   741				j--
   742				unmap[j] = int32(i + 1)
   743			}
   744		}
   745	
   746		// Apply inverse map to subproblem suffix array.
   747		sa = sa[:numLMS]
   748		for i := 0; i < len(sa); i++ {
   749			sa[i] = unmap[sa[i]]
   750		}
   751	}
   752	
   753	// expand_8_32 distributes the compacted, sorted LMS-suffix indexes
   754	// from sa[:numLMS] into the tops of the appropriate buckets in sa,
   755	// preserving the sorted order and making room for the L-type indexes
   756	// to be slotted into the sorted sequence by induceL_8_32.
   757	func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) {
   758		bucketMax_8_32(text, freq, bucket)
   759		bucket = bucket[:256] // eliminate bound check for bucket[c] below
   760	
   761		// Loop backward through sa, always tracking
   762		// the next index to populate from sa[:numLMS].
   763		// When we get to one, populate it.
   764		// Zero the rest of the slots; they have dead values in them.
   765		x := numLMS - 1
   766		saX := sa[x]
   767		c := text[saX]
   768		b := bucket[c] - 1
   769		bucket[c] = b
   770	
   771		for i := len(sa) - 1; i >= 0; i-- {
   772			if i != int(b) {
   773				sa[i] = 0
   774				continue
   775			}
   776			sa[i] = saX
   777	
   778			// Load next entry to put down (if any).
   779			if x > 0 {
   780				x--
   781				saX = sa[x] // TODO bounds check
   782				c = text[saX]
   783				b = bucket[c] - 1
   784				bucket[c] = b
   785			}
   786		}
   787	}
   788	
   789	// induceL_8_32 inserts L-type text indexes into sa,
   790	// assuming that the leftmost S-type indexes are inserted
   791	// into sa, in sorted order, in the right bucket halves.
   792	// It leaves all the L-type indexes in sa, but the
   793	// leftmost L-type indexes are negated, to mark them
   794	// for processing by induceS_8_32.
   795	func induceL_8_32(text []byte, sa, freq, bucket []int32) {
   796		// Initialize positions for left side of character buckets.
   797		bucketMin_8_32(text, freq, bucket)
   798		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   799	
   800		// This scan is similar to the one in induceSubL_8_32 above.
   801		// That one arranges to clear all but the leftmost L-type indexes.
   802		// This scan leaves all the L-type indexes and the original S-type
   803		// indexes, but it negates the positive leftmost L-type indexes
   804		// (the ones that induceS_8_32 needs to process).
   805	
   806		// expand_8_32 left out the implicit entry sa[-1] == len(text),
   807		// corresponding to the identified type-L index len(text)-1.
   808		// Process it before the left-to-right scan of sa proper.
   809		// See body in loop for commentary.
   810		k := len(text) - 1
   811		c0, c1 := text[k-1], text[k]
   812		if c0 < c1 {
   813			k = -k
   814		}
   815	
   816		// Cache recently used bucket index.
   817		cB := c1
   818		b := bucket[cB]
   819		sa[b] = int32(k)
   820		b++
   821	
   822		for i := 0; i < len(sa); i++ {
   823			j := int(sa[i])
   824			if j <= 0 {
   825				// Skip empty or negated entry (including negated zero).
   826				continue
   827			}
   828	
   829			// Index j was on work queue, meaning k := j-1 is L-type,
   830			// so we can now place k correctly into sa.
   831			// If k-1 is L-type, queue k for processing later in this loop.
   832			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   833			// If k is zero, k-1 doesn't exist, so we only need to leave it
   834			// for the caller. The caller can't tell the difference between
   835			// an empty slot and a non-empty zero, but there's no need
   836			// to distinguish them anyway: the final suffix array will end up
   837			// with one zero somewhere, and that will be a real zero.
   838			k := j - 1
   839			c1 := text[k]
   840			if k > 0 {
   841				if c0 := text[k-1]; c0 < c1 {
   842					k = -k
   843				}
   844			}
   845	
   846			if cB != c1 {
   847				bucket[cB] = b
   848				cB = c1
   849				b = bucket[cB]
   850			}
   851			sa[b] = int32(k)
   852			b++
   853		}
   854	}
   855	
   856	func induceS_8_32(text []byte, sa, freq, bucket []int32) {
   857		// Initialize positions for right side of character buckets.
   858		bucketMax_8_32(text, freq, bucket)
   859		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   860	
   861		cB := byte(0)
   862		b := bucket[cB]
   863	
   864		for i := len(sa) - 1; i >= 0; i-- {
   865			j := int(sa[i])
   866			if j >= 0 {
   867				// Skip non-flagged entry.
   868				// (This loop can't see an empty entry; 0 means the real zero index.)
   869				continue
   870			}
   871	
   872			// Negative j is a work queue entry; rewrite to positive j for final suffix array.
   873			j = -j
   874			sa[i] = int32(j)
   875	
   876			// Index j was on work queue (encoded as -j but now decoded),
   877			// meaning k := j-1 is L-type,
   878			// so we can now place k correctly into sa.
   879			// If k-1 is S-type, queue -k for processing later in this loop.
   880			// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
   881			// If k is zero, k-1 doesn't exist, so we only need to leave it
   882			// for the caller.
   883			k := j - 1
   884			c1 := text[k]
   885			if k > 0 {
   886				if c0 := text[k-1]; c0 <= c1 {
   887					k = -k
   888				}
   889			}
   890	
   891			if cB != c1 {
   892				bucket[cB] = b
   893				cB = c1
   894				b = bucket[cB]
   895			}
   896			b--
   897			sa[b] = int32(k)
   898		}
   899	}
   900	

View as plain text