Source file src/index/suffixarray/sais2.go

     1	// Copyright 2019 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Code generated by go generate; DO NOT EDIT.
     6	
     7	package suffixarray
     8	
     9	func text_64(text []byte, sa []int64) {
    10		if int(int64(len(text))) != len(text) || len(text) != len(sa) {
    11			panic("suffixarray: misuse of text_64")
    12		}
    13		sais_8_64(text, 256, sa, make([]int64, 2*256))
    14	}
    15	
    16	func sais_8_64(text []byte, textMax int, sa, tmp []int64) {
    17		if len(sa) != len(text) || len(tmp) < int(textMax) {
    18			panic("suffixarray: misuse of sais_8_64")
    19		}
    20	
    21		// Trivial base cases. Sorting 0 or 1 things is easy.
    22		if len(text) == 0 {
    23			return
    24		}
    25		if len(text) == 1 {
    26			sa[0] = 0
    27			return
    28		}
    29	
    30		// Establish slices indexed by text character
    31		// holding character frequency and bucket-sort offsets.
    32		// If there's only enough tmp for one slice,
    33		// we make it the bucket offsets and recompute
    34		// the character frequency each time we need it.
    35		var freq, bucket []int64
    36		if len(tmp) >= 2*textMax {
    37			freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
    38			freq[0] = -1 // mark as uninitialized
    39		} else {
    40			freq, bucket = nil, tmp[:textMax]
    41		}
    42	
    43		// The SAIS algorithm.
    44		// Each of these calls makes one scan through sa.
    45		// See the individual functions for documentation
    46		// about each's role in the algorithm.
    47		numLMS := placeLMS_8_64(text, sa, freq, bucket)
    48		if numLMS <= 1 {
    49			// 0 or 1 items are already sorted. Do nothing.
    50		} else {
    51			induceSubL_8_64(text, sa, freq, bucket)
    52			induceSubS_8_64(text, sa, freq, bucket)
    53			length_8_64(text, sa, numLMS)
    54			maxID := assignID_8_64(text, sa, numLMS)
    55			if maxID < numLMS {
    56				map_64(sa, numLMS)
    57				recurse_64(sa, tmp, numLMS, maxID)
    58				unmap_8_64(text, sa, numLMS)
    59			} else {
    60				// If maxID == numLMS, then each LMS-substring
    61				// is unique, so the relative ordering of two LMS-suffixes
    62				// is determined by just the leading LMS-substring.
    63				// That is, the LMS-suffix sort order matches the
    64				// (simpler) LMS-substring sort order.
    65				// Copy the original LMS-substring order into the
    66				// suffix array destination.
    67				copy(sa, sa[len(sa)-numLMS:])
    68			}
    69			expand_8_64(text, freq, bucket, sa, numLMS)
    70		}
    71		induceL_8_64(text, sa, freq, bucket)
    72		induceS_8_64(text, sa, freq, bucket)
    73	
    74		// Mark for caller that we overwrote tmp.
    75		tmp[0] = -1
    76	}
    77	
    78	func sais_32(text []int32, textMax int, sa, tmp []int32) {
    79		if len(sa) != len(text) || len(tmp) < int(textMax) {
    80			panic("suffixarray: misuse of sais_32")
    81		}
    82	
    83		// Trivial base cases. Sorting 0 or 1 things is easy.
    84		if len(text) == 0 {
    85			return
    86		}
    87		if len(text) == 1 {
    88			sa[0] = 0
    89			return
    90		}
    91	
    92		// Establish slices indexed by text character
    93		// holding character frequency and bucket-sort offsets.
    94		// If there's only enough tmp for one slice,
    95		// we make it the bucket offsets and recompute
    96		// the character frequency each time we need it.
    97		var freq, bucket []int32
    98		if len(tmp) >= 2*textMax {
    99			freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   100			freq[0] = -1 // mark as uninitialized
   101		} else {
   102			freq, bucket = nil, tmp[:textMax]
   103		}
   104	
   105		// The SAIS algorithm.
   106		// Each of these calls makes one scan through sa.
   107		// See the individual functions for documentation
   108		// about each's role in the algorithm.
   109		numLMS := placeLMS_32(text, sa, freq, bucket)
   110		if numLMS <= 1 {
   111			// 0 or 1 items are already sorted. Do nothing.
   112		} else {
   113			induceSubL_32(text, sa, freq, bucket)
   114			induceSubS_32(text, sa, freq, bucket)
   115			length_32(text, sa, numLMS)
   116			maxID := assignID_32(text, sa, numLMS)
   117			if maxID < numLMS {
   118				map_32(sa, numLMS)
   119				recurse_32(sa, tmp, numLMS, maxID)
   120				unmap_32(text, sa, numLMS)
   121			} else {
   122				// If maxID == numLMS, then each LMS-substring
   123				// is unique, so the relative ordering of two LMS-suffixes
   124				// is determined by just the leading LMS-substring.
   125				// That is, the LMS-suffix sort order matches the
   126				// (simpler) LMS-substring sort order.
   127				// Copy the original LMS-substring order into the
   128				// suffix array destination.
   129				copy(sa, sa[len(sa)-numLMS:])
   130			}
   131			expand_32(text, freq, bucket, sa, numLMS)
   132		}
   133		induceL_32(text, sa, freq, bucket)
   134		induceS_32(text, sa, freq, bucket)
   135	
   136		// Mark for caller that we overwrote tmp.
   137		tmp[0] = -1
   138	}
   139	
   140	func sais_64(text []int64, textMax int, sa, tmp []int64) {
   141		if len(sa) != len(text) || len(tmp) < int(textMax) {
   142			panic("suffixarray: misuse of sais_64")
   143		}
   144	
   145		// Trivial base cases. Sorting 0 or 1 things is easy.
   146		if len(text) == 0 {
   147			return
   148		}
   149		if len(text) == 1 {
   150			sa[0] = 0
   151			return
   152		}
   153	
   154		// Establish slices indexed by text character
   155		// holding character frequency and bucket-sort offsets.
   156		// If there's only enough tmp for one slice,
   157		// we make it the bucket offsets and recompute
   158		// the character frequency each time we need it.
   159		var freq, bucket []int64
   160		if len(tmp) >= 2*textMax {
   161			freq, bucket = tmp[:textMax], tmp[textMax:2*textMax]
   162			freq[0] = -1 // mark as uninitialized
   163		} else {
   164			freq, bucket = nil, tmp[:textMax]
   165		}
   166	
   167		// The SAIS algorithm.
   168		// Each of these calls makes one scan through sa.
   169		// See the individual functions for documentation
   170		// about each's role in the algorithm.
   171		numLMS := placeLMS_64(text, sa, freq, bucket)
   172		if numLMS <= 1 {
   173			// 0 or 1 items are already sorted. Do nothing.
   174		} else {
   175			induceSubL_64(text, sa, freq, bucket)
   176			induceSubS_64(text, sa, freq, bucket)
   177			length_64(text, sa, numLMS)
   178			maxID := assignID_64(text, sa, numLMS)
   179			if maxID < numLMS {
   180				map_64(sa, numLMS)
   181				recurse_64(sa, tmp, numLMS, maxID)
   182				unmap_64(text, sa, numLMS)
   183			} else {
   184				// If maxID == numLMS, then each LMS-substring
   185				// is unique, so the relative ordering of two LMS-suffixes
   186				// is determined by just the leading LMS-substring.
   187				// That is, the LMS-suffix sort order matches the
   188				// (simpler) LMS-substring sort order.
   189				// Copy the original LMS-substring order into the
   190				// suffix array destination.
   191				copy(sa, sa[len(sa)-numLMS:])
   192			}
   193			expand_64(text, freq, bucket, sa, numLMS)
   194		}
   195		induceL_64(text, sa, freq, bucket)
   196		induceS_64(text, sa, freq, bucket)
   197	
   198		// Mark for caller that we overwrote tmp.
   199		tmp[0] = -1
   200	}
   201	
   202	func freq_8_64(text []byte, freq, bucket []int64) []int64 {
   203		if freq != nil && freq[0] >= 0 {
   204			return freq // already computed
   205		}
   206		if freq == nil {
   207			freq = bucket
   208		}
   209	
   210		freq = freq[:256] // eliminate bounds check for freq[c] below
   211		for i := range freq {
   212			freq[i] = 0
   213		}
   214		for _, c := range text {
   215			freq[c]++
   216		}
   217		return freq
   218	}
   219	
   220	func freq_32(text []int32, freq, bucket []int32) []int32 {
   221		if freq != nil && freq[0] >= 0 {
   222			return freq // already computed
   223		}
   224		if freq == nil {
   225			freq = bucket
   226		}
   227	
   228		for i := range freq {
   229			freq[i] = 0
   230		}
   231		for _, c := range text {
   232			freq[c]++
   233		}
   234		return freq
   235	}
   236	
   237	func freq_64(text []int64, freq, bucket []int64) []int64 {
   238		if freq != nil && freq[0] >= 0 {
   239			return freq // already computed
   240		}
   241		if freq == nil {
   242			freq = bucket
   243		}
   244	
   245		for i := range freq {
   246			freq[i] = 0
   247		}
   248		for _, c := range text {
   249			freq[c]++
   250		}
   251		return freq
   252	}
   253	
   254	func bucketMin_8_64(text []byte, freq, bucket []int64) {
   255		freq = freq_8_64(text, freq, bucket)
   256		freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   257		bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   258		total := int64(0)
   259		for i, n := range freq {
   260			bucket[i] = total
   261			total += n
   262		}
   263	}
   264	
   265	func bucketMin_32(text []int32, freq, bucket []int32) {
   266		freq = freq_32(text, freq, bucket)
   267		total := int32(0)
   268		for i, n := range freq {
   269			bucket[i] = total
   270			total += n
   271		}
   272	}
   273	
   274	func bucketMin_64(text []int64, freq, bucket []int64) {
   275		freq = freq_64(text, freq, bucket)
   276		total := int64(0)
   277		for i, n := range freq {
   278			bucket[i] = total
   279			total += n
   280		}
   281	}
   282	
   283	func bucketMax_8_64(text []byte, freq, bucket []int64) {
   284		freq = freq_8_64(text, freq, bucket)
   285		freq = freq[:256]     // establish len(freq) = 256, so 0 ≤ i < 256 below
   286		bucket = bucket[:256] // eliminate bounds check for bucket[i] below
   287		total := int64(0)
   288		for i, n := range freq {
   289			total += n
   290			bucket[i] = total
   291		}
   292	}
   293	
   294	func bucketMax_32(text []int32, freq, bucket []int32) {
   295		freq = freq_32(text, freq, bucket)
   296		total := int32(0)
   297		for i, n := range freq {
   298			total += n
   299			bucket[i] = total
   300		}
   301	}
   302	
   303	func bucketMax_64(text []int64, freq, bucket []int64) {
   304		freq = freq_64(text, freq, bucket)
   305		total := int64(0)
   306		for i, n := range freq {
   307			total += n
   308			bucket[i] = total
   309		}
   310	}
   311	
   312	func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int {
   313		bucketMax_8_64(text, freq, bucket)
   314	
   315		numLMS := 0
   316		lastB := int64(-1)
   317		bucket = bucket[:256] // eliminate bounds check for bucket[c1] below
   318	
   319		// The next stanza of code (until the blank line) loop backward
   320		// over text, stopping to execute a code body at each position i
   321		// such that text[i] is an L-character and text[i+1] is an S-character.
   322		// That is, i+1 is the position of the start of an LMS-substring.
   323		// These could be hoisted out into a function with a callback,
   324		// but at a significant speed cost. Instead, we just write these
   325		// seven lines a few times in this source file. The copies below
   326		// refer back to the pattern established by this original as the
   327		// "LMS-substring iterator".
   328		//
   329		// In every scan through the text, c0, c1 are successive characters of text.
   330		// In this backward scan, c0 == text[i] and c1 == text[i+1].
   331		// By scanning backward, we can keep track of whether the current
   332		// position is type-S or type-L according to the usual definition:
   333		//
   334		//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   335		//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   336		//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   337		//
   338		// The backward scan lets us maintain the current type,
   339		// update it when we see c0 != c1, and otherwise leave it alone.
   340		// We want to identify all S positions with a preceding L.
   341		// Position len(text) is one such position by definition, but we have
   342		// nowhere to write it down, so we eliminate it by untruthfully
   343		// setting isTypeS = false at the start of the loop.
   344		c0, c1, isTypeS := byte(0), byte(0), false
   345		for i := len(text) - 1; i >= 0; i-- {
   346			c0, c1 = text[i], c0
   347			if c0 < c1 {
   348				isTypeS = true
   349			} else if c0 > c1 && isTypeS {
   350				isTypeS = false
   351	
   352				// Bucket the index i+1 for the start of an LMS-substring.
   353				b := bucket[c1] - 1
   354				bucket[c1] = b
   355				sa[b] = int64(i + 1)
   356				lastB = b
   357				numLMS++
   358			}
   359		}
   360	
   361		// We recorded the LMS-substring starts but really want the ends.
   362		// Luckily, with two differences, the start indexes and the end indexes are the same.
   363		// The first difference is that the rightmost LMS-substring's end index is len(text),
   364		// so the caller must pretend that sa[-1] == len(text), as noted above.
   365		// The second difference is that the first leftmost LMS-substring start index
   366		// does not end an earlier LMS-substring, so as an optimization we can omit
   367		// that leftmost LMS-substring start index (the last one we wrote).
   368		//
   369		// Exception: if numLMS <= 1, the caller is not going to bother with
   370		// the recursion at all and will treat the result as containing LMS-substring starts.
   371		// In that case, we don't remove the final entry.
   372		if numLMS > 1 {
   373			sa[lastB] = 0
   374		}
   375		return numLMS
   376	}
   377	
   378	func placeLMS_32(text []int32, sa, freq, bucket []int32) int {
   379		bucketMax_32(text, freq, bucket)
   380	
   381		numLMS := 0
   382		lastB := int32(-1)
   383	
   384		// The next stanza of code (until the blank line) loop backward
   385		// over text, stopping to execute a code body at each position i
   386		// such that text[i] is an L-character and text[i+1] is an S-character.
   387		// That is, i+1 is the position of the start of an LMS-substring.
   388		// These could be hoisted out into a function with a callback,
   389		// but at a significant speed cost. Instead, we just write these
   390		// seven lines a few times in this source file. The copies below
   391		// refer back to the pattern established by this original as the
   392		// "LMS-substring iterator".
   393		//
   394		// In every scan through the text, c0, c1 are successive characters of text.
   395		// In this backward scan, c0 == text[i] and c1 == text[i+1].
   396		// By scanning backward, we can keep track of whether the current
   397		// position is type-S or type-L according to the usual definition:
   398		//
   399		//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   400		//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   401		//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   402		//
   403		// The backward scan lets us maintain the current type,
   404		// update it when we see c0 != c1, and otherwise leave it alone.
   405		// We want to identify all S positions with a preceding L.
   406		// Position len(text) is one such position by definition, but we have
   407		// nowhere to write it down, so we eliminate it by untruthfully
   408		// setting isTypeS = false at the start of the loop.
   409		c0, c1, isTypeS := int32(0), int32(0), false
   410		for i := len(text) - 1; i >= 0; i-- {
   411			c0, c1 = text[i], c0
   412			if c0 < c1 {
   413				isTypeS = true
   414			} else if c0 > c1 && isTypeS {
   415				isTypeS = false
   416	
   417				// Bucket the index i+1 for the start of an LMS-substring.
   418				b := bucket[c1] - 1
   419				bucket[c1] = b
   420				sa[b] = int32(i + 1)
   421				lastB = b
   422				numLMS++
   423			}
   424		}
   425	
   426		// We recorded the LMS-substring starts but really want the ends.
   427		// Luckily, with two differences, the start indexes and the end indexes are the same.
   428		// The first difference is that the rightmost LMS-substring's end index is len(text),
   429		// so the caller must pretend that sa[-1] == len(text), as noted above.
   430		// The second difference is that the first leftmost LMS-substring start index
   431		// does not end an earlier LMS-substring, so as an optimization we can omit
   432		// that leftmost LMS-substring start index (the last one we wrote).
   433		//
   434		// Exception: if numLMS <= 1, the caller is not going to bother with
   435		// the recursion at all and will treat the result as containing LMS-substring starts.
   436		// In that case, we don't remove the final entry.
   437		if numLMS > 1 {
   438			sa[lastB] = 0
   439		}
   440		return numLMS
   441	}
   442	
   443	func placeLMS_64(text []int64, sa, freq, bucket []int64) int {
   444		bucketMax_64(text, freq, bucket)
   445	
   446		numLMS := 0
   447		lastB := int64(-1)
   448	
   449		// The next stanza of code (until the blank line) loop backward
   450		// over text, stopping to execute a code body at each position i
   451		// such that text[i] is an L-character and text[i+1] is an S-character.
   452		// That is, i+1 is the position of the start of an LMS-substring.
   453		// These could be hoisted out into a function with a callback,
   454		// but at a significant speed cost. Instead, we just write these
   455		// seven lines a few times in this source file. The copies below
   456		// refer back to the pattern established by this original as the
   457		// "LMS-substring iterator".
   458		//
   459		// In every scan through the text, c0, c1 are successive characters of text.
   460		// In this backward scan, c0 == text[i] and c1 == text[i+1].
   461		// By scanning backward, we can keep track of whether the current
   462		// position is type-S or type-L according to the usual definition:
   463		//
   464		//	- position len(text) is type S with text[len(text)] == -1 (the sentinel)
   465		//	- position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
   466		//	- position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
   467		//
   468		// The backward scan lets us maintain the current type,
   469		// update it when we see c0 != c1, and otherwise leave it alone.
   470		// We want to identify all S positions with a preceding L.
   471		// Position len(text) is one such position by definition, but we have
   472		// nowhere to write it down, so we eliminate it by untruthfully
   473		// setting isTypeS = false at the start of the loop.
   474		c0, c1, isTypeS := int64(0), int64(0), false
   475		for i := len(text) - 1; i >= 0; i-- {
   476			c0, c1 = text[i], c0
   477			if c0 < c1 {
   478				isTypeS = true
   479			} else if c0 > c1 && isTypeS {
   480				isTypeS = false
   481	
   482				// Bucket the index i+1 for the start of an LMS-substring.
   483				b := bucket[c1] - 1
   484				bucket[c1] = b
   485				sa[b] = int64(i + 1)
   486				lastB = b
   487				numLMS++
   488			}
   489		}
   490	
   491		// We recorded the LMS-substring starts but really want the ends.
   492		// Luckily, with two differences, the start indexes and the end indexes are the same.
   493		// The first difference is that the rightmost LMS-substring's end index is len(text),
   494		// so the caller must pretend that sa[-1] == len(text), as noted above.
   495		// The second difference is that the first leftmost LMS-substring start index
   496		// does not end an earlier LMS-substring, so as an optimization we can omit
   497		// that leftmost LMS-substring start index (the last one we wrote).
   498		//
   499		// Exception: if numLMS <= 1, the caller is not going to bother with
   500		// the recursion at all and will treat the result as containing LMS-substring starts.
   501		// In that case, we don't remove the final entry.
   502		if numLMS > 1 {
   503			sa[lastB] = 0
   504		}
   505		return numLMS
   506	}
   507	
   508	func induceSubL_8_64(text []byte, sa, freq, bucket []int64) {
   509		// Initialize positions for left side of character buckets.
   510		bucketMin_8_64(text, freq, bucket)
   511		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   512	
   513		// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   514		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   515		// Because j-1 is type L, inserting it into sa now will sort it correctly.
   516		// But we want to distinguish a j-1 with j-2 of type L from type S.
   517		// We can process the former but want to leave the latter for the caller.
   518		// We record the difference by negating j-1 if it is preceded by type S.
   519		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   520		// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   521		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   522		// and so on, in sorted but not necessarily adjacent order, until it finds
   523		// one preceded by an index of type S, at which point it must stop.
   524		//
   525		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   526		// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   527		// only the indexes of the leftmost L-type indexes for each LMS-substring.
   528		//
   529		// The suffix array sa therefore serves simultaneously as input, output,
   530		// and a miraculously well-tailored work queue.
   531	
   532		// placeLMS_8_64 left out the implicit entry sa[-1] == len(text),
   533		// corresponding to the identified type-L index len(text)-1.
   534		// Process it before the left-to-right scan of sa proper.
   535		// See body in loop for commentary.
   536		k := len(text) - 1
   537		c0, c1 := text[k-1], text[k]
   538		if c0 < c1 {
   539			k = -k
   540		}
   541	
   542		// Cache recently used bucket index:
   543		// we're processing suffixes in sorted order
   544		// and accessing buckets indexed by the
   545		// byte before the sorted order, which still
   546		// has very good locality.
   547		// Invariant: b is cached, possibly dirty copy of bucket[cB].
   548		cB := c1
   549		b := bucket[cB]
   550		sa[b] = int64(k)
   551		b++
   552	
   553		for i := 0; i < len(sa); i++ {
   554			j := int(sa[i])
   555			if j == 0 {
   556				// Skip empty entry.
   557				continue
   558			}
   559			if j < 0 {
   560				// Leave discovered type-S index for caller.
   561				sa[i] = int64(-j)
   562				continue
   563			}
   564			sa[i] = 0
   565	
   566			// Index j was on work queue, meaning k := j-1 is L-type,
   567			// so we can now place k correctly into sa.
   568			// If k-1 is L-type, queue k for processing later in this loop.
   569			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   570			k := j - 1
   571			c0, c1 := text[k-1], text[k]
   572			if c0 < c1 {
   573				k = -k
   574			}
   575	
   576			if cB != c1 {
   577				bucket[cB] = b
   578				cB = c1
   579				b = bucket[cB]
   580			}
   581			sa[b] = int64(k)
   582			b++
   583		}
   584	}
   585	
   586	func induceSubL_32(text []int32, sa, freq, bucket []int32) {
   587		// Initialize positions for left side of character buckets.
   588		bucketMin_32(text, freq, bucket)
   589	
   590		// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   591		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   592		// Because j-1 is type L, inserting it into sa now will sort it correctly.
   593		// But we want to distinguish a j-1 with j-2 of type L from type S.
   594		// We can process the former but want to leave the latter for the caller.
   595		// We record the difference by negating j-1 if it is preceded by type S.
   596		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   597		// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   598		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   599		// and so on, in sorted but not necessarily adjacent order, until it finds
   600		// one preceded by an index of type S, at which point it must stop.
   601		//
   602		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   603		// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   604		// only the indexes of the leftmost L-type indexes for each LMS-substring.
   605		//
   606		// The suffix array sa therefore serves simultaneously as input, output,
   607		// and a miraculously well-tailored work queue.
   608	
   609		// placeLMS_32 left out the implicit entry sa[-1] == len(text),
   610		// corresponding to the identified type-L index len(text)-1.
   611		// Process it before the left-to-right scan of sa proper.
   612		// See body in loop for commentary.
   613		k := len(text) - 1
   614		c0, c1 := text[k-1], text[k]
   615		if c0 < c1 {
   616			k = -k
   617		}
   618	
   619		// Cache recently used bucket index:
   620		// we're processing suffixes in sorted order
   621		// and accessing buckets indexed by the
   622		// int32 before the sorted order, which still
   623		// has very good locality.
   624		// Invariant: b is cached, possibly dirty copy of bucket[cB].
   625		cB := c1
   626		b := bucket[cB]
   627		sa[b] = int32(k)
   628		b++
   629	
   630		for i := 0; i < len(sa); i++ {
   631			j := int(sa[i])
   632			if j == 0 {
   633				// Skip empty entry.
   634				continue
   635			}
   636			if j < 0 {
   637				// Leave discovered type-S index for caller.
   638				sa[i] = int32(-j)
   639				continue
   640			}
   641			sa[i] = 0
   642	
   643			// Index j was on work queue, meaning k := j-1 is L-type,
   644			// so we can now place k correctly into sa.
   645			// If k-1 is L-type, queue k for processing later in this loop.
   646			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   647			k := j - 1
   648			c0, c1 := text[k-1], text[k]
   649			if c0 < c1 {
   650				k = -k
   651			}
   652	
   653			if cB != c1 {
   654				bucket[cB] = b
   655				cB = c1
   656				b = bucket[cB]
   657			}
   658			sa[b] = int32(k)
   659			b++
   660		}
   661	}
   662	
   663	func induceSubL_64(text []int64, sa, freq, bucket []int64) {
   664		// Initialize positions for left side of character buckets.
   665		bucketMin_64(text, freq, bucket)
   666	
   667		// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
   668		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
   669		// Because j-1 is type L, inserting it into sa now will sort it correctly.
   670		// But we want to distinguish a j-1 with j-2 of type L from type S.
   671		// We can process the former but want to leave the latter for the caller.
   672		// We record the difference by negating j-1 if it is preceded by type S.
   673		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   674		// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
   675		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   676		// and so on, in sorted but not necessarily adjacent order, until it finds
   677		// one preceded by an index of type S, at which point it must stop.
   678		//
   679		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   680		// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
   681		// only the indexes of the leftmost L-type indexes for each LMS-substring.
   682		//
   683		// The suffix array sa therefore serves simultaneously as input, output,
   684		// and a miraculously well-tailored work queue.
   685	
   686		// placeLMS_64 left out the implicit entry sa[-1] == len(text),
   687		// corresponding to the identified type-L index len(text)-1.
   688		// Process it before the left-to-right scan of sa proper.
   689		// See body in loop for commentary.
   690		k := len(text) - 1
   691		c0, c1 := text[k-1], text[k]
   692		if c0 < c1 {
   693			k = -k
   694		}
   695	
   696		// Cache recently used bucket index:
   697		// we're processing suffixes in sorted order
   698		// and accessing buckets indexed by the
   699		// int64 before the sorted order, which still
   700		// has very good locality.
   701		// Invariant: b is cached, possibly dirty copy of bucket[cB].
   702		cB := c1
   703		b := bucket[cB]
   704		sa[b] = int64(k)
   705		b++
   706	
   707		for i := 0; i < len(sa); i++ {
   708			j := int(sa[i])
   709			if j == 0 {
   710				// Skip empty entry.
   711				continue
   712			}
   713			if j < 0 {
   714				// Leave discovered type-S index for caller.
   715				sa[i] = int64(-j)
   716				continue
   717			}
   718			sa[i] = 0
   719	
   720			// Index j was on work queue, meaning k := j-1 is L-type,
   721			// so we can now place k correctly into sa.
   722			// If k-1 is L-type, queue k for processing later in this loop.
   723			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
   724			k := j - 1
   725			c0, c1 := text[k-1], text[k]
   726			if c0 < c1 {
   727				k = -k
   728			}
   729	
   730			if cB != c1 {
   731				bucket[cB] = b
   732				cB = c1
   733				b = bucket[cB]
   734			}
   735			sa[b] = int64(k)
   736			b++
   737		}
   738	}
   739	
   740	func induceSubS_8_64(text []byte, sa, freq, bucket []int64) {
   741		// Initialize positions for right side of character buckets.
   742		bucketMax_8_64(text, freq, bucket)
   743		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
   744	
   745		// Analogous to induceSubL_8_64 above,
   746		// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   747		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   748		// Because j-1 is type S, inserting it into sa now will sort it correctly.
   749		// But we want to distinguish a j-1 with j-2 of type S from type L.
   750		// We can process the former but want to leave the latter for the caller.
   751		// We record the difference by negating j-1 if it is preceded by type L.
   752		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   753		// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   754		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   755		// and so on, in sorted but not necessarily adjacent order, until it finds
   756		// one preceded by an index of type L, at which point it must stop.
   757		// That index (preceded by one of type L) is an LMS-substring start.
   758		//
   759		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   760		// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   761		// so that the loop finishes with the top of sa containing exactly
   762		// the LMS-substring start indexes, sorted by LMS-substring.
   763	
   764		// Cache recently used bucket index:
   765		cB := byte(0)
   766		b := bucket[cB]
   767	
   768		top := len(sa)
   769		for i := len(sa) - 1; i >= 0; i-- {
   770			j := int(sa[i])
   771			if j == 0 {
   772				// Skip empty entry.
   773				continue
   774			}
   775			sa[i] = 0
   776			if j < 0 {
   777				// Leave discovered LMS-substring start index for caller.
   778				top--
   779				sa[top] = int64(-j)
   780				continue
   781			}
   782	
   783			// Index j was on work queue, meaning k := j-1 is S-type,
   784			// so we can now place k correctly into sa.
   785			// If k-1 is S-type, queue k for processing later in this loop.
   786			// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   787			k := j - 1
   788			c1 := text[k]
   789			c0 := text[k-1]
   790			if c0 > c1 {
   791				k = -k
   792			}
   793	
   794			if cB != c1 {
   795				bucket[cB] = b
   796				cB = c1
   797				b = bucket[cB]
   798			}
   799			b--
   800			sa[b] = int64(k)
   801		}
   802	}
   803	
   804	func induceSubS_32(text []int32, sa, freq, bucket []int32) {
   805		// Initialize positions for right side of character buckets.
   806		bucketMax_32(text, freq, bucket)
   807	
   808		// Analogous to induceSubL_32 above,
   809		// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   810		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   811		// Because j-1 is type S, inserting it into sa now will sort it correctly.
   812		// But we want to distinguish a j-1 with j-2 of type S from type L.
   813		// We can process the former but want to leave the latter for the caller.
   814		// We record the difference by negating j-1 if it is preceded by type L.
   815		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   816		// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   817		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   818		// and so on, in sorted but not necessarily adjacent order, until it finds
   819		// one preceded by an index of type L, at which point it must stop.
   820		// That index (preceded by one of type L) is an LMS-substring start.
   821		//
   822		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   823		// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   824		// so that the loop finishes with the top of sa containing exactly
   825		// the LMS-substring start indexes, sorted by LMS-substring.
   826	
   827		// Cache recently used bucket index:
   828		cB := int32(0)
   829		b := bucket[cB]
   830	
   831		top := len(sa)
   832		for i := len(sa) - 1; i >= 0; i-- {
   833			j := int(sa[i])
   834			if j == 0 {
   835				// Skip empty entry.
   836				continue
   837			}
   838			sa[i] = 0
   839			if j < 0 {
   840				// Leave discovered LMS-substring start index for caller.
   841				top--
   842				sa[top] = int32(-j)
   843				continue
   844			}
   845	
   846			// Index j was on work queue, meaning k := j-1 is S-type,
   847			// so we can now place k correctly into sa.
   848			// If k-1 is S-type, queue k for processing later in this loop.
   849			// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   850			k := j - 1
   851			c1 := text[k]
   852			c0 := text[k-1]
   853			if c0 > c1 {
   854				k = -k
   855			}
   856	
   857			if cB != c1 {
   858				bucket[cB] = b
   859				cB = c1
   860				b = bucket[cB]
   861			}
   862			b--
   863			sa[b] = int32(k)
   864		}
   865	}
   866	
   867	func induceSubS_64(text []int64, sa, freq, bucket []int64) {
   868		// Initialize positions for right side of character buckets.
   869		bucketMax_64(text, freq, bucket)
   870	
   871		// Analogous to induceSubL_64 above,
   872		// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
   873		// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
   874		// Because j-1 is type S, inserting it into sa now will sort it correctly.
   875		// But we want to distinguish a j-1 with j-2 of type S from type L.
   876		// We can process the former but want to leave the latter for the caller.
   877		// We record the difference by negating j-1 if it is preceded by type L.
   878		// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
   879		// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
   880		// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
   881		// and so on, in sorted but not necessarily adjacent order, until it finds
   882		// one preceded by an index of type L, at which point it must stop.
   883		// That index (preceded by one of type L) is an LMS-substring start.
   884		//
   885		// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
   886		// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
   887		// so that the loop finishes with the top of sa containing exactly
   888		// the LMS-substring start indexes, sorted by LMS-substring.
   889	
   890		// Cache recently used bucket index:
   891		cB := int64(0)
   892		b := bucket[cB]
   893	
   894		top := len(sa)
   895		for i := len(sa) - 1; i >= 0; i-- {
   896			j := int(sa[i])
   897			if j == 0 {
   898				// Skip empty entry.
   899				continue
   900			}
   901			sa[i] = 0
   902			if j < 0 {
   903				// Leave discovered LMS-substring start index for caller.
   904				top--
   905				sa[top] = int64(-j)
   906				continue
   907			}
   908	
   909			// Index j was on work queue, meaning k := j-1 is S-type,
   910			// so we can now place k correctly into sa.
   911			// If k-1 is S-type, queue k for processing later in this loop.
   912			// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
   913			k := j - 1
   914			c1 := text[k]
   915			c0 := text[k-1]
   916			if c0 > c1 {
   917				k = -k
   918			}
   919	
   920			if cB != c1 {
   921				bucket[cB] = b
   922				cB = c1
   923				b = bucket[cB]
   924			}
   925			b--
   926			sa[b] = int64(k)
   927		}
   928	}
   929	
   930	func length_8_64(text []byte, sa []int64, numLMS int) {
   931		end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   932	
   933		// The encoding of N text bytes into a “length” word
   934		// adds 1 to each byte, packs them into the bottom
   935		// N*8 bits of a word, and then bitwise inverts the result.
   936		// That is, the text sequence A B C (hex 41 42 43)
   937		// encodes as ^uint64(0x42_43_44).
   938		// LMS-substrings can never start or end with 0xFF.
   939		// Adding 1 ensures the encoded byte sequence never
   940		// starts or ends with 0x00, so that present bytes can be
   941		// distinguished from zero-padding in the top bits,
   942		// so the length need not be separately encoded.
   943		// Inverting the bytes increases the chance that a
   944		// 4-byte encoding will still be ≥ len(text).
   945		// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
   946		// then the high bit of the inversion will be set,
   947		// making it clearly not a valid length (it would be a negative one).
   948		//
   949		// cx holds the pre-inverted encoding (the packed incremented bytes).
   950		cx := uint64(0) // byte-only
   951	
   952		// This stanza (until the blank line) is the "LMS-substring iterator",
   953		// described in placeLMS_8_64 above, with one line added to maintain cx.
   954		c0, c1, isTypeS := byte(0), byte(0), false
   955		for i := len(text) - 1; i >= 0; i-- {
   956			c0, c1 = text[i], c0
   957			cx = cx<<8 | uint64(c1+1) // byte-only
   958			if c0 < c1 {
   959				isTypeS = true
   960			} else if c0 > c1 && isTypeS {
   961				isTypeS = false
   962	
   963				// Index j = i+1 is the start of an LMS-substring.
   964				// Compute length or encoded text to store in sa[j/2].
   965				j := i + 1
   966				var code int64
   967				if end == 0 {
   968					code = 0
   969				} else {
   970					code = int64(end - j)
   971					if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only
   972						code = int64(^cx) // byte-only
   973					} // byte-only
   974				}
   975				sa[j>>1] = code
   976				end = j + 1
   977				cx = uint64(c1 + 1) // byte-only
   978			}
   979		}
   980	}
   981	
   982	func length_32(text []int32, sa []int32, numLMS int) {
   983		end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
   984	
   985		// The encoding of N text int32s into a “length” word
   986		// adds 1 to each int32, packs them into the bottom
   987		// N*8 bits of a word, and then bitwise inverts the result.
   988		// That is, the text sequence A B C (hex 41 42 43)
   989		// encodes as ^uint32(0x42_43_44).
   990		// LMS-substrings can never start or end with 0xFF.
   991		// Adding 1 ensures the encoded int32 sequence never
   992		// starts or ends with 0x00, so that present int32s can be
   993		// distinguished from zero-padding in the top bits,
   994		// so the length need not be separately encoded.
   995		// Inverting the int32s increases the chance that a
   996		// 4-int32 encoding will still be ≥ len(text).
   997		// In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F)
   998		// then the high bit of the inversion will be set,
   999		// making it clearly not a valid length (it would be a negative one).
  1000		//
  1001		// cx holds the pre-inverted encoding (the packed incremented int32s).
  1002	
  1003		// This stanza (until the blank line) is the "LMS-substring iterator",
  1004		// described in placeLMS_32 above, with one line added to maintain cx.
  1005		c0, c1, isTypeS := int32(0), int32(0), false
  1006		for i := len(text) - 1; i >= 0; i-- {
  1007			c0, c1 = text[i], c0
  1008			if c0 < c1 {
  1009				isTypeS = true
  1010			} else if c0 > c1 && isTypeS {
  1011				isTypeS = false
  1012	
  1013				// Index j = i+1 is the start of an LMS-substring.
  1014				// Compute length or encoded text to store in sa[j/2].
  1015				j := i + 1
  1016				var code int32
  1017				if end == 0 {
  1018					code = 0
  1019				} else {
  1020					code = int32(end - j)
  1021				}
  1022				sa[j>>1] = code
  1023				end = j + 1
  1024			}
  1025		}
  1026	}
  1027	
  1028	func length_64(text []int64, sa []int64, numLMS int) {
  1029		end := 0 // index of current LMS-substring end (0 indicates final LMS-substring)
  1030	
  1031		// The encoding of N text int64s into a “length” word
  1032		// adds 1 to each int64, packs them into the bottom
  1033		// N*8 bits of a word, and then bitwise inverts the result.
  1034		// That is, the text sequence A B C (hex 41 42 43)
  1035		// encodes as ^uint64(0x42_43_44).
  1036		// LMS-substrings can never start or end with 0xFF.
  1037		// Adding 1 ensures the encoded int64 sequence never
  1038		// starts or ends with 0x00, so that present int64s can be
  1039		// distinguished from zero-padding in the top bits,
  1040		// so the length need not be separately encoded.
  1041		// Inverting the int64s increases the chance that a
  1042		// 4-int64 encoding will still be ≥ len(text).
  1043		// In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F)
  1044		// then the high bit of the inversion will be set,
  1045		// making it clearly not a valid length (it would be a negative one).
  1046		//
  1047		// cx holds the pre-inverted encoding (the packed incremented int64s).
  1048	
  1049		// This stanza (until the blank line) is the "LMS-substring iterator",
  1050		// described in placeLMS_64 above, with one line added to maintain cx.
  1051		c0, c1, isTypeS := int64(0), int64(0), false
  1052		for i := len(text) - 1; i >= 0; i-- {
  1053			c0, c1 = text[i], c0
  1054			if c0 < c1 {
  1055				isTypeS = true
  1056			} else if c0 > c1 && isTypeS {
  1057				isTypeS = false
  1058	
  1059				// Index j = i+1 is the start of an LMS-substring.
  1060				// Compute length or encoded text to store in sa[j/2].
  1061				j := i + 1
  1062				var code int64
  1063				if end == 0 {
  1064					code = 0
  1065				} else {
  1066					code = int64(end - j)
  1067				}
  1068				sa[j>>1] = code
  1069				end = j + 1
  1070			}
  1071		}
  1072	}
  1073	
  1074	func assignID_8_64(text []byte, sa []int64, numLMS int) int {
  1075		id := 0
  1076		lastLen := int64(-1) // impossible
  1077		lastPos := int64(0)
  1078		for _, j := range sa[len(sa)-numLMS:] {
  1079			// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1080			n := sa[j/2]
  1081			if n != lastLen {
  1082				goto New
  1083			}
  1084			if uint64(n) >= uint64(len(text)) {
  1085				// “Length” is really encoded full text, and they match.
  1086				goto Same
  1087			}
  1088			{
  1089				// Compare actual texts.
  1090				n := int(n)
  1091				this := text[j:][:n]
  1092				last := text[lastPos:][:n]
  1093				for i := 0; i < n; i++ {
  1094					if this[i] != last[i] {
  1095						goto New
  1096					}
  1097				}
  1098				goto Same
  1099			}
  1100		New:
  1101			id++
  1102			lastPos = j
  1103			lastLen = n
  1104		Same:
  1105			sa[j/2] = int64(id)
  1106		}
  1107		return id
  1108	}
  1109	
  1110	func assignID_32(text []int32, sa []int32, numLMS int) int {
  1111		id := 0
  1112		lastLen := int32(-1) // impossible
  1113		lastPos := int32(0)
  1114		for _, j := range sa[len(sa)-numLMS:] {
  1115			// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1116			n := sa[j/2]
  1117			if n != lastLen {
  1118				goto New
  1119			}
  1120			if uint32(n) >= uint32(len(text)) {
  1121				// “Length” is really encoded full text, and they match.
  1122				goto Same
  1123			}
  1124			{
  1125				// Compare actual texts.
  1126				n := int(n)
  1127				this := text[j:][:n]
  1128				last := text[lastPos:][:n]
  1129				for i := 0; i < n; i++ {
  1130					if this[i] != last[i] {
  1131						goto New
  1132					}
  1133				}
  1134				goto Same
  1135			}
  1136		New:
  1137			id++
  1138			lastPos = j
  1139			lastLen = n
  1140		Same:
  1141			sa[j/2] = int32(id)
  1142		}
  1143		return id
  1144	}
  1145	
  1146	func assignID_64(text []int64, sa []int64, numLMS int) int {
  1147		id := 0
  1148		lastLen := int64(-1) // impossible
  1149		lastPos := int64(0)
  1150		for _, j := range sa[len(sa)-numLMS:] {
  1151			// Is the LMS-substring at index j new, or is it the same as the last one we saw?
  1152			n := sa[j/2]
  1153			if n != lastLen {
  1154				goto New
  1155			}
  1156			if uint64(n) >= uint64(len(text)) {
  1157				// “Length” is really encoded full text, and they match.
  1158				goto Same
  1159			}
  1160			{
  1161				// Compare actual texts.
  1162				n := int(n)
  1163				this := text[j:][:n]
  1164				last := text[lastPos:][:n]
  1165				for i := 0; i < n; i++ {
  1166					if this[i] != last[i] {
  1167						goto New
  1168					}
  1169				}
  1170				goto Same
  1171			}
  1172		New:
  1173			id++
  1174			lastPos = j
  1175			lastLen = n
  1176		Same:
  1177			sa[j/2] = int64(id)
  1178		}
  1179		return id
  1180	}
  1181	
  1182	func map_64(sa []int64, numLMS int) {
  1183		w := len(sa)
  1184		for i := len(sa) / 2; i >= 0; i-- {
  1185			j := sa[i]
  1186			if j > 0 {
  1187				w--
  1188				sa[w] = j - 1
  1189			}
  1190		}
  1191	}
  1192	
  1193	func recurse_64(sa, oldTmp []int64, numLMS, maxID int) {
  1194		dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:]
  1195	
  1196		// Set up temporary space for recursive call.
  1197		// We must pass sais_64 a tmp buffer wiith at least maxID entries.
  1198		//
  1199		// The subproblem is guaranteed to have length at most len(sa)/2,
  1200		// so that sa can hold both the subproblem and its suffix array.
  1201		// Nearly all the time, however, the subproblem has length < len(sa)/3,
  1202		// in which case there is a subproblem-sized middle of sa that
  1203		// we can reuse for temporary space (saTmp).
  1204		// When recurse_64 is called from sais_8_64, oldTmp is length 512
  1205		// (from text_64), and saTmp will typically be much larger, so we'll use saTmp.
  1206		// When deeper recursions come back to recurse_64, now oldTmp is
  1207		// the saTmp from the top-most recursion, it is typically larger than
  1208		// the current saTmp (because the current sa gets smaller and smaller
  1209		// as the recursion gets deeper), and we keep reusing that top-most
  1210		// large saTmp instead of the offered smaller ones.
  1211		//
  1212		// Why is the subproblem length so often just under len(sa)/3?
  1213		// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
  1214		// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
  1215		// in the input, perfect alternation of larger and smaller input bytes.
  1216		// Real text doesn't do that. If each L-type index is randomly followed
  1217		// by either an L-type or S-type index, then half the substrings will
  1218		// be of the form SLS, but the other half will be longer. Of that half,
  1219		// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
  1220		// Not counting the final S in each (which overlaps the first S in the next),
  1221		// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
  1222		// The space we need is further reduced by the fact that many of the
  1223		// short patterns like SLS will often be the same character sequences
  1224		// repeated throughout the text, reducing maxID relative to numLMS.
  1225		//
  1226		// For short inputs, the averages may not run in our favor, but then we
  1227		// can often fall back to using the length-512 tmp available in the
  1228		// top-most call. (Also a short allocation would not be a big deal.)
  1229		//
  1230		// For pathological inputs, we fall back to allocating a new tmp of length
  1231		// max(maxID, numLMS/2). This level of the recursion needs maxID,
  1232		// and all deeper levels of the recursion will need no more than numLMS/2,
  1233		// so this one allocation is guaranteed to suffice for the entire stack
  1234		// of recursive calls.
  1235		tmp := oldTmp
  1236		if len(tmp) < len(saTmp) {
  1237			tmp = saTmp
  1238		}
  1239		if len(tmp) < numLMS {
  1240			// TestSAIS/forcealloc reaches this code.
  1241			n := maxID
  1242			if n < numLMS/2 {
  1243				n = numLMS / 2
  1244			}
  1245			tmp = make([]int64, n)
  1246		}
  1247	
  1248		// sais_64 requires that the caller arrange to clear dst,
  1249		// because in general the caller may know dst is
  1250		// freshly-allocated and already cleared. But this one is not.
  1251		for i := range dst {
  1252			dst[i] = 0
  1253		}
  1254		sais_64(text, maxID, dst, tmp)
  1255	}
  1256	
  1257	func unmap_8_64(text []byte, sa []int64, numLMS int) {
  1258		unmap := sa[len(sa)-numLMS:]
  1259		j := len(unmap)
  1260	
  1261		// "LMS-substring iterator" (see placeLMS_8_64 above).
  1262		c0, c1, isTypeS := byte(0), byte(0), false
  1263		for i := len(text) - 1; i >= 0; i-- {
  1264			c0, c1 = text[i], c0
  1265			if c0 < c1 {
  1266				isTypeS = true
  1267			} else if c0 > c1 && isTypeS {
  1268				isTypeS = false
  1269	
  1270				// Populate inverse map.
  1271				j--
  1272				unmap[j] = int64(i + 1)
  1273			}
  1274		}
  1275	
  1276		// Apply inverse map to subproblem suffix array.
  1277		sa = sa[:numLMS]
  1278		for i := 0; i < len(sa); i++ {
  1279			sa[i] = unmap[sa[i]]
  1280		}
  1281	}
  1282	
  1283	func unmap_32(text []int32, sa []int32, numLMS int) {
  1284		unmap := sa[len(sa)-numLMS:]
  1285		j := len(unmap)
  1286	
  1287		// "LMS-substring iterator" (see placeLMS_32 above).
  1288		c0, c1, isTypeS := int32(0), int32(0), false
  1289		for i := len(text) - 1; i >= 0; i-- {
  1290			c0, c1 = text[i], c0
  1291			if c0 < c1 {
  1292				isTypeS = true
  1293			} else if c0 > c1 && isTypeS {
  1294				isTypeS = false
  1295	
  1296				// Populate inverse map.
  1297				j--
  1298				unmap[j] = int32(i + 1)
  1299			}
  1300		}
  1301	
  1302		// Apply inverse map to subproblem suffix array.
  1303		sa = sa[:numLMS]
  1304		for i := 0; i < len(sa); i++ {
  1305			sa[i] = unmap[sa[i]]
  1306		}
  1307	}
  1308	
  1309	func unmap_64(text []int64, sa []int64, numLMS int) {
  1310		unmap := sa[len(sa)-numLMS:]
  1311		j := len(unmap)
  1312	
  1313		// "LMS-substring iterator" (see placeLMS_64 above).
  1314		c0, c1, isTypeS := int64(0), int64(0), false
  1315		for i := len(text) - 1; i >= 0; i-- {
  1316			c0, c1 = text[i], c0
  1317			if c0 < c1 {
  1318				isTypeS = true
  1319			} else if c0 > c1 && isTypeS {
  1320				isTypeS = false
  1321	
  1322				// Populate inverse map.
  1323				j--
  1324				unmap[j] = int64(i + 1)
  1325			}
  1326		}
  1327	
  1328		// Apply inverse map to subproblem suffix array.
  1329		sa = sa[:numLMS]
  1330		for i := 0; i < len(sa); i++ {
  1331			sa[i] = unmap[sa[i]]
  1332		}
  1333	}
  1334	
  1335	func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) {
  1336		bucketMax_8_64(text, freq, bucket)
  1337		bucket = bucket[:256] // eliminate bound check for bucket[c] below
  1338	
  1339		// Loop backward through sa, always tracking
  1340		// the next index to populate from sa[:numLMS].
  1341		// When we get to one, populate it.
  1342		// Zero the rest of the slots; they have dead values in them.
  1343		x := numLMS - 1
  1344		saX := sa[x]
  1345		c := text[saX]
  1346		b := bucket[c] - 1
  1347		bucket[c] = b
  1348	
  1349		for i := len(sa) - 1; i >= 0; i-- {
  1350			if i != int(b) {
  1351				sa[i] = 0
  1352				continue
  1353			}
  1354			sa[i] = saX
  1355	
  1356			// Load next entry to put down (if any).
  1357			if x > 0 {
  1358				x--
  1359				saX = sa[x] // TODO bounds check
  1360				c = text[saX]
  1361				b = bucket[c] - 1
  1362				bucket[c] = b
  1363			}
  1364		}
  1365	}
  1366	
  1367	func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) {
  1368		bucketMax_32(text, freq, bucket)
  1369	
  1370		// Loop backward through sa, always tracking
  1371		// the next index to populate from sa[:numLMS].
  1372		// When we get to one, populate it.
  1373		// Zero the rest of the slots; they have dead values in them.
  1374		x := numLMS - 1
  1375		saX := sa[x]
  1376		c := text[saX]
  1377		b := bucket[c] - 1
  1378		bucket[c] = b
  1379	
  1380		for i := len(sa) - 1; i >= 0; i-- {
  1381			if i != int(b) {
  1382				sa[i] = 0
  1383				continue
  1384			}
  1385			sa[i] = saX
  1386	
  1387			// Load next entry to put down (if any).
  1388			if x > 0 {
  1389				x--
  1390				saX = sa[x] // TODO bounds check
  1391				c = text[saX]
  1392				b = bucket[c] - 1
  1393				bucket[c] = b
  1394			}
  1395		}
  1396	}
  1397	
  1398	func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) {
  1399		bucketMax_64(text, freq, bucket)
  1400	
  1401		// Loop backward through sa, always tracking
  1402		// the next index to populate from sa[:numLMS].
  1403		// When we get to one, populate it.
  1404		// Zero the rest of the slots; they have dead values in them.
  1405		x := numLMS - 1
  1406		saX := sa[x]
  1407		c := text[saX]
  1408		b := bucket[c] - 1
  1409		bucket[c] = b
  1410	
  1411		for i := len(sa) - 1; i >= 0; i-- {
  1412			if i != int(b) {
  1413				sa[i] = 0
  1414				continue
  1415			}
  1416			sa[i] = saX
  1417	
  1418			// Load next entry to put down (if any).
  1419			if x > 0 {
  1420				x--
  1421				saX = sa[x] // TODO bounds check
  1422				c = text[saX]
  1423				b = bucket[c] - 1
  1424				bucket[c] = b
  1425			}
  1426		}
  1427	}
  1428	
  1429	func induceL_8_64(text []byte, sa, freq, bucket []int64) {
  1430		// Initialize positions for left side of character buckets.
  1431		bucketMin_8_64(text, freq, bucket)
  1432		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
  1433	
  1434		// This scan is similar to the one in induceSubL_8_64 above.
  1435		// That one arranges to clear all but the leftmost L-type indexes.
  1436		// This scan leaves all the L-type indexes and the original S-type
  1437		// indexes, but it negates the positive leftmost L-type indexes
  1438		// (the ones that induceS_8_64 needs to process).
  1439	
  1440		// expand_8_64 left out the implicit entry sa[-1] == len(text),
  1441		// corresponding to the identified type-L index len(text)-1.
  1442		// Process it before the left-to-right scan of sa proper.
  1443		// See body in loop for commentary.
  1444		k := len(text) - 1
  1445		c0, c1 := text[k-1], text[k]
  1446		if c0 < c1 {
  1447			k = -k
  1448		}
  1449	
  1450		// Cache recently used bucket index.
  1451		cB := c1
  1452		b := bucket[cB]
  1453		sa[b] = int64(k)
  1454		b++
  1455	
  1456		for i := 0; i < len(sa); i++ {
  1457			j := int(sa[i])
  1458			if j <= 0 {
  1459				// Skip empty or negated entry (including negated zero).
  1460				continue
  1461			}
  1462	
  1463			// Index j was on work queue, meaning k := j-1 is L-type,
  1464			// so we can now place k correctly into sa.
  1465			// If k-1 is L-type, queue k for processing later in this loop.
  1466			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1467			// If k is zero, k-1 doesn't exist, so we only need to leave it
  1468			// for the caller. The caller can't tell the difference between
  1469			// an empty slot and a non-empty zero, but there's no need
  1470			// to distinguish them anyway: the final suffix array will end up
  1471			// with one zero somewhere, and that will be a real zero.
  1472			k := j - 1
  1473			c1 := text[k]
  1474			if k > 0 {
  1475				if c0 := text[k-1]; c0 < c1 {
  1476					k = -k
  1477				}
  1478			}
  1479	
  1480			if cB != c1 {
  1481				bucket[cB] = b
  1482				cB = c1
  1483				b = bucket[cB]
  1484			}
  1485			sa[b] = int64(k)
  1486			b++
  1487		}
  1488	}
  1489	
  1490	func induceL_32(text []int32, sa, freq, bucket []int32) {
  1491		// Initialize positions for left side of character buckets.
  1492		bucketMin_32(text, freq, bucket)
  1493	
  1494		// This scan is similar to the one in induceSubL_32 above.
  1495		// That one arranges to clear all but the leftmost L-type indexes.
  1496		// This scan leaves all the L-type indexes and the original S-type
  1497		// indexes, but it negates the positive leftmost L-type indexes
  1498		// (the ones that induceS_32 needs to process).
  1499	
  1500		// expand_32 left out the implicit entry sa[-1] == len(text),
  1501		// corresponding to the identified type-L index len(text)-1.
  1502		// Process it before the left-to-right scan of sa proper.
  1503		// See body in loop for commentary.
  1504		k := len(text) - 1
  1505		c0, c1 := text[k-1], text[k]
  1506		if c0 < c1 {
  1507			k = -k
  1508		}
  1509	
  1510		// Cache recently used bucket index.
  1511		cB := c1
  1512		b := bucket[cB]
  1513		sa[b] = int32(k)
  1514		b++
  1515	
  1516		for i := 0; i < len(sa); i++ {
  1517			j := int(sa[i])
  1518			if j <= 0 {
  1519				// Skip empty or negated entry (including negated zero).
  1520				continue
  1521			}
  1522	
  1523			// Index j was on work queue, meaning k := j-1 is L-type,
  1524			// so we can now place k correctly into sa.
  1525			// If k-1 is L-type, queue k for processing later in this loop.
  1526			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1527			// If k is zero, k-1 doesn't exist, so we only need to leave it
  1528			// for the caller. The caller can't tell the difference between
  1529			// an empty slot and a non-empty zero, but there's no need
  1530			// to distinguish them anyway: the final suffix array will end up
  1531			// with one zero somewhere, and that will be a real zero.
  1532			k := j - 1
  1533			c1 := text[k]
  1534			if k > 0 {
  1535				if c0 := text[k-1]; c0 < c1 {
  1536					k = -k
  1537				}
  1538			}
  1539	
  1540			if cB != c1 {
  1541				bucket[cB] = b
  1542				cB = c1
  1543				b = bucket[cB]
  1544			}
  1545			sa[b] = int32(k)
  1546			b++
  1547		}
  1548	}
  1549	
  1550	func induceL_64(text []int64, sa, freq, bucket []int64) {
  1551		// Initialize positions for left side of character buckets.
  1552		bucketMin_64(text, freq, bucket)
  1553	
  1554		// This scan is similar to the one in induceSubL_64 above.
  1555		// That one arranges to clear all but the leftmost L-type indexes.
  1556		// This scan leaves all the L-type indexes and the original S-type
  1557		// indexes, but it negates the positive leftmost L-type indexes
  1558		// (the ones that induceS_64 needs to process).
  1559	
  1560		// expand_64 left out the implicit entry sa[-1] == len(text),
  1561		// corresponding to the identified type-L index len(text)-1.
  1562		// Process it before the left-to-right scan of sa proper.
  1563		// See body in loop for commentary.
  1564		k := len(text) - 1
  1565		c0, c1 := text[k-1], text[k]
  1566		if c0 < c1 {
  1567			k = -k
  1568		}
  1569	
  1570		// Cache recently used bucket index.
  1571		cB := c1
  1572		b := bucket[cB]
  1573		sa[b] = int64(k)
  1574		b++
  1575	
  1576		for i := 0; i < len(sa); i++ {
  1577			j := int(sa[i])
  1578			if j <= 0 {
  1579				// Skip empty or negated entry (including negated zero).
  1580				continue
  1581			}
  1582	
  1583			// Index j was on work queue, meaning k := j-1 is L-type,
  1584			// so we can now place k correctly into sa.
  1585			// If k-1 is L-type, queue k for processing later in this loop.
  1586			// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
  1587			// If k is zero, k-1 doesn't exist, so we only need to leave it
  1588			// for the caller. The caller can't tell the difference between
  1589			// an empty slot and a non-empty zero, but there's no need
  1590			// to distinguish them anyway: the final suffix array will end up
  1591			// with one zero somewhere, and that will be a real zero.
  1592			k := j - 1
  1593			c1 := text[k]
  1594			if k > 0 {
  1595				if c0 := text[k-1]; c0 < c1 {
  1596					k = -k
  1597				}
  1598			}
  1599	
  1600			if cB != c1 {
  1601				bucket[cB] = b
  1602				cB = c1
  1603				b = bucket[cB]
  1604			}
  1605			sa[b] = int64(k)
  1606			b++
  1607		}
  1608	}
  1609	
  1610	func induceS_8_64(text []byte, sa, freq, bucket []int64) {
  1611		// Initialize positions for right side of character buckets.
  1612		bucketMax_8_64(text, freq, bucket)
  1613		bucket = bucket[:256] // eliminate bounds check for bucket[cB] below
  1614	
  1615		cB := byte(0)
  1616		b := bucket[cB]
  1617	
  1618		for i := len(sa) - 1; i >= 0; i-- {
  1619			j := int(sa[i])
  1620			if j >= 0 {
  1621				// Skip non-flagged entry.
  1622				// (This loop can't see an empty entry; 0 means the real zero index.)
  1623				continue
  1624			}
  1625	
  1626			// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1627			j = -j
  1628			sa[i] = int64(j)
  1629	
  1630			// Index j was on work queue (encoded as -j but now decoded),
  1631			// meaning k := j-1 is L-type,
  1632			// so we can now place k correctly into sa.
  1633			// If k-1 is S-type, queue -k for processing later in this loop.
  1634			// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1635			// If k is zero, k-1 doesn't exist, so we only need to leave it
  1636			// for the caller.
  1637			k := j - 1
  1638			c1 := text[k]
  1639			if k > 0 {
  1640				if c0 := text[k-1]; c0 <= c1 {
  1641					k = -k
  1642				}
  1643			}
  1644	
  1645			if cB != c1 {
  1646				bucket[cB] = b
  1647				cB = c1
  1648				b = bucket[cB]
  1649			}
  1650			b--
  1651			sa[b] = int64(k)
  1652		}
  1653	}
  1654	
  1655	func induceS_32(text []int32, sa, freq, bucket []int32) {
  1656		// Initialize positions for right side of character buckets.
  1657		bucketMax_32(text, freq, bucket)
  1658	
  1659		cB := int32(0)
  1660		b := bucket[cB]
  1661	
  1662		for i := len(sa) - 1; i >= 0; i-- {
  1663			j := int(sa[i])
  1664			if j >= 0 {
  1665				// Skip non-flagged entry.
  1666				// (This loop can't see an empty entry; 0 means the real zero index.)
  1667				continue
  1668			}
  1669	
  1670			// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1671			j = -j
  1672			sa[i] = int32(j)
  1673	
  1674			// Index j was on work queue (encoded as -j but now decoded),
  1675			// meaning k := j-1 is L-type,
  1676			// so we can now place k correctly into sa.
  1677			// If k-1 is S-type, queue -k for processing later in this loop.
  1678			// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1679			// If k is zero, k-1 doesn't exist, so we only need to leave it
  1680			// for the caller.
  1681			k := j - 1
  1682			c1 := text[k]
  1683			if k > 0 {
  1684				if c0 := text[k-1]; c0 <= c1 {
  1685					k = -k
  1686				}
  1687			}
  1688	
  1689			if cB != c1 {
  1690				bucket[cB] = b
  1691				cB = c1
  1692				b = bucket[cB]
  1693			}
  1694			b--
  1695			sa[b] = int32(k)
  1696		}
  1697	}
  1698	
  1699	func induceS_64(text []int64, sa, freq, bucket []int64) {
  1700		// Initialize positions for right side of character buckets.
  1701		bucketMax_64(text, freq, bucket)
  1702	
  1703		cB := int64(0)
  1704		b := bucket[cB]
  1705	
  1706		for i := len(sa) - 1; i >= 0; i-- {
  1707			j := int(sa[i])
  1708			if j >= 0 {
  1709				// Skip non-flagged entry.
  1710				// (This loop can't see an empty entry; 0 means the real zero index.)
  1711				continue
  1712			}
  1713	
  1714			// Negative j is a work queue entry; rewrite to positive j for final suffix array.
  1715			j = -j
  1716			sa[i] = int64(j)
  1717	
  1718			// Index j was on work queue (encoded as -j but now decoded),
  1719			// meaning k := j-1 is L-type,
  1720			// so we can now place k correctly into sa.
  1721			// If k-1 is S-type, queue -k for processing later in this loop.
  1722			// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
  1723			// If k is zero, k-1 doesn't exist, so we only need to leave it
  1724			// for the caller.
  1725			k := j - 1
  1726			c1 := text[k]
  1727			if k > 0 {
  1728				if c0 := text[k-1]; c0 <= c1 {
  1729					k = -k
  1730				}
  1731			}
  1732	
  1733			if cB != c1 {
  1734				bucket[cB] = b
  1735				cB = c1
  1736				b = bucket[cB]
  1737			}
  1738			b--
  1739			sa[b] = int64(k)
  1740		}
  1741	}
  1742
View as plain text