Source file src/pkg/cmd/compile/internal/ssa/magic.go

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package ssa
     6	
     7	import (
     8		"math/big"
     9		"math/bits"
    10	)
    11	
    12	// So you want to compute x / c for some constant c?
    13	// Machine division instructions are slow, so we try to
    14	// compute this division with a multiplication + a few
    15	// other cheap instructions instead.
    16	// (We assume here that c != 0, +/- 1, or +/- 2^i.  Those
    17	// cases are easy to handle in different ways).
    18	
    19	// Technique from https://gmplib.org/~tege/divcnst-pldi94.pdf
    20	
    21	// First consider unsigned division.
    22	// Our strategy is to precompute 1/c then do
    23	//   ⎣x / c⎦ = ⎣x * (1/c)⎦.
    24	// 1/c is less than 1, so we can't compute it directly in
    25	// integer arithmetic.  Let's instead compute 2^e/c
    26	// for a value of e TBD (^ = exponentiation).  Then
    27	//   ⎣x / c⎦ = ⎣x * (2^e/c) / 2^e⎦.
    28	// Dividing by 2^e is easy.  2^e/c isn't an integer, unfortunately.
    29	// So we must approximate it.  Let's call its approximation m.
    30	// We'll then compute
    31	//   ⎣x * m / 2^e⎦
    32	// Which we want to be equal to ⎣x / c⎦ for 0 <= x < 2^n-1
    33	// where n is the word size.
    34	// Setting x = c gives us c * m >= 2^e.
    35	// We'll chose m = ⎡2^e/c⎤ to satisfy that equation.
    36	// What remains is to choose e.
    37	// Let m = 2^e/c + delta, 0 <= delta < 1
    38	//   ⎣x * (2^e/c + delta) / 2^e⎦
    39	//   ⎣x / c + x * delta / 2^e⎦
    40	// We must have x * delta / 2^e < 1/c so that this
    41	// additional term never rounds differently than ⎣x / c⎦ does.
    42	// Rearranging,
    43	//   2^e > x * delta * c
    44	// x can be at most 2^n-1 and delta can be at most 1.
    45	// So it is sufficient to have 2^e >= 2^n*c.
    46	// So we'll choose e = n + s, with s = ⎡log2(c)⎤.
    47	//
    48	// An additional complication arises because m has n+1 bits in it.
    49	// Hardware restricts us to n bit by n bit multiplies.
    50	// We divide into 3 cases:
    51	//
    52	// Case 1: m is even.
    53	//   ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦
    54	//   ⎣x / c⎦ = ⎣x * (m/2) / 2^(n+s-1)⎦
    55	//   ⎣x / c⎦ = ⎣x * (m/2) / 2^n / 2^(s-1)⎦
    56	//   ⎣x / c⎦ = ⎣⎣x * (m/2) / 2^n⎦ / 2^(s-1)⎦
    57	//   multiply + shift
    58	//
    59	// Case 2: c is even.
    60	//   ⎣x / c⎦ = ⎣(x/2) / (c/2)⎦
    61	//   ⎣x / c⎦ = ⎣⎣x/2⎦ / (c/2)⎦
    62	//     This is just the original problem, with x' = ⎣x/2⎦, c' = c/2, n' = n-1.
    63	//       s' = s-1
    64	//       m' = ⎡2^(n'+s')/c'⎤
    65	//          = ⎡2^(n+s-1)/c⎤
    66	//          = ⎡m/2⎤
    67	//   ⎣x / c⎦ = ⎣x' * m' / 2^(n'+s')⎦
    68	//   ⎣x / c⎦ = ⎣⎣x/2⎦ * ⎡m/2⎤ / 2^(n+s-2)⎦
    69	//   ⎣x / c⎦ = ⎣⎣⎣x/2⎦ * ⎡m/2⎤ / 2^n⎦ / 2^(s-2)⎦
    70	//   shift + multiply + shift
    71	//
    72	// Case 3: everything else
    73	//   let k = m - 2^n. k fits in n bits.
    74	//   ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦
    75	//   ⎣x / c⎦ = ⎣x * (2^n + k) / 2^(n+s)⎦
    76	//   ⎣x / c⎦ = ⎣(x + x * k / 2^n) / 2^s⎦
    77	//   ⎣x / c⎦ = ⎣(x + ⎣x * k / 2^n⎦) / 2^s⎦
    78	//   ⎣x / c⎦ = ⎣(x + ⎣x * k / 2^n⎦) / 2^s⎦
    79	//   ⎣x / c⎦ = ⎣⎣(x + ⎣x * k / 2^n⎦) / 2⎦ / 2^(s-1)⎦
    80	//   multiply + avg + shift
    81	//
    82	// These can be implemented in hardware using:
    83	//  ⎣a * b / 2^n⎦ - aka high n bits of an n-bit by n-bit multiply.
    84	//  ⎣(a+b) / 2⎦   - aka "average" of two n-bit numbers.
    85	//                  (Not just a regular add & shift because the intermediate result
    86	//                   a+b has n+1 bits in it.  Nevertheless, can be done
    87	//                   in 2 instructions on x86.)
    88	
    89	// umagicOK reports whether we should strength reduce a n-bit divide by c.
    90	func umagicOK(n uint, c int64) bool {
    91		// Convert from ConstX auxint values to the real uint64 constant they represent.
    92		d := uint64(c) << (64 - n) >> (64 - n)
    93	
    94		// Doesn't work for 0.
    95		// Don't use for powers of 2.
    96		return d&(d-1) != 0
    97	}
    98	
    99	type umagicData struct {
   100		s int64  // ⎡log2(c)⎤
   101		m uint64 // ⎡2^(n+s)/c⎤ - 2^n
   102	}
   103	
   104	// umagic computes the constants needed to strength reduce unsigned n-bit divides by the constant uint64(c).
   105	// The return values satisfy for all 0 <= x < 2^n
   106	//  floor(x / uint64(c)) = x * (m + 2^n) >> (n+s)
   107	func umagic(n uint, c int64) umagicData {
   108		// Convert from ConstX auxint values to the real uint64 constant they represent.
   109		d := uint64(c) << (64 - n) >> (64 - n)
   110	
   111		C := new(big.Int).SetUint64(d)
   112		s := C.BitLen()
   113		M := big.NewInt(1)
   114		M.Lsh(M, n+uint(s))     // 2^(n+s)
   115		M.Add(M, C)             // 2^(n+s)+c
   116		M.Sub(M, big.NewInt(1)) // 2^(n+s)+c-1
   117		M.Div(M, C)             // ⎡2^(n+s)/c⎤
   118		if M.Bit(int(n)) != 1 {
   119			panic("n+1st bit isn't set")
   120		}
   121		M.SetBit(M, int(n), 0)
   122		m := M.Uint64()
   123		return umagicData{s: int64(s), m: m}
   124	}
   125	
   126	// For signed division, we use a similar strategy.
   127	// First, we enforce a positive c.
   128	//   x / c = -(x / (-c))
   129	// This will require an additional Neg op for c<0.
   130	//
   131	// If x is positive we're in a very similar state
   132	// to the unsigned case above.  We define:
   133	//   s = ⎡log2(c)⎤-1
   134	//   m = ⎡2^(n+s)/c⎤
   135	// Then
   136	//   ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦
   137	// If x is negative we have
   138	//   ⎡x / c⎤ = ⎣x * m / 2^(n+s)⎦ + 1
   139	// (TODO: derivation?)
   140	//
   141	// The multiply is a bit odd, as it is a signed n-bit value
   142	// times an unsigned n-bit value.  For n smaller than the
   143	// word size, we can extend x and m appropriately and use the
   144	// signed multiply instruction.  For n == word size,
   145	// we must use the signed multiply high and correct
   146	// the result by adding x*2^n.
   147	//
   148	// Adding 1 if x<0 is done by subtracting x>>(n-1).
   149	
   150	func smagicOK(n uint, c int64) bool {
   151		if c < 0 {
   152			// Doesn't work for negative c.
   153			return false
   154		}
   155		// Doesn't work for 0.
   156		// Don't use it for powers of 2.
   157		return c&(c-1) != 0
   158	}
   159	
   160	type smagicData struct {
   161		s int64  // ⎡log2(c)⎤-1
   162		m uint64 // ⎡2^(n+s)/c⎤
   163	}
   164	
   165	// magic computes the constants needed to strength reduce signed n-bit divides by the constant c.
   166	// Must have c>0.
   167	// The return values satisfy for all -2^(n-1) <= x < 2^(n-1)
   168	//  trunc(x / c) = x * m >> (n+s) + (x < 0 ? 1 : 0)
   169	func smagic(n uint, c int64) smagicData {
   170		C := new(big.Int).SetInt64(c)
   171		s := C.BitLen() - 1
   172		M := big.NewInt(1)
   173		M.Lsh(M, n+uint(s))     // 2^(n+s)
   174		M.Add(M, C)             // 2^(n+s)+c
   175		M.Sub(M, big.NewInt(1)) // 2^(n+s)+c-1
   176		M.Div(M, C)             // ⎡2^(n+s)/c⎤
   177		if M.Bit(int(n)) != 0 {
   178			panic("n+1st bit is set")
   179		}
   180		if M.Bit(int(n-1)) == 0 {
   181			panic("nth bit is not set")
   182		}
   183		m := M.Uint64()
   184		return smagicData{s: int64(s), m: m}
   185	}
   186	
   187	// Divisibility x%c == 0 can be checked more efficiently than directly computing
   188	// the modulus x%c and comparing against 0.
   189	//
   190	// The same "Division by invariant integers using multiplication" paper
   191	// by Granlund and Montgomery referenced above briefly mentions this method
   192	// and it is further elaborated in "Hacker's Delight" by Warren Section 10-17
   193	//
   194	// The first thing to note is that for odd integers, exact division can be computed
   195	// by using the modular inverse with respect to the word size 2^n.
   196	//
   197	// Given c, compute m such that (c * m) mod 2^n == 1
   198	// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n
   199	//
   200	// x can range from 0, c, 2c, 3c, ... ⎣(2^n - 1)/c⎦ * c the maximum multiple
   201	// Thus, x*m mod 2^n is 0, 1, 2, 3, ... ⎣(2^n - 1)/c⎦
   202	// i.e. the quotient takes all values from zero up to max = ⎣(2^n - 1)/c⎦
   203	//
   204	// If x is not divisible by c, then x*m mod 2^n must take some larger value than max.
   205	//
   206	// This gives x*m mod 2^n <= ⎣(2^n - 1)/c⎦ as a test for divisibility
   207	// involving one multiplication and compare.
   208	//
   209	// To extend this to even integers, consider c = d0 * 2^k where d0 is odd.
   210	// We can test whether x is divisible by both d0 and 2^k.
   211	// For d0, the test is the same as above.  Let m be such that m*d0 mod 2^n == 1
   212	// Then x*m mod 2^n <= ⎣(2^n - 1)/d0⎦ is the first test.
   213	// The test for divisibility by 2^k is a check for k trailing zeroes.
   214	// Note that since d0 is odd, m is odd and thus x*m will have the same number of
   215	// trailing zeroes as x.  So the two tests are,
   216	//
   217	// x*m mod 2^n <= ⎣(2^n - 1)/d0⎦
   218	// and x*m ends in k zero bits
   219	//
   220	// These can be combined into a single comparison by the following
   221	// (theorem ZRU in Hacker's Delight) for unsigned integers.
   222	//
   223	// x <= a and x ends in k zero bits if and only if RotRight(x ,k) <= ⎣a/(2^k)⎦
   224	// Where RotRight(x ,k) is right rotation of x by k bits.
   225	//
   226	// To prove the first direction, x <= a -> ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦
   227	// But since x ends in k zeroes all the rotated bits would be zero too.
   228	// So RotRight(x, k) == ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦
   229	//
   230	// If x does not end in k zero bits, then RotRight(x, k)
   231	// has some non-zero bits in the k highest bits.
   232	// ⎣x/(2^k)⎦ has all zeroes in the k highest bits,
   233	// so RotRight(x, k) > ⎣x/(2^k)⎦
   234	//
   235	// Finally, if x > a and has k trailing zero bits, then RotRight(x, k) == ⎣x/(2^k)⎦
   236	// and ⎣x/(2^k)⎦ must be greater than ⎣a/(2^k)⎦, that is the top n-k bits of x must
   237	// be greater than the top n-k bits of a because the rest of x bits are zero.
   238	//
   239	// So the two conditions about can be replaced with the single test
   240	//
   241	// RotRight(x*m mod 2^n, k) <= ⎣(2^n - 1)/c⎦
   242	//
   243	// Where d0*2^k was replaced by c on the right hand side.
   244	
   245	// uivisibleOK reports whether we should strength reduce a n-bit dividisibilty check by c.
   246	func udivisibleOK(n uint, c int64) bool {
   247		// Convert from ConstX auxint values to the real uint64 constant they represent.
   248		d := uint64(c) << (64 - n) >> (64 - n)
   249	
   250		// Doesn't work for 0.
   251		// Don't use for powers of 2.
   252		return d&(d-1) != 0
   253	}
   254	
   255	type udivisibleData struct {
   256		k   int64  // trailingZeros(c)
   257		m   uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n
   258		max uint64 // ⎣(2^n - 1)/ c⎦ max value to for divisibility
   259	}
   260	
   261	func udivisible(n uint, c int64) udivisibleData {
   262		// Convert from ConstX auxint values to the real uint64 constant they represent.
   263		d := uint64(c) << (64 - n) >> (64 - n)
   264	
   265		k := bits.TrailingZeros64(d)
   266		d0 := d >> uint(k) // the odd portion of the divisor
   267	
   268		mask := ^uint64(0) >> (64 - n)
   269	
   270		// Calculate the multiplicative inverse via Newton's method.
   271		// Quadratic convergence doubles the number of correct bits per iteration.
   272		m := d0            // initial guess correct to 3-bits d0*d0 mod 8 == 1
   273		m = m * (2 - m*d0) // 6-bits
   274		m = m * (2 - m*d0) // 12-bits
   275		m = m * (2 - m*d0) // 24-bits
   276		m = m * (2 - m*d0) // 48-bits
   277		m = m * (2 - m*d0) // 96-bits >= 64-bits
   278		m = m & mask
   279	
   280		max := mask / d
   281	
   282		return udivisibleData{
   283			k:   int64(k),
   284			m:   m,
   285			max: max,
   286		}
   287	}
   288	
   289	// For signed integers, a similar method follows.
   290	//
   291	// Given c > 1 and odd, compute m such that (c * m) mod 2^n == 1
   292	// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n
   293	//
   294	// x can range from ⎡-2^(n-1)/c⎤ * c, ... -c, 0, c, ...  ⎣(2^(n-1) - 1)/c⎦ * c
   295	// Thus, x*m mod 2^n is ⎡-2^(n-1)/c⎤, ... -2, -1, 0, 1, 2, ... ⎣(2^(n-1) - 1)/c⎦
   296	//
   297	// So, x is a multiple of c if and only if:
   298	// ⎡-2^(n-1)/c⎤ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦
   299	//
   300	// Since c > 1 and odd, this can be simplified by
   301	// ⎡-2^(n-1)/c⎤ == ⎡(-2^(n-1) + 1)/c⎤ == -⎣(2^(n-1) - 1)/c⎦
   302	//
   303	// -⎣(2^(n-1) - 1)/c⎦ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦
   304	//
   305	// To extend this to even integers, consider c = d0 * 2^k where d0 is odd.
   306	// We can test whether x is divisible by both d0 and 2^k.
   307	//
   308	// Let m be such that (d0 * m) mod 2^n == 1.
   309	// Let q = x*m mod 2^n. Then c divides x if:
   310	//
   311	// -⎣(2^(n-1) - 1)/d0⎦ <= q <= ⎣(2^(n-1) - 1)/d0⎦ and q ends in at least k 0-bits
   312	//
   313	// To transform this to a single comparison, we use the following theorem (ZRS in Hacker's Delight).
   314	//
   315	// For a >= 0 the following conditions are equivalent:
   316	// 1) -a <= x <= a and x ends in at least k 0-bits
   317	// 2) RotRight(x+a', k) <= ⎣2a'/2^k⎦
   318	//
   319	// Where a' = a & -2^k (a with its right k bits set to zero)
   320	//
   321	// To see that 1 & 2 are equivalent, note that -a <= x <= a is equivalent to
   322	// -a' <= x <= a' if and only if x ends in at least k 0-bits.  Adding -a' to each side gives,
   323	// 0 <= x + a' <= 2a' and x + a' ends in at least k 0-bits if and only if x does since a' has
   324	// k 0-bits by definition.  We can use theorem ZRU above with x -> x + a' and a -> 2a' giving 1) == 2).
   325	//
   326	// Let m be such that (d0 * m) mod 2^n == 1.
   327	// Let q = x*m mod 2^n.
   328	// Let a' = ⎣(2^(n-1) - 1)/d0⎦ & -2^k
   329	//
   330	// Then the divisibility test is:
   331	//
   332	// RotRight(q+a', k) <= ⎣2a'/2^k⎦
   333	//
   334	// Note that the calculation is performed using unsigned integers.
   335	// Since a' can have n-1 bits, 2a' may have n bits and there is no risk of overflow.
   336	
   337	// sdivisibleOK reports whether we should strength reduce a n-bit dividisibilty check by c.
   338	func sdivisibleOK(n uint, c int64) bool {
   339		if c < 0 {
   340			// Doesn't work for negative c.
   341			return false
   342		}
   343		// Doesn't work for 0.
   344		// Don't use it for powers of 2.
   345		return c&(c-1) != 0
   346	}
   347	
   348	type sdivisibleData struct {
   349		k   int64  // trailingZeros(c)
   350		m   uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n
   351		a   uint64 // ⎣(2^(n-1) - 1)/ (c>>k)⎦ & -(1<<k) additive constant
   352		max uint64 // ⎣(2 a) / (1<<k)⎦ max value to for divisibility
   353	}
   354	
   355	func sdivisible(n uint, c int64) sdivisibleData {
   356		d := uint64(c)
   357		k := bits.TrailingZeros64(d)
   358		d0 := d >> uint(k) // the odd portion of the divisor
   359	
   360		mask := ^uint64(0) >> (64 - n)
   361	
   362		// Calculate the multiplicative inverse via Newton's method.
   363		// Quadratic convergence doubles the number of correct bits per iteration.
   364		m := d0            // initial guess correct to 3-bits d0*d0 mod 8 == 1
   365		m = m * (2 - m*d0) // 6-bits
   366		m = m * (2 - m*d0) // 12-bits
   367		m = m * (2 - m*d0) // 24-bits
   368		m = m * (2 - m*d0) // 48-bits
   369		m = m * (2 - m*d0) // 96-bits >= 64-bits
   370		m = m & mask
   371	
   372		a := ((mask >> 1) / d0) & -(1 << uint(k))
   373		max := (2 * a) >> uint(k)
   374	
   375		return sdivisibleData{
   376			k:   int64(k),
   377			m:   m,
   378			a:   a,
   379			max: max,
   380		}
   381	}
   382
View as plain text