Source file src/pkg/math/big/nat.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file implements unsigned multi-precision integers (natural
     6	// numbers). They are the building blocks for the implementation
     7	// of signed integers, rationals, and floating-point numbers.
     8	//
     9	// Caution: This implementation relies on the function "alias"
    10	//          which assumes that (nat) slice capacities are never
    11	//          changed (no 3-operand slice expressions). If that
    12	//          changes, alias needs to be updated for correctness.
    13	
    14	package big
    15	
    16	import (
    17		"encoding/binary"
    18		"math/bits"
    19		"math/rand"
    20		"sync"
    21	)
    22	
// A nat is an unsigned integer x of the form
//
//   x = x[n-1]*_B^(n-1) + x[n-2]*_B^(n-2) + ... + x[1]*_B + x[0]
//
// with 0 <= x[i] < _B and 0 <= i < n. It is stored in a slice of length n,
// with the digits x[i] as the slice elements (least significant digit
// first, at index 0).
//
// A number is normalized if the slice contains no leading 0 digits,
// i.e. its last element (if any) is nonzero.
// During arithmetic operations, denormalized values may occur but are
// always normalized before returning the final result. The normalized
// representation of 0 is the empty or nil slice (length = 0).
//
type nat []Word
    36	
// Frequently used small values in nat form.
// NOTE(review): these are shared package-level slices, so callers
// presumably must treat them as read-only - confirm against users.
var (
	natOne  = nat{1}
	natTwo  = nat{2}
	natFive = nat{5}
	natTen  = nat{10}
)
    43	
    44	func (z nat) clear() {
    45		for i := range z {
    46			z[i] = 0
    47		}
    48	}
    49	
    50	func (z nat) norm() nat {
    51		i := len(z)
    52		for i > 0 && z[i-1] == 0 {
    53			i--
    54		}
    55		return z[0:i]
    56	}
    57	
    58	func (z nat) make(n int) nat {
    59		if n <= cap(z) {
    60			return z[:n] // reuse z
    61		}
    62		if n == 1 {
    63			// Most nats start small and stay that way; don't over-allocate.
    64			return make(nat, 1)
    65		}
    66		// Choosing a good value for e has significant performance impact
    67		// because it increases the chance that a value can be reused.
    68		const e = 4 // extra capacity
    69		return make(nat, n, n+e)
    70	}
    71	
    72	func (z nat) setWord(x Word) nat {
    73		if x == 0 {
    74			return z[:0]
    75		}
    76		z = z.make(1)
    77		z[0] = x
    78		return z
    79	}
    80	
    81	func (z nat) setUint64(x uint64) nat {
    82		// single-word value
    83		if w := Word(x); uint64(w) == x {
    84			return z.setWord(w)
    85		}
    86		// 2-word value
    87		z = z.make(2)
    88		z[1] = Word(x >> 32)
    89		z[0] = Word(x)
    90		return z
    91	}
    92	
    93	func (z nat) set(x nat) nat {
    94		z = z.make(len(x))
    95		copy(z, x)
    96		return z
    97	}
    98	
    99	func (z nat) add(x, y nat) nat {
   100		m := len(x)
   101		n := len(y)
   102	
   103		switch {
   104		case m < n:
   105			return z.add(y, x)
   106		case m == 0:
   107			// n == 0 because m >= n; result is 0
   108			return z[:0]
   109		case n == 0:
   110			// result is x
   111			return z.set(x)
   112		}
   113		// m > 0
   114	
   115		z = z.make(m + 1)
   116		c := addVV(z[0:n], x, y)
   117		if m > n {
   118			c = addVW(z[n:m], x[n:], c)
   119		}
   120		z[m] = c
   121	
   122		return z.norm()
   123	}
   124	
   125	func (z nat) sub(x, y nat) nat {
   126		m := len(x)
   127		n := len(y)
   128	
   129		switch {
   130		case m < n:
   131			panic("underflow")
   132		case m == 0:
   133			// n == 0 because m >= n; result is 0
   134			return z[:0]
   135		case n == 0:
   136			// result is x
   137			return z.set(x)
   138		}
   139		// m > 0
   140	
   141		z = z.make(m)
   142		c := subVV(z[0:n], x, y)
   143		if m > n {
   144			c = subVW(z[n:], x[n:], c)
   145		}
   146		if c != 0 {
   147			panic("underflow")
   148		}
   149	
   150		return z.norm()
   151	}
   152	
   153	func (x nat) cmp(y nat) (r int) {
   154		m := len(x)
   155		n := len(y)
   156		if m != n || m == 0 {
   157			switch {
   158			case m < n:
   159				r = -1
   160			case m > n:
   161				r = 1
   162			}
   163			return
   164		}
   165	
   166		i := m - 1
   167		for i > 0 && x[i] == y[i] {
   168			i--
   169		}
   170	
   171		switch {
   172		case x[i] < y[i]:
   173			r = -1
   174		case x[i] > y[i]:
   175			r = 1
   176		}
   177		return
   178	}
   179	
   180	func (z nat) mulAddWW(x nat, y, r Word) nat {
   181		m := len(x)
   182		if m == 0 || y == 0 {
   183			return z.setWord(r) // result is r
   184		}
   185		// m > 0
   186	
   187		z = z.make(m + 1)
   188		z[m] = mulAddVWW(z[0:m], x, y, r)
   189	
   190		return z.norm()
   191	}
   192	
   193	// basicMul multiplies x and y and leaves the result in z.
   194	// The (non-normalized) result is placed in z[0 : len(x) + len(y)].
   195	func basicMul(z, x, y nat) {
   196		z[0 : len(x)+len(y)].clear() // initialize z
   197		for i, d := range y {
   198			if d != 0 {
   199				z[len(x)+i] = addMulVVW(z[i:i+len(x)], x, d)
   200			}
   201		}
   202	}
   203	
// montgomery computes z mod m = x*y*2**(-n*_W) mod m,
// assuming k = -1/m mod 2**_W.
// z is used for storing the result which is returned;
// z must not alias x, y or m.
// See Gueron, "Efficient Software Implementations of Modular Exponentiation".
// https://eprint.iacr.org/2011/239.pdf
// In the terminology of that paper, this is an "Almost Montgomery Multiplication":
// the inputs x and y are required to satisfy 0 <= x, y < 2**(n*_W) and then the
// result z is guaranteed to satisfy 0 <= z < 2**(n*_W), but it may not be < m.
//
// montgomery panics if x, y and m do not all have length n.
func (z nat) montgomery(x, y, m nat, k Word, n int) nat {
	// This code assumes x, y, m are all the same length, n.
	// (required by addMulVVW and the for loop).
	// It also assumes that x, y are already reduced mod m,
	// or else the result will not be properly reduced.
	if len(x) != n || len(y) != n || len(m) != n {
		panic("math/big: mismatched montgomery number lengths")
	}
	// z holds 2*n words of working space; the result ends up in z[:n].
	z = z.make(n * 2)
	z.clear()
	var c Word // carry out of the previous iteration (0 or 1)
	for i := 0; i < n; i++ {
		d := y[i]
		// accumulate x*y[i] into the window z[i:n+i]
		c2 := addMulVVW(z[i:n+i], x, d)
		// t is chosen using k so that adding m*t affects z[i]
		// (see the k = -1/m mod 2**_W assumption above)
		t := z[i] * k
		c3 := addMulVVW(z[i:n+i], m, t)
		// combine the three carries; cx and cy may wrap around
		cx := c + c2
		cy := cx + c3
		z[n+i] = cy
		// a wrapped sum is smaller than its addend: that is the carry
		if cx < c2 || cy < c3 {
			c = 1
		} else {
			c = 0
		}
	}
	// The upper half z[n:] holds the result, with c as an extra top bit.
	if c != 0 {
		subVV(z[:n], z[n:], m)
	} else {
		copy(z[:n], z[n:])
	}
	return z[:n]
}
   245	
   246	// Fast version of z[0:n+n>>1].add(z[0:n+n>>1], x[0:n]) w/o bounds checks.
   247	// Factored out for readability - do not use outside karatsuba.
   248	func karatsubaAdd(z, x nat, n int) {
   249		if c := addVV(z[0:n], z, x); c != 0 {
   250			addVW(z[n:n+n>>1], z[n:], c)
   251		}
   252	}
   253	
   254	// Like karatsubaAdd, but does subtract.
   255	func karatsubaSub(z, x nat, n int) {
   256		if c := subVV(z[0:n], z, x); c != 0 {
   257			subVW(z[n:n+n>>1], z[n:], c)
   258		}
   259	}
   260	
// Operands with fewer than karatsubaThreshold words are multiplied using
// "grade school" multiplication; for longer operands the Karatsuba algorithm
// is used.
var karatsubaThreshold = 40 // computed by calibrate_test.go
   265	
// karatsuba multiplies x and y and leaves the result in z.
// Both x and y must have the same length n and n must be a
// power of 2. The result vector z must have len(z) >= 6*n.
// The (non-normalized) result is placed in z[0 : 2*n].
// The remainder of z is used as scratch space and is overwritten.
func karatsuba(z, x, y nat) {
	n := len(y)

	// Switch to basic multiplication if numbers are odd or small.
	// (n is always even if karatsubaThreshold is even, but be
	// conservative)
	if n&1 != 0 || n < karatsubaThreshold || n < 2 {
		basicMul(z, x, y)
		return
	}
	// n&1 == 0 && n >= karatsubaThreshold && n >= 2

	// Karatsuba multiplication is based on the observation that
	// for two numbers x and y with:
	//
	//   x = x1*b + x0
	//   y = y1*b + y0
	//
	// the product x*y can be obtained with 3 products z2, z1, z0
	// instead of 4:
	//
	//   x*y = x1*y1*b*b + (x1*y0 + x0*y1)*b + x0*y0
	//       =    z2*b*b +              z1*b +    z0
	//
	// with:
	//
	//   xd = x1 - x0
	//   yd = y0 - y1
	//
	//   z1 =      xd*yd                    + z2 + z0
	//      = (x1-x0)*(y0 - y1)             + z2 + z0
	//      = x1*y0 - x1*y1 - x0*y0 + x0*y1 + z2 + z0
	//      = x1*y0 -    z2 -    z0 + x0*y1 + z2 + z0
	//      = x1*y0                 + x0*y1

	// split x, y into "digits"
	n2 := n >> 1              // n2 >= 1
	x1, x0 := x[n2:], x[0:n2] // x = x1*b + x0
	y1, y0 := y[n2:], y[0:n2] // y = y1*b + y0

	// z is used for the result and temporary storage:
	//
	//   6*n     5*n     4*n     3*n     2*n     1*n     0*n
	// z = [z2 copy|z0 copy| xd*yd | yd:xd | x1*y1 | x0*y0 ]
	//
	// For each recursive call of karatsuba, an unused slice of
	// z is passed in that has (at least) half the length of the
	// caller's z.

	// compute z0 and z2 with the result "in place" in z
	karatsuba(z, x0, y0)     // z0 = x0*y0
	karatsuba(z[n:], x1, y1) // z2 = x1*y1

	// compute xd (or the negative value if underflow occurs)
	s := 1 // sign of product xd*yd
	xd := z[2*n : 2*n+n2]
	if subVV(xd, x1, x0) != 0 { // x1-x0
		s = -s
		subVV(xd, x0, x1) // x0-x1
	}

	// compute yd (or the negative value if underflow occurs)
	yd := z[2*n+n2 : 3*n]
	if subVV(yd, y0, y1) != 0 { // y0-y1
		s = -s
		subVV(yd, y1, y0) // y1-y0
	}

	// p holds |xd|*|yd|; s records the sign of xd*yd:
	// p = (x1-x0)*(y0-y1) == x1*y0 - x1*y1 - x0*y0 + x0*y1 for s > 0
	// p = (x0-x1)*(y0-y1) == x0*y0 - x0*y1 - x1*y0 + x1*y1 for s < 0
	p := z[n*3:]
	karatsuba(p, xd, yd)

	// save original z2:z0
	// (ok to use upper half of z since we're done recursing)
	r := z[n*4:]
	copy(r, z[:n*2])

	// add up all partial products
	//
	//   2*n     n     0
	// z = [ z2  | z0  ]
	//   +    [ z0  ]
	//   +    [ z2  ]
	//   +    [  p  ]
	//
	// (p is subtracted instead of added when s < 0)
	karatsubaAdd(z[n2:], r, n)
	karatsubaAdd(z[n2:], r[n:], n)
	if s > 0 {
		karatsubaAdd(z[n2:], p, n)
	} else {
		karatsubaSub(z[n2:], p, n)
	}
}
   364	
   365	// alias reports whether x and y share the same base array.
   366	// Note: alias assumes that the capacity of underlying arrays
   367	//       is never changed for nat values; i.e. that there are
   368	//       no 3-operand slice expressions in this code (or worse,
   369	//       reflect-based operations to the same effect).
   370	func alias(x, y nat) bool {
   371		return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1]
   372	}
   373	
   374	// addAt implements z += x<<(_W*i); z must be long enough.
   375	// (we don't use nat.add because we need z to stay the same
   376	// slice, and we don't need to normalize z after each addition)
   377	func addAt(z, x nat, i int) {
   378		if n := len(x); n > 0 {
   379			if c := addVV(z[i:i+n], z[i:], x); c != 0 {
   380				j := i + n
   381				if j < len(z) {
   382					addVW(z[j:], z[j:], c)
   383				}
   384			}
   385		}
   386	}
   387	
   388	func max(x, y int) int {
   389		if x > y {
   390			return x
   391		}
   392		return y
   393	}
   394	
   395	// karatsubaLen computes an approximation to the maximum k <= n such that
   396	// k = p<<i for a number p <= threshold and an i >= 0. Thus, the
   397	// result is the largest number that can be divided repeatedly by 2 before
   398	// becoming about the value of threshold.
   399	func karatsubaLen(n, threshold int) int {
   400		i := uint(0)
   401		for n > threshold {
   402			n >>= 1
   403			i++
   404		}
   405		return n << i
   406	}
   407	
// mul sets z = x*y and returns the normalized result.
// If z shares its underlying array with x or y, a fresh slice is
// allocated for the result. Operands shorter than karatsubaThreshold
// words are multiplied with basicMul; longer ones use karatsuba on
// the low k words and add the remaining partial products.
func (z nat) mul(x, y nat) nat {
	m := len(x)
	n := len(y)

	switch {
	case m < n:
		return z.mul(y, x)
	case m == 0 || n == 0:
		return z[:0]
	case n == 1:
		return z.mulAddWW(x, y[0], 0)
	}
	// m >= n > 1

	// determine if z can be reused
	if alias(z, x) || alias(z, y) {
		z = nil // z is an alias for x or y - cannot reuse
	}

	// use basic multiplication if the numbers are small
	if n < karatsubaThreshold {
		z = z.make(m + n)
		basicMul(z, x, y)
		return z.norm()
	}
	// m >= n && n >= karatsubaThreshold && n >= 2

	// determine Karatsuba length k such that
	//
	//   x = xh*b + x0  (0 <= x0 < b)
	//   y = yh*b + y0  (0 <= y0 < b)
	//   b = 1<<(_W*k)  ("base" of digits xi, yi)
	//
	k := karatsubaLen(n, karatsubaThreshold)
	// k <= n

	// multiply x0 and y0 via Karatsuba
	x0 := x[0:k]              // x0 is not normalized
	y0 := y[0:k]              // y0 is not normalized
	z = z.make(max(6*k, m+n)) // enough space for karatsuba of x0*y0 and full result of x*y
	karatsuba(z, x0, y0)
	z = z[0 : m+n]  // z has final length but may be incomplete
	z[2*k:].clear() // upper portion of z is garbage (and 2*k <= m+n since k <= n <= m)

	// If xh != 0 or yh != 0, add the missing terms to z. For
	//
	//   xh = xi*b^i + ... + x2*b^2 + x1*b (0 <= xi < b)
	//   yh =                         y1*b (0 <= y1 < b)
	//
	// the missing terms are
	//
	//   x0*y1*b and xi*y0*b^i, xi*y1*b^(i+1) for i > 0
	//
	// since all the yi for i > 1 are 0 by choice of k: If any of them
	// were > 0, then yh >= b^2 and thus y >= b^2. Then k' = k*2 would
	// be a larger valid threshold contradicting the assumption about k.
	//
	if k < n || m != n {
		var t nat // scratch product, reused across the additions below

		// add x0*y1*b
		x0 := x0.norm()
		y1 := y[k:]       // y1 is normalized because y is
		t = t.mul(x0, y1) // update t so we don't lose t's underlying array
		addAt(z, t, k)

		// add xi*y0<<i, xi*y1*b<<(i+k)
		y0 := y0.norm()
		for i := k; i < len(x); i += k {
			// xi is the next chunk of at most k words of x
			xi := x[i:]
			if len(xi) > k {
				xi = xi[:k]
			}
			xi = xi.norm()
			t = t.mul(xi, y0)
			addAt(z, t, i)
			t = t.mul(xi, y1)
			addAt(z, t, i+k)
		}
	}

	return z.norm()
}
   491	
// basicSqr sets z = x*x and is asymptotically faster than basicMul
// by about a factor of 2, but slower for small arguments due to overhead.
// Requirements: len(x) > 0, len(z) == 2*len(x)
// The (non-normalized) result is placed in z.
// It allocates a temporary of 2*len(x) words on every call.
func basicSqr(z, x nat) {
	n := len(x)
	t := make(nat, 2*n)            // temporary variable to hold the products
	z[1], z[0] = mulWW(x[0], x[0]) // the initial square
	for i := 1; i < n; i++ {
		d := x[i]
		// z collects the squares x[i] * x[i]
		z[2*i+1], z[2*i] = mulWW(d, d)
		// t collects the products x[i] * x[j] where j < i
		t[2*i] = addMulVVW(t[i:2*i], x[0:i], d)
	}
	// each cross product x[i]*x[j] (j < i) occurs twice in x*x
	t[2*n-1] = shlVU(t[1:2*n-1], t[1:2*n-1], 1) // double the j < i products
	addVV(z, z, t)                              // combine the result
}
   510	
// karatsubaSqr squares x and leaves the result in z.
// len(x) must be a power of 2 and len(z) >= 6*len(x).
// The (non-normalized) result is placed in z[0 : 2*len(x)].
//
// The algorithm and the layout of z are the same as for karatsuba.
func karatsubaSqr(z, x nat) {
	n := len(x)

	// fall back to basicSqr for odd or small lengths
	if n&1 != 0 || n < karatsubaSqrThreshold || n < 2 {
		basicSqr(z[:2*n], x)
		return
	}

	// split x into a high half x1 and a low half x0
	n2 := n >> 1
	x1, x0 := x[n2:], x[0:n2]

	karatsubaSqr(z, x0)     // z0 = x0*x0, in z[0:n]
	karatsubaSqr(z[n:], x1) // z2 = x1*x1, in z[n:2*n]

	// xd = |x1 - x0|
	// s = sign(xd*yd) == -1 for xd != 0; s == 1 for xd == 0
	xd := z[2*n : 2*n+n2]
	if subVV(xd, x1, x0) != 0 {
		subVV(xd, x0, x1)
	}

	// p = xd*xd
	p := z[n*3:]
	karatsubaSqr(p, xd)

	// save the original z2:z0 before accumulating into z
	r := z[n*4:]
	copy(r, z[:n*2])

	karatsubaAdd(z[n2:], r, n)
	karatsubaAdd(z[n2:], r[n:], n)
	karatsubaSub(z[n2:], p, n) // s == -1 for p != 0; s == 1 for p == 0
}
   546	
// Operands that are shorter than basicSqrThreshold are squared using
// "grade school" multiplication (basicMul); operands of at least
// basicSqrThreshold but fewer than karatsubaSqrThreshold words are squared
// using basicSqr; for operands of karatsubaSqrThreshold words or more
// we use the Karatsuba algorithm optimized for x == y.
var basicSqrThreshold = 20      // computed by calibrate_test.go
var karatsubaSqrThreshold = 260 // computed by calibrate_test.go
   552	
// sqr sets z = x*x and returns the normalized result.
// If z shares its underlying array with x, a fresh slice is allocated
// for the result. The algorithm is selected by the length of x; see
// basicSqrThreshold and karatsubaSqrThreshold.
func (z nat) sqr(x nat) nat {
	n := len(x)
	switch {
	case n == 0:
		return z[:0]
	case n == 1:
		// single word: one double-word multiplication suffices
		d := x[0]
		z = z.make(2)
		z[1], z[0] = mulWW(d, d)
		return z.norm()
	}

	if alias(z, x) {
		z = nil // z is an alias for x - cannot reuse
	}

	if n < basicSqrThreshold {
		z = z.make(2 * n)
		basicMul(z, x, x)
		return z.norm()
	}
	if n < karatsubaSqrThreshold {
		z = z.make(2 * n)
		basicSqr(z, x)
		return z.norm()
	}

	// Use Karatsuba multiplication optimized for x == y.
	// The algorithm and layout of z are the same as for mul.

	// z = (x1*b + x0)^2 = x1^2*b^2 + 2*x1*x0*b + x0^2

	k := karatsubaLen(n, karatsubaSqrThreshold)

	x0 := x[0:k]
	z = z.make(max(6*k, 2*n)) // room for karatsubaSqr scratch and the full result
	karatsubaSqr(z, x0) // z = x0^2
	z = z[0 : 2*n]
	z[2*k:].clear() // words above x0^2 held scratch data

	if k < n {
		var t nat
		x0 := x0.norm()
		x1 := x[k:]
		t = t.mul(x0, x1)
		addAt(z, t, k)
		addAt(z, t, k) // z = 2*x1*x0*b + x0^2
		t = t.sqr(x1)
		addAt(z, t, 2*k) // z = x1^2*b^2 + 2*x1*x0*b + x0^2
	}

	return z.norm()
}
   607	
   608	// mulRange computes the product of all the unsigned integers in the
   609	// range [a, b] inclusively. If a > b (empty range), the result is 1.
   610	func (z nat) mulRange(a, b uint64) nat {
   611		switch {
   612		case a == 0:
   613			// cut long ranges short (optimization)
   614			return z.setUint64(0)
   615		case a > b:
   616			return z.setUint64(1)
   617		case a == b:
   618			return z.setUint64(a)
   619		case a+1 == b:
   620			return z.mul(nat(nil).setUint64(a), nat(nil).setUint64(b))
   621		}
   622		m := (a + b) / 2
   623		return z.mul(nat(nil).mulRange(a, m), nat(nil).mulRange(m+1, b))
   624	}
   625	
   626	// q = (x-r)/y, with 0 <= r < y
   627	func (z nat) divW(x nat, y Word) (q nat, r Word) {
   628		m := len(x)
   629		switch {
   630		case y == 0:
   631			panic("division by zero")
   632		case y == 1:
   633			q = z.set(x) // result is x
   634			return
   635		case m == 0:
   636			q = z[:0] // result is 0
   637			return
   638		}
   639		// m > 0
   640		z = z.make(m)
   641		r = divWVW(z, 0, x, y)
   642		q = z.norm()
   643		return
   644	}
   645	
   646	func (z nat) div(z2, u, v nat) (q, r nat) {
   647		if len(v) == 0 {
   648			panic("division by zero")
   649		}
   650	
   651		if u.cmp(v) < 0 {
   652			q = z[:0]
   653			r = z2.set(u)
   654			return
   655		}
   656	
   657		if len(v) == 1 {
   658			var r2 Word
   659			q, r2 = z.divW(u, v[0])
   660			r = z2.setWord(r2)
   661			return
   662		}
   663	
   664		q, r = z.divLarge(z2, u, v)
   665		return
   666	}
   667	
   668	// getNat returns a *nat of len n. The contents may not be zero.
   669	// The pool holds *nat to avoid allocation when converting to interface{}.
   670	func getNat(n int) *nat {
   671		var z *nat
   672		if v := natPool.Get(); v != nil {
   673			z = v.(*nat)
   674		}
   675		if z == nil {
   676			z = new(nat)
   677		}
   678		*z = z.make(n)
   679		return z
   680	}
   681	
// putNat hands x to natPool so that a later getNat call can reuse it.
// NOTE(review): the caller presumably must not use x (or the slice it
// points to) after this call - confirm at call sites.
func putNat(x *nat) {
	natPool.Put(x)
}
   685	
// natPool holds *nat values for reuse; see getNat and putNat.
var natPool sync.Pool
   687	
// divLarge computes q = (uIn-r)/vIn, with 0 <= r < vIn.
// Uses z as storage for q, and u as storage for r if possible.
// See Knuth, Volume 2, section 4.3.1, Algorithm D.
// Preconditions:
//    len(vIn) >= 2
//    len(uIn) >= len(vIn)
//    u must not alias z
func (z nat) divLarge(u, uIn, vIn nat) (q, r nat) {
	n := len(vIn)
	m := len(uIn) - n

	// D1.
	// Shift the divisor left so that its top word has no leading zeros.
	shift := nlz(vIn[n-1])
	// do not modify vIn, it may be used by another goroutine simultaneously
	vp := getNat(n)
	v := *vp
	shlVU(v, vIn, shift)

	// u may safely alias uIn or vIn, the value of uIn is used to set u and vIn was already used
	// u receives uIn shifted by the same amount, with one extra top word.
	u = u.make(len(uIn) + 1)
	u[len(uIn)] = shlVU(u[0:len(uIn)], uIn, shift)

	// z may safely alias uIn or vIn, both values were used already
	if alias(z, u) {
		z = nil // z is an alias for u - cannot reuse
	}
	q = z.make(m + 1)

	// scratch space for the product qhat*v (n+1 words)
	qhatvp := getNat(n + 1)
	qhatv := *qhatvp

	// D2.
	vn1 := v[n-1]
	for j := m; j >= 0; j-- {
		// D3.
		// Estimate the quotient digit qhat; _M is used when the
		// leading words of the dividend and divisor are equal.
		qhat := Word(_M)
		if ujn := u[j+n]; ujn != vn1 {
			var rhat Word
			qhat, rhat = divWW(ujn, u[j+n-1], vn1)

			// x1 | x2 = q̂v_{n-2}
			vn2 := v[n-2]
			x1, x2 := mulWW(qhat, vn2)
			// test if q̂v_{n-2} > br̂ + u_{j+n-2}
			ujn2 := u[j+n-2]
			for greaterThan(x1, x2, rhat, ujn2) {
				qhat--
				prevRhat := rhat
				rhat += vn1
				// v[n-1] >= 0, so this tests for overflow.
				if rhat < prevRhat {
					break
				}
				x1, x2 = mulWW(qhat, vn2)
			}
		}

		// D4.
		// Multiply and subtract: u[j:j+n+1] -= qhat*v.
		qhatv[n] = mulAddVWW(qhatv[0:n], v, qhat, 0)

		c := subVV(u[j:j+len(qhatv)], u[j:], qhatv)
		if c != 0 {
			// The subtraction borrowed, so qhat was one too large:
			// add v back and decrement qhat.
			c := addVV(u[j:j+n], u[j:], v)
			u[j+n] += c
			qhat--
		}

		q[j] = qhat
	}

	// return the temporaries to the pool
	putNat(vp)
	putNat(qhatvp)

	q = q.norm()
	// undo the shift from D1 to obtain the remainder
	shrVU(u, u, shift)
	r = u.norm()

	return q, r
}
   767	
   768	// Length of x in bits. x must be normalized.
   769	func (x nat) bitLen() int {
   770		if i := len(x) - 1; i >= 0 {
   771			return i*_W + bits.Len(uint(x[i]))
   772		}
   773		return 0
   774	}
   775	
   776	// trailingZeroBits returns the number of consecutive least significant zero
   777	// bits of x.
   778	func (x nat) trailingZeroBits() uint {
   779		if len(x) == 0 {
   780			return 0
   781		}
   782		var i uint
   783		for x[i] == 0 {
   784			i++
   785		}
   786		// x[i] != 0
   787		return i*_W + uint(bits.TrailingZeros(uint(x[i])))
   788	}
   789	
   790	func same(x, y nat) bool {
   791		return len(x) == len(y) && len(x) > 0 && &x[0] == &y[0]
   792	}
   793	
   794	// z = x << s
   795	func (z nat) shl(x nat, s uint) nat {
   796		if s == 0 {
   797			if same(z, x) {
   798				return z
   799			}
   800			if !alias(z, x) {
   801				return z.set(x)
   802			}
   803		}
   804	
   805		m := len(x)
   806		if m == 0 {
   807			return z[:0]
   808		}
   809		// m > 0
   810	
   811		n := m + int(s/_W)
   812		z = z.make(n + 1)
   813		z[n] = shlVU(z[n-m:n], x, s%_W)
   814		z[0 : n-m].clear()
   815	
   816		return z.norm()
   817	}
   818	
   819	// z = x >> s
   820	func (z nat) shr(x nat, s uint) nat {
   821		if s == 0 {
   822			if same(z, x) {
   823				return z
   824			}
   825			if !alias(z, x) {
   826				return z.set(x)
   827			}
   828		}
   829	
   830		m := len(x)
   831		n := m - int(s/_W)
   832		if n <= 0 {
   833			return z[:0]
   834		}
   835		// n > 0
   836	
   837		z = z.make(n)
   838		shrVU(z, x[m-n:], s%_W)
   839	
   840		return z.norm()
   841	}
   842	
   843	func (z nat) setBit(x nat, i uint, b uint) nat {
   844		j := int(i / _W)
   845		m := Word(1) << (i % _W)
   846		n := len(x)
   847		switch b {
   848		case 0:
   849			z = z.make(n)
   850			copy(z, x)
   851			if j >= n {
   852				// no need to grow
   853				return z
   854			}
   855			z[j] &^= m
   856			return z.norm()
   857		case 1:
   858			if j >= n {
   859				z = z.make(j + 1)
   860				z[n:].clear()
   861			} else {
   862				z = z.make(n)
   863			}
   864			copy(z, x)
   865			z[j] |= m
   866			// no need to normalize
   867			return z
   868		}
   869		panic("set bit is not 0 or 1")
   870	}
   871	
   872	// bit returns the value of the i'th bit, with lsb == bit 0.
   873	func (x nat) bit(i uint) uint {
   874		j := i / _W
   875		if j >= uint(len(x)) {
   876			return 0
   877		}
   878		// 0 <= j < len(x)
   879		return uint(x[j] >> (i % _W) & 1)
   880	}
   881	
   882	// sticky returns 1 if there's a 1 bit within the
   883	// i least significant bits, otherwise it returns 0.
   884	func (x nat) sticky(i uint) uint {
   885		j := i / _W
   886		if j >= uint(len(x)) {
   887			if len(x) == 0 {
   888				return 0
   889			}
   890			return 1
   891		}
   892		// 0 <= j < len(x)
   893		for _, x := range x[:j] {
   894			if x != 0 {
   895				return 1
   896			}
   897		}
   898		if x[j]<<(_W-i%_W) != 0 {
   899			return 1
   900		}
   901		return 0
   902	}
   903	
   904	func (z nat) and(x, y nat) nat {
   905		m := len(x)
   906		n := len(y)
   907		if m > n {
   908			m = n
   909		}
   910		// m <= n
   911	
   912		z = z.make(m)
   913		for i := 0; i < m; i++ {
   914			z[i] = x[i] & y[i]
   915		}
   916	
   917		return z.norm()
   918	}
   919	
   920	func (z nat) andNot(x, y nat) nat {
   921		m := len(x)
   922		n := len(y)
   923		if n > m {
   924			n = m
   925		}
   926		// m >= n
   927	
   928		z = z.make(m)
   929		for i := 0; i < n; i++ {
   930			z[i] = x[i] &^ y[i]
   931		}
   932		copy(z[n:m], x[n:m])
   933	
   934		return z.norm()
   935	}
   936	
   937	func (z nat) or(x, y nat) nat {
   938		m := len(x)
   939		n := len(y)
   940		s := x
   941		if m < n {
   942			n, m = m, n
   943			s = y
   944		}
   945		// m >= n
   946	
   947		z = z.make(m)
   948		for i := 0; i < n; i++ {
   949			z[i] = x[i] | y[i]
   950		}
   951		copy(z[n:m], s[n:m])
   952	
   953		return z.norm()
   954	}
   955	
   956	func (z nat) xor(x, y nat) nat {
   957		m := len(x)
   958		n := len(y)
   959		s := x
   960		if m < n {
   961			n, m = m, n
   962			s = y
   963		}
   964		// m >= n
   965	
   966		z = z.make(m)
   967		for i := 0; i < n; i++ {
   968			z[i] = x[i] ^ y[i]
   969		}
   970		copy(z[n:m], s[n:m])
   971	
   972		return z.norm()
   973	}
   974	
   975	// greaterThan reports whether (x1<<_W + x2) > (y1<<_W + y2)
   976	func greaterThan(x1, x2, y1, y2 Word) bool {
   977		return x1 > y1 || x1 == y1 && x2 > y2
   978	}
   979	
   980	// modW returns x % d.
   981	func (x nat) modW(d Word) (r Word) {
   982		// TODO(agl): we don't actually need to store the q value.
   983		var q nat
   984		q = q.make(len(x))
   985		return divWVW(q, 0, x, d)
   986	}
   987	
   988	// random creates a random integer in [0..limit), using the space in z if
   989	// possible. n is the bit length of limit.
   990	func (z nat) random(rand *rand.Rand, limit nat, n int) nat {
   991		if alias(z, limit) {
   992			z = nil // z is an alias for limit - cannot reuse
   993		}
   994		z = z.make(len(limit))
   995	
   996		bitLengthOfMSW := uint(n % _W)
   997		if bitLengthOfMSW == 0 {
   998			bitLengthOfMSW = _W
   999		}
  1000		mask := Word((1 << bitLengthOfMSW) - 1)
  1001	
  1002		for {
  1003			switch _W {
  1004			case 32:
  1005				for i := range z {
  1006					z[i] = Word(rand.Uint32())
  1007				}
  1008			case 64:
  1009				for i := range z {
  1010					z[i] = Word(rand.Uint32()) | Word(rand.Uint32())<<32
  1011				}
  1012			default:
  1013				panic("unknown word size")
  1014			}
  1015			z[len(limit)-1] &= mask
  1016			if z.cmp(limit) < 0 {
  1017				break
  1018			}
  1019		}
  1020	
  1021		return z.norm()
  1022	}
  1023	
// If m != 0 (i.e., len(m) != 0), expNN sets z to x**y mod m;
// otherwise it sets z to x**y. The result is the value of z.
// If z shares its underlying array with x or y, a fresh slice is used
// for the result.
func (z nat) expNN(x, y, m nat) nat {
	if alias(z, x) || alias(z, y) {
		// We cannot allow in-place modification of x or y.
		z = nil
	}

	// x**y mod 1 == 0
	if len(m) == 1 && m[0] == 1 {
		return z.setWord(0)
	}
	// m == 0 || m > 1

	// x**0 == 1
	if len(y) == 0 {
		return z.setWord(1)
	}
	// y > 0

	// x**1 mod m == x mod m
	if len(y) == 1 && y[0] == 1 && len(m) != 0 {
		_, z = nat(nil).div(z, x, m)
		return z
	}
	// y > 1

	if len(m) != 0 {
		// We likely end up being as long as the modulus.
		z = z.make(len(m))
	}
	z = z.set(x)

	// If the base is non-trivial and the exponent is large, we use
	// 4-bit, windowed exponentiation. This involves precomputing 14 values
	// (x^2...x^15) but then reduces the number of multiply-reduces by a
	// third. Even for a 32-bit exponent, this reduces the number of
	// operations. Uses Montgomery method for odd moduli.
	if x.cmp(natOne) > 0 && len(y) > 1 && len(m) > 0 {
		if m[0]&1 == 1 {
			return z.expNNMontgomery(x, y, m)
		}
		return z.expNNWindowed(x, y, m)
	}

	// Left-align the top exponent word and drop its leading 1 bit,
	// which is already accounted for by z == x.
	v := y[len(y)-1] // v > 0 because y is normalized and y > 0
	shift := nlz(v) + 1
	v <<= shift
	var q nat

	const mask = 1 << (_W - 1)

	// We walk through the bits of the exponent one by one. Each time we
	// see a bit, we square, thus doubling the power. If the bit is a one,
	// we also multiply by x, thus adding one to the power.

	w := _W - int(shift) // number of bits left in the top exponent word
	// zz and r are used to avoid allocating in mul and div as
	// otherwise the arguments would alias.
	var zz, r nat
	for j := 0; j < w; j++ {
		zz = zz.sqr(z)
		zz, z = z, zz

		if v&mask != 0 {
			zz = zz.mul(z, x)
			zz, z = z, zz
		}

		if len(m) != 0 {
			// reduce mod m; rotate the buffers so that z holds the
			// remainder and the others remain available as scratch
			zz, r = zz.div(r, z, m)
			zz, r, q, z = q, z, zz, r
		}

		v <<= 1
	}

	// process the remaining exponent words, most significant first
	for i := len(y) - 2; i >= 0; i-- {
		v = y[i]

		for j := 0; j < _W; j++ {
			zz = zz.sqr(z)
			zz, z = z, zz

			if v&mask != 0 {
				zz = zz.mul(z, x)
				zz, z = z, zz
			}

			if len(m) != 0 {
				zz, r = zz.div(r, z, m)
				zz, r, q, z = q, z, zz, r
			}

			v <<= 1
		}
	}

	return z.norm()
}
  1124	
// expNNWindowed calculates x**y mod m using a fixed, 4-bit window.
// The result is stored in z (where possible) and returned normalized.
// m is used as a divisor throughout, so it must be nonzero.
func (z nat) expNNWindowed(x, y, m nat) nat {
	// zz and r are used to avoid allocating in mul and div as otherwise
	// the arguments would alias.
	var zz, r nat

	const n = 4 // window size in bits
	// powers[i] contains x^i.
	var powers [1 << n]nat
	powers[0] = natOne
	powers[1] = x
	for i := 2; i < 1<<n; i += 2 {
		// fill in two entries per iteration:
		// powers[i] = powers[i/2]^2 mod m and powers[i+1] = powers[i]*x mod m
		p2, p, p1 := &powers[i/2], &powers[i], &powers[i+1]
		*p = p.sqr(*p2)
		zz, r = zz.div(r, *p, m)
		*p, r = r, *p
		*p1 = p1.mul(*p, x)
		zz, r = zz.div(r, *p1, m)
		*p1, r = r, *p1
	}

	z = z.setWord(1)

	// consume the exponent n bits at a time, most significant window first
	for i := len(y) - 1; i >= 0; i-- {
		yi := y[i]
		for j := 0; j < _W; j += n {
			// skip the squarings for the very first window (z is still 1)
			if i != len(y)-1 || j != 0 {
				// Unrolled loop for significant performance
				// gain. Use go test -bench=".*" in crypto/rsa
				// to check performance before making changes.
				zz = zz.sqr(z)
				zz, z = z, zz
				zz, r = zz.div(r, z, m)
				z, r = r, z

				zz = zz.sqr(z)
				zz, z = z, zz
				zz, r = zz.div(r, z, m)
				z, r = r, z

				zz = zz.sqr(z)
				zz, z = z, zz
				zz, r = zz.div(r, z, m)
				z, r = r, z

				zz = zz.sqr(z)
				zz, z = z, zz
				zz, r = zz.div(r, z, m)
				z, r = r, z
			}

			// multiply by x^(top n bits of yi), then reduce mod m
			zz = zz.mul(z, powers[yi>>(_W-n)])
			zz, z = z, zz
			zz, r = zz.div(r, z, m)
			z, r = r, z

			yi <<= n
		}
	}

	return z.norm()
}
  1187	
  1188	// expNNMontgomery calculates x**y mod m using a fixed, 4-bit window.
  1189	// Uses Montgomery representation.
  1190	func (z nat) expNNMontgomery(x, y, m nat) nat {
  1191		numWords := len(m)
  1192	
  1193		// We want the lengths of x and m to be equal.
  1194		// It is OK if x >= m as long as len(x) == len(m).
  1195		if len(x) > numWords {
  1196			_, x = nat(nil).div(nil, x, m)
  1197			// Note: now len(x) <= numWords, not guaranteed ==.
  1198		}
  1199		if len(x) < numWords {
  1200			rr := make(nat, numWords)
  1201			copy(rr, x)
  1202			x = rr
  1203		}
  1204	
  1205		// Ideally the precomputations would be performed outside, and reused
  1206		// k0 = -m**-1 mod 2**_W. Algorithm from: Dumas, J.G. "On Newton–Raphson
  1207		// Iteration for Multiplicative Inverses Modulo Prime Powers".
  1208		k0 := 2 - m[0]
  1209		t := m[0] - 1
  1210		for i := 1; i < _W; i <<= 1 {
  1211			t *= t
  1212			k0 *= (t + 1)
  1213		}
  1214		k0 = -k0
  1215	
  1216		// RR = 2**(2*_W*len(m)) mod m
  1217		RR := nat(nil).setWord(1)
  1218		zz := nat(nil).shl(RR, uint(2*numWords*_W))
  1219		_, RR = nat(nil).div(RR, zz, m)
  1220		if len(RR) < numWords {
  1221			zz = zz.make(numWords)
  1222			copy(zz, RR)
  1223			RR = zz
  1224		}
  1225		// one = 1, with equal length to that of m
  1226		one := make(nat, numWords)
  1227		one[0] = 1
  1228	
  1229		const n = 4
  1230		// powers[i] contains x^i
  1231		var powers [1 << n]nat
  1232		powers[0] = powers[0].montgomery(one, RR, m, k0, numWords)
  1233		powers[1] = powers[1].montgomery(x, RR, m, k0, numWords)
  1234		for i := 2; i < 1<<n; i++ {
  1235			powers[i] = powers[i].montgomery(powers[i-1], powers[1], m, k0, numWords)
  1236		}
  1237	
  1238		// initialize z = 1 (Montgomery 1)
  1239		z = z.make(numWords)
  1240		copy(z, powers[0])
  1241	
  1242		zz = zz.make(numWords)
  1243	
  1244		// same windowed exponent, but with Montgomery multiplications
  1245		for i := len(y) - 1; i >= 0; i-- {
  1246			yi := y[i]
  1247			for j := 0; j < _W; j += n {
  1248				if i != len(y)-1 || j != 0 {
  1249					zz = zz.montgomery(z, z, m, k0, numWords)
  1250					z = z.montgomery(zz, zz, m, k0, numWords)
  1251					zz = zz.montgomery(z, z, m, k0, numWords)
  1252					z = z.montgomery(zz, zz, m, k0, numWords)
  1253				}
  1254				zz = zz.montgomery(z, powers[yi>>(_W-n)], m, k0, numWords)
  1255				z, zz = zz, z
  1256				yi <<= n
  1257			}
  1258		}
  1259		// convert to regular number
  1260		zz = zz.montgomery(z, one, m, k0, numWords)
  1261	
  1262		// One last reduction, just in case.
  1263		// See golang.org/issue/13907.
  1264		if zz.cmp(m) >= 0 {
  1265			// Common case is m has high bit set; in that case,
  1266			// since zz is the same length as m, there can be just
  1267			// one multiple of m to remove. Just subtract.
  1268			// We think that the subtract should be sufficient in general,
  1269			// so do that unconditionally, but double-check,
  1270			// in case our beliefs are wrong.
  1271			// The div is not expected to be reached.
  1272			zz = zz.sub(zz, m)
  1273			if zz.cmp(m) >= 0 {
  1274				_, zz = nat(nil).div(nil, zz, m)
  1275			}
  1276		}
  1277	
  1278		return zz.norm()
  1279	}
  1280	
  1281	// bytes writes the value of z into buf using big-endian encoding.
  1282	// len(buf) must be >= len(z)*_S. The value of z is encoded in the
  1283	// slice buf[i:]. The number i of unused bytes at the beginning of
  1284	// buf is returned as result.
  1285	func (z nat) bytes(buf []byte) (i int) {
  1286		i = len(buf)
  1287		for _, d := range z {
  1288			for j := 0; j < _S; j++ {
  1289				i--
  1290				buf[i] = byte(d)
  1291				d >>= 8
  1292			}
  1293		}
  1294	
  1295		for i < len(buf) && buf[i] == 0 {
  1296			i++
  1297		}
  1298	
  1299		return
  1300	}
  1301	
  1302	// bigEndianWord returns the contents of buf interpreted as a big-endian encoded Word value.
  1303	func bigEndianWord(buf []byte) Word {
  1304		if _W == 64 {
  1305			return Word(binary.BigEndian.Uint64(buf))
  1306		}
  1307		return Word(binary.BigEndian.Uint32(buf))
  1308	}
  1309	
  1310	// setBytes interprets buf as the bytes of a big-endian unsigned
  1311	// integer, sets z to that value, and returns z.
  1312	func (z nat) setBytes(buf []byte) nat {
  1313		z = z.make((len(buf) + _S - 1) / _S)
  1314	
  1315		i := len(buf)
  1316		for k := 0; i >= _S; k++ {
  1317			z[k] = bigEndianWord(buf[i-_S : i])
  1318			i -= _S
  1319		}
  1320		if i > 0 {
  1321			var d Word
  1322			for s := uint(0); i > 0; s += 8 {
  1323				d |= Word(buf[i-1]) << s
  1324				i--
  1325			}
  1326			z[len(z)-1] = d
  1327		}
  1328	
  1329		return z.norm()
  1330	}
  1331	
  1332	// sqrt sets z = ⌊√x⌋
  1333	func (z nat) sqrt(x nat) nat {
  1334		if x.cmp(natOne) <= 0 {
  1335			return z.set(x)
  1336		}
  1337		if alias(z, x) {
  1338			z = nil
  1339		}
  1340	
  1341		// Start with value known to be too large and repeat "z = ⌊(z + ⌊x/z⌋)/2⌋" until it stops getting smaller.
  1342		// See Brent and Zimmermann, Modern Computer Arithmetic, Algorithm 1.13 (SqrtInt).
  1343		// https://members.loria.fr/PZimmermann/mca/pub226.html
  1344		// If x is one less than a perfect square, the sequence oscillates between the correct z and z+1;
  1345		// otherwise it converges to the correct z and stays there.
  1346		var z1, z2 nat
  1347		z1 = z
  1348		z1 = z1.setUint64(1)
  1349		z1 = z1.shl(z1, uint(x.bitLen()+1)/2) // must be ≥ √x
  1350		for n := 0; ; n++ {
  1351			z2, _ = z2.div(nil, x, z1)
  1352			z2 = z2.add(z2, z1)
  1353			z2 = z2.shr(z2, 1)
  1354			if z2.cmp(z1) >= 0 {
  1355				// z1 is answer.
  1356				// Figure out whether z1 or z2 is currently aliased to z by looking at loop count.
  1357				if n&1 == 0 {
  1358					return z1
  1359				}
  1360				return z.set(z1)
  1361			}
  1362			z1, z2 = z2, z1
  1363		}
  1364	}
  1365
View as plain text