Source file src/crypto/elliptic/p256.go

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !amd64,!arm64
     6	
     7	package elliptic
     8	
     9	// This file contains a constant-time, 32-bit implementation of P256.
    10	
    11	import (
    12		"math/big"
    13	)
    14	
    15	type p256Curve struct {
    16		*CurveParams
    17	}
    18	
    19	var (
    20		p256Params *CurveParams
    21	
    22		// RInverse contains 1/R mod p - the inverse of the Montgomery constant
    23		// (2**257).
    24		p256RInverse *big.Int
    25	)
    26	
    27	func initP256() {
    28		// See FIPS 186-3, section D.2.3
    29		p256Params = &CurveParams{Name: "P-256"}
    30		p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
    31		p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
    32		p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
    33		p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
    34		p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
    35		p256Params.BitSize = 256
    36	
    37		p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16)
    38	
    39		// Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation
    40		initP256Arch()
    41	}
    42	
    43	func (curve p256Curve) Params() *CurveParams {
    44		return curve.CurveParams
    45	}
    46	
    47	// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
    48	// to out. If the scalar is equal or greater than the order of the group, it's
    49	// reduced modulo that order.
    50	func p256GetScalar(out *[32]byte, in []byte) {
    51		n := new(big.Int).SetBytes(in)
    52		var scalarBytes []byte
    53	
    54		if n.Cmp(p256Params.N) >= 0 {
    55			n.Mod(n, p256Params.N)
    56			scalarBytes = n.Bytes()
    57		} else {
    58			scalarBytes = in
    59		}
    60	
    61		for i, v := range scalarBytes {
    62			out[len(scalarBytes)-(1+i)] = v
    63		}
    64	}
    65	
    66	func (p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
    67		var scalarReversed [32]byte
    68		p256GetScalar(&scalarReversed, scalar)
    69	
    70		var x1, y1, z1 [p256Limbs]uint32
    71		p256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
    72		return p256ToAffine(&x1, &y1, &z1)
    73	}
    74	
    75	func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
    76		var scalarReversed [32]byte
    77		p256GetScalar(&scalarReversed, scalar)
    78	
    79		var px, py, x1, y1, z1 [p256Limbs]uint32
    80		p256FromBig(&px, bigX)
    81		p256FromBig(&py, bigY)
    82		p256ScalarMult(&x1, &y1, &z1, &px, &py, &scalarReversed)
    83		return p256ToAffine(&x1, &y1, &z1)
    84	}
    85	
    86	// Field elements are represented as nine, unsigned 32-bit words.
    87	//
    88	// The value of an field element is:
    89	//   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
    90	//
    91	// That is, each limb is alternately 29 or 28-bits wide in little-endian
    92	// order.
    93	//
    94	// This means that a field element hits 2**257, rather than 2**256 as we would
    95	// like. A 28, 29, ... pattern would cause us to hit 2**256, but that causes
    96	// problems when multiplying as terms end up one bit short of a limb which
    97	// would require much bit-shifting to correct.
    98	//
    99	// Finally, the values stored in a field element are in Montgomery form. So the
   100	// value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
   101	// 2**257.
   102	
   103	const (
   104		p256Limbs    = 9
   105		bottom29Bits = 0x1fffffff
   106	)
   107	
   108	var (
   109		// p256One is the number 1 as a field element.
   110		p256One  = [p256Limbs]uint32{2, 0, 0, 0xffff800, 0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff, 0}
   111		p256Zero = [p256Limbs]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0}
   112		// p256P is the prime modulus as a field element.
   113		p256P = [p256Limbs]uint32{0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff, 0, 0, 0x200000, 0xf000000, 0xfffffff}
   114		// p2562P is the twice prime modulus as a field element.
   115		p2562P = [p256Limbs]uint32{0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff, 0, 0, 0x400000, 0xe000000, 0x1fffffff}
   116	)
   117	
   118	// p256Precomputed contains precomputed values to aid the calculation of scalar
   119	// multiples of the base point, G. It's actually two, equal length, tables
   120	// concatenated.
   121	//
   122	// The first table contains (x,y) field element pairs for 16 multiples of the
   123	// base point, G.
   124	//
   125	//   Index  |  Index (binary) | Value
   126	//       0  |           0000  | 0G (all zeros, omitted)
   127	//       1  |           0001  | G
   128	//       2  |           0010  | 2**64G
   129	//       3  |           0011  | 2**64G + G
   130	//       4  |           0100  | 2**128G
   131	//       5  |           0101  | 2**128G + G
   132	//       6  |           0110  | 2**128G + 2**64G
   133	//       7  |           0111  | 2**128G + 2**64G + G
   134	//       8  |           1000  | 2**192G
   135	//       9  |           1001  | 2**192G + G
   136	//      10  |           1010  | 2**192G + 2**64G
   137	//      11  |           1011  | 2**192G + 2**64G + G
   138	//      12  |           1100  | 2**192G + 2**128G
   139	//      13  |           1101  | 2**192G + 2**128G + G
   140	//      14  |           1110  | 2**192G + 2**128G + 2**64G
   141	//      15  |           1111  | 2**192G + 2**128G + 2**64G + G
   142	//
   143	// The second table follows the same style, but the terms are 2**32G,
   144	// 2**96G, 2**160G, 2**224G.
   145	//
   146	// This is ~2KB of data.
   147	var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
   148		0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
   149		0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
   150		0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
   151		0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
   152		0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
   153		0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
   154		0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
   155		0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
   156		0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
   157		0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
   158		0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
   159		0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
   160		0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
   161		0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
   162		0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
   163		0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
   164		0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
   165		0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
   166		0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
   167		0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
   168		0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
   169		0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
   170		0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
   171		0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
   172		0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
   173		0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
   174		0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
   175		0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
   176		0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
   177		0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
   178		0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
   179		0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
   180		0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
   181		0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
   182		0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
   183		0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
   184		0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
   185		0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
   186		0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
   187		0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
   188		0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
   189		0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
   190		0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
   191		0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
   192		0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
   193		0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
   194		0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
   195		0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
   196		0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
   197		0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
   198		0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
   199		0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
   200		0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
   201		0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
   202		0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
   203		0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
   204		0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
   205		0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
   206		0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
   207		0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
   208	}
   209	
   210	// Field element operations:
   211	
   212	// nonZeroToAllOnes returns:
   213	//   0xffffffff for 0 < x <= 2**31
   214	//   0 for x == 0 or x > 2**31.
   215	func nonZeroToAllOnes(x uint32) uint32 {
   216		return ((x - 1) >> 31) - 1
   217	}
   218	
   219	// p256ReduceCarry adds a multiple of p in order to cancel |carry|,
   220	// which is a term at 2**257.
   221	//
   222	// On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
   223	// On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
   224	func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
   225		carry_mask := nonZeroToAllOnes(carry)
   226	
   227		inout[0] += carry << 1
   228		inout[3] += 0x10000000 & carry_mask
   229		// carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
   230		// previous line therefore this doesn't underflow.
   231		inout[3] -= carry << 11
   232		inout[4] += (0x20000000 - 1) & carry_mask
   233		inout[5] += (0x10000000 - 1) & carry_mask
   234		inout[6] += (0x20000000 - 1) & carry_mask
   235		inout[6] -= carry << 22
   236		// This may underflow if carry is non-zero but, if so, we'll fix it in the
   237		// next line.
   238		inout[7] -= 1 & carry_mask
   239		inout[7] += carry << 25
   240	}
   241	
   242	// p256Sum sets out = in+in2.
   243	//
   244	// On entry, in[i]+in2[i] must not overflow a 32-bit word.
   245	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   246	func p256Sum(out, in, in2 *[p256Limbs]uint32) {
   247		carry := uint32(0)
   248		for i := 0; ; i++ {
   249			out[i] = in[i] + in2[i]
   250			out[i] += carry
   251			carry = out[i] >> 29
   252			out[i] &= bottom29Bits
   253	
   254			i++
   255			if i == p256Limbs {
   256				break
   257			}
   258	
   259			out[i] = in[i] + in2[i]
   260			out[i] += carry
   261			carry = out[i] >> 28
   262			out[i] &= bottom28Bits
   263		}
   264	
   265		p256ReduceCarry(out, carry)
   266	}
   267	
   268	const (
   269		two30m2    = 1<<30 - 1<<2
   270		two30p13m2 = 1<<30 + 1<<13 - 1<<2
   271		two31m2    = 1<<31 - 1<<2
   272		two31p24m2 = 1<<31 + 1<<24 - 1<<2
   273		two30m27m2 = 1<<30 - 1<<27 - 1<<2
   274	)
   275	
   276	// p256Zero31 is 0 mod p.
   277	var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m2, two30m2, two31p24m2, two30m27m2, two31m2}
   278	
   279	// p256Diff sets out = in-in2.
   280	//
   281	// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   282	//           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   283	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   284	func p256Diff(out, in, in2 *[p256Limbs]uint32) {
   285		var carry uint32
   286	
   287		for i := 0; ; i++ {
   288			out[i] = in[i] - in2[i]
   289			out[i] += p256Zero31[i]
   290			out[i] += carry
   291			carry = out[i] >> 29
   292			out[i] &= bottom29Bits
   293	
   294			i++
   295			if i == p256Limbs {
   296				break
   297			}
   298	
   299			out[i] = in[i] - in2[i]
   300			out[i] += p256Zero31[i]
   301			out[i] += carry
   302			carry = out[i] >> 28
   303			out[i] &= bottom28Bits
   304		}
   305	
   306		p256ReduceCarry(out, carry)
   307	}
   308	
   309	// p256ReduceDegree sets out = tmp/R mod p where tmp contains 64-bit words with
   310	// the same 29,28,... bit positions as an field element.
   311	//
   312	// The values in field elements are in Montgomery form: x*R mod p where R =
   313	// 2**257. Since we just multiplied two Montgomery values together, the result
   314	// is x*y*R*R mod p. We wish to divide by R in order for the result also to be
   315	// in Montgomery form.
   316	//
   317	// On entry: tmp[i] < 2**64
   318	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
   319	func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
   320		// The following table may be helpful when reading this code:
   321		//
   322		// Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
   323		// Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
   324		// Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
   325		//   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
   326		var tmp2 [18]uint32
   327		var carry, x, xMask uint32
   328	
   329		// tmp contains 64-bit words with the same 29,28,29-bit positions as an
   330		// field element. So the top of an element of tmp might overlap with
   331		// another element two positions down. The following loop eliminates
   332		// this overlap.
   333		tmp2[0] = uint32(tmp[0]) & bottom29Bits
   334	
   335		tmp2[1] = uint32(tmp[0]) >> 29
   336		tmp2[1] |= (uint32(tmp[0]>>32) << 3) & bottom28Bits
   337		tmp2[1] += uint32(tmp[1]) & bottom28Bits
   338		carry = tmp2[1] >> 28
   339		tmp2[1] &= bottom28Bits
   340	
   341		for i := 2; i < 17; i++ {
   342			tmp2[i] = (uint32(tmp[i-2] >> 32)) >> 25
   343			tmp2[i] += (uint32(tmp[i-1])) >> 28
   344			tmp2[i] += (uint32(tmp[i-1]>>32) << 4) & bottom29Bits
   345			tmp2[i] += uint32(tmp[i]) & bottom29Bits
   346			tmp2[i] += carry
   347			carry = tmp2[i] >> 29
   348			tmp2[i] &= bottom29Bits
   349	
   350			i++
   351			if i == 17 {
   352				break
   353			}
   354			tmp2[i] = uint32(tmp[i-2]>>32) >> 25
   355			tmp2[i] += uint32(tmp[i-1]) >> 29
   356			tmp2[i] += ((uint32(tmp[i-1] >> 32)) << 3) & bottom28Bits
   357			tmp2[i] += uint32(tmp[i]) & bottom28Bits
   358			tmp2[i] += carry
   359			carry = tmp2[i] >> 28
   360			tmp2[i] &= bottom28Bits
   361		}
   362	
   363		tmp2[17] = uint32(tmp[15]>>32) >> 25
   364		tmp2[17] += uint32(tmp[16]) >> 29
   365		tmp2[17] += uint32(tmp[16]>>32) << 3
   366		tmp2[17] += carry
   367	
   368		// Montgomery elimination of terms:
   369		//
   370		// Since R is 2**257, we can divide by R with a bitwise shift if we can
   371		// ensure that the right-most 257 bits are all zero. We can make that true
   372		// by adding multiplies of p without affecting the value.
   373		//
   374		// So we eliminate limbs from right to left. Since the bottom 29 bits of p
   375		// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
   376		// We can do that for 8 further limbs and then right shift to eliminate the
   377		// extra factor of R.
   378		for i := 0; ; i += 2 {
   379			tmp2[i+1] += tmp2[i] >> 29
   380			x = tmp2[i] & bottom29Bits
   381			xMask = nonZeroToAllOnes(x)
   382			tmp2[i] = 0
   383	
   384			// The bounds calculations for this loop are tricky. Each iteration of
   385			// the loop eliminates two words by adding values to words to their
   386			// right.
   387			//
   388			// The following table contains the amounts added to each word (as an
   389			// offset from the value of i at the top of the loop). The amounts are
   390			// accounted for from the first and second half of the loop separately
   391			// and are written as, for example, 28 to mean a value <2**28.
   392			//
   393			// Word:                   3   4   5   6   7   8   9   10
   394			// Added in top half:     28  11      29  21  29  28
   395			//                                        28  29
   396			//                                            29
   397			// Added in bottom half:      29  10      28  21  28   28
   398			//                                            29
   399			//
   400			// The value that is currently offset 7 will be offset 5 for the next
   401			// iteration and then offset 3 for the iteration after that. Therefore
   402			// the total value added will be the values added at 7, 5 and 3.
   403			//
   404			// The following table accumulates these values. The sums at the bottom
   405			// are written as, for example, 29+28, to mean a value < 2**29+2**28.
   406			//
   407			// Word:                   3   4   5   6   7   8   9  10  11  12  13
   408			//                        28  11  10  29  21  29  28  28  28  28  28
   409			//                            29  28  11  28  29  28  29  28  29  28
   410			//                                    29  28  21  21  29  21  29  21
   411			//                                        10  29  28  21  28  21  28
   412			//                                        28  29  28  29  28  29  28
   413			//                                            11  10  29  10  29  10
   414			//                                            29  28  11  28  11
   415			//                                                    29      29
   416			//                        --------------------------------------------
   417			//                                                30+ 31+ 30+ 31+ 30+
   418			//                                                28+ 29+ 28+ 29+ 21+
   419			//                                                21+ 28+ 21+ 28+ 10
   420			//                                                10  21+ 10  21+
   421			//                                                    11      11
   422			//
   423			// So the greatest amount is added to tmp2[10] and tmp2[12]. If
   424			// tmp2[10/12] has an initial value of <2**29, then the maximum value
   425			// will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
   426			// as required.
   427			tmp2[i+3] += (x << 10) & bottom28Bits
   428			tmp2[i+4] += (x >> 18)
   429	
   430			tmp2[i+6] += (x << 21) & bottom29Bits
   431			tmp2[i+7] += x >> 8
   432	
   433			// At position 200, which is the starting bit position for word 7, we
   434			// have a factor of 0xf000000 = 2**28 - 2**24.
   435			tmp2[i+7] += 0x10000000 & xMask
   436			tmp2[i+8] += (x - 1) & xMask
   437			tmp2[i+7] -= (x << 24) & bottom28Bits
   438			tmp2[i+8] -= x >> 4
   439	
   440			tmp2[i+8] += 0x20000000 & xMask
   441			tmp2[i+8] -= x
   442			tmp2[i+8] += (x << 28) & bottom29Bits
   443			tmp2[i+9] += ((x >> 1) - 1) & xMask
   444	
   445			if i+1 == p256Limbs {
   446				break
   447			}
   448			tmp2[i+2] += tmp2[i+1] >> 28
   449			x = tmp2[i+1] & bottom28Bits
   450			xMask = nonZeroToAllOnes(x)
   451			tmp2[i+1] = 0
   452	
   453			tmp2[i+4] += (x << 11) & bottom29Bits
   454			tmp2[i+5] += (x >> 18)
   455	
   456			tmp2[i+7] += (x << 21) & bottom28Bits
   457			tmp2[i+8] += x >> 7
   458	
   459			// At position 199, which is the starting bit of the 8th word when
   460			// dealing with a context starting on an odd word, we have a factor of
   461			// 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
   462			// word from i+1 is i+8.
   463			tmp2[i+8] += 0x20000000 & xMask
   464			tmp2[i+9] += (x - 1) & xMask
   465			tmp2[i+8] -= (x << 25) & bottom29Bits
   466			tmp2[i+9] -= x >> 4
   467	
   468			tmp2[i+9] += 0x10000000 & xMask
   469			tmp2[i+9] -= x
   470			tmp2[i+10] += (x - 1) & xMask
   471		}
   472	
   473		// We merge the right shift with a carry chain. The words above 2**257 have
   474		// widths of 28,29,... which we need to correct when copying them down.
   475		carry = 0
   476		for i := 0; i < 8; i++ {
   477			// The maximum value of tmp2[i + 9] occurs on the first iteration and
   478			// is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
   479			// therefore safe.
   480			out[i] = tmp2[i+9]
   481			out[i] += carry
   482			out[i] += (tmp2[i+10] << 28) & bottom29Bits
   483			carry = out[i] >> 29
   484			out[i] &= bottom29Bits
   485	
   486			i++
   487			out[i] = tmp2[i+9] >> 1
   488			out[i] += carry
   489			carry = out[i] >> 28
   490			out[i] &= bottom28Bits
   491		}
   492	
   493		out[8] = tmp2[17]
   494		out[8] += carry
   495		carry = out[8] >> 29
   496		out[8] &= bottom29Bits
   497	
   498		p256ReduceCarry(out, carry)
   499	}
   500	
   501	// p256Square sets out=in*in.
   502	//
   503	// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
   504	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   505	func p256Square(out, in *[p256Limbs]uint32) {
   506		var tmp [17]uint64
   507	
   508		tmp[0] = uint64(in[0]) * uint64(in[0])
   509		tmp[1] = uint64(in[0]) * (uint64(in[1]) << 1)
   510		tmp[2] = uint64(in[0])*(uint64(in[2])<<1) +
   511			uint64(in[1])*(uint64(in[1])<<1)
   512		tmp[3] = uint64(in[0])*(uint64(in[3])<<1) +
   513			uint64(in[1])*(uint64(in[2])<<1)
   514		tmp[4] = uint64(in[0])*(uint64(in[4])<<1) +
   515			uint64(in[1])*(uint64(in[3])<<2) +
   516			uint64(in[2])*uint64(in[2])
   517		tmp[5] = uint64(in[0])*(uint64(in[5])<<1) +
   518			uint64(in[1])*(uint64(in[4])<<1) +
   519			uint64(in[2])*(uint64(in[3])<<1)
   520		tmp[6] = uint64(in[0])*(uint64(in[6])<<1) +
   521			uint64(in[1])*(uint64(in[5])<<2) +
   522			uint64(in[2])*(uint64(in[4])<<1) +
   523			uint64(in[3])*(uint64(in[3])<<1)
   524		tmp[7] = uint64(in[0])*(uint64(in[7])<<1) +
   525			uint64(in[1])*(uint64(in[6])<<1) +
   526			uint64(in[2])*(uint64(in[5])<<1) +
   527			uint64(in[3])*(uint64(in[4])<<1)
   528		// tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
   529		// which is < 2**64 as required.
   530		tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
   531			uint64(in[1])*(uint64(in[7])<<2) +
   532			uint64(in[2])*(uint64(in[6])<<1) +
   533			uint64(in[3])*(uint64(in[5])<<2) +
   534			uint64(in[4])*uint64(in[4])
   535		tmp[9] = uint64(in[1])*(uint64(in[8])<<1) +
   536			uint64(in[2])*(uint64(in[7])<<1) +
   537			uint64(in[3])*(uint64(in[6])<<1) +
   538			uint64(in[4])*(uint64(in[5])<<1)
   539		tmp[10] = uint64(in[2])*(uint64(in[8])<<1) +
   540			uint64(in[3])*(uint64(in[7])<<2) +
   541			uint64(in[4])*(uint64(in[6])<<1) +
   542			uint64(in[5])*(uint64(in[5])<<1)
   543		tmp[11] = uint64(in[3])*(uint64(in[8])<<1) +
   544			uint64(in[4])*(uint64(in[7])<<1) +
   545			uint64(in[5])*(uint64(in[6])<<1)
   546		tmp[12] = uint64(in[4])*(uint64(in[8])<<1) +
   547			uint64(in[5])*(uint64(in[7])<<2) +
   548			uint64(in[6])*uint64(in[6])
   549		tmp[13] = uint64(in[5])*(uint64(in[8])<<1) +
   550			uint64(in[6])*(uint64(in[7])<<1)
   551		tmp[14] = uint64(in[6])*(uint64(in[8])<<1) +
   552			uint64(in[7])*(uint64(in[7])<<1)
   553		tmp[15] = uint64(in[7]) * (uint64(in[8]) << 1)
   554		tmp[16] = uint64(in[8]) * uint64(in[8])
   555	
   556		p256ReduceDegree(out, tmp)
   557	}
   558	
   559	// p256Mul sets out=in*in2.
   560	//
   561	// On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
   562	//           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
   563	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   564	func p256Mul(out, in, in2 *[p256Limbs]uint32) {
   565		var tmp [17]uint64
   566	
   567		tmp[0] = uint64(in[0]) * uint64(in2[0])
   568		tmp[1] = uint64(in[0])*(uint64(in2[1])<<0) +
   569			uint64(in[1])*(uint64(in2[0])<<0)
   570		tmp[2] = uint64(in[0])*(uint64(in2[2])<<0) +
   571			uint64(in[1])*(uint64(in2[1])<<1) +
   572			uint64(in[2])*(uint64(in2[0])<<0)
   573		tmp[3] = uint64(in[0])*(uint64(in2[3])<<0) +
   574			uint64(in[1])*(uint64(in2[2])<<0) +
   575			uint64(in[2])*(uint64(in2[1])<<0) +
   576			uint64(in[3])*(uint64(in2[0])<<0)
   577		tmp[4] = uint64(in[0])*(uint64(in2[4])<<0) +
   578			uint64(in[1])*(uint64(in2[3])<<1) +
   579			uint64(in[2])*(uint64(in2[2])<<0) +
   580			uint64(in[3])*(uint64(in2[1])<<1) +
   581			uint64(in[4])*(uint64(in2[0])<<0)
   582		tmp[5] = uint64(in[0])*(uint64(in2[5])<<0) +
   583			uint64(in[1])*(uint64(in2[4])<<0) +
   584			uint64(in[2])*(uint64(in2[3])<<0) +
   585			uint64(in[3])*(uint64(in2[2])<<0) +
   586			uint64(in[4])*(uint64(in2[1])<<0) +
   587			uint64(in[5])*(uint64(in2[0])<<0)
   588		tmp[6] = uint64(in[0])*(uint64(in2[6])<<0) +
   589			uint64(in[1])*(uint64(in2[5])<<1) +
   590			uint64(in[2])*(uint64(in2[4])<<0) +
   591			uint64(in[3])*(uint64(in2[3])<<1) +
   592			uint64(in[4])*(uint64(in2[2])<<0) +
   593			uint64(in[5])*(uint64(in2[1])<<1) +
   594			uint64(in[6])*(uint64(in2[0])<<0)
   595		tmp[7] = uint64(in[0])*(uint64(in2[7])<<0) +
   596			uint64(in[1])*(uint64(in2[6])<<0) +
   597			uint64(in[2])*(uint64(in2[5])<<0) +
   598			uint64(in[3])*(uint64(in2[4])<<0) +
   599			uint64(in[4])*(uint64(in2[3])<<0) +
   600			uint64(in[5])*(uint64(in2[2])<<0) +
   601			uint64(in[6])*(uint64(in2[1])<<0) +
   602			uint64(in[7])*(uint64(in2[0])<<0)
   603		// tmp[8] has the greatest value but doesn't overflow. See logic in
   604		// p256Square.
   605		tmp[8] = uint64(in[0])*(uint64(in2[8])<<0) +
   606			uint64(in[1])*(uint64(in2[7])<<1) +
   607			uint64(in[2])*(uint64(in2[6])<<0) +
   608			uint64(in[3])*(uint64(in2[5])<<1) +
   609			uint64(in[4])*(uint64(in2[4])<<0) +
   610			uint64(in[5])*(uint64(in2[3])<<1) +
   611			uint64(in[6])*(uint64(in2[2])<<0) +
   612			uint64(in[7])*(uint64(in2[1])<<1) +
   613			uint64(in[8])*(uint64(in2[0])<<0)
   614		tmp[9] = uint64(in[1])*(uint64(in2[8])<<0) +
   615			uint64(in[2])*(uint64(in2[7])<<0) +
   616			uint64(in[3])*(uint64(in2[6])<<0) +
   617			uint64(in[4])*(uint64(in2[5])<<0) +
   618			uint64(in[5])*(uint64(in2[4])<<0) +
   619			uint64(in[6])*(uint64(in2[3])<<0) +
   620			uint64(in[7])*(uint64(in2[2])<<0) +
   621			uint64(in[8])*(uint64(in2[1])<<0)
   622		tmp[10] = uint64(in[2])*(uint64(in2[8])<<0) +
   623			uint64(in[3])*(uint64(in2[7])<<1) +
   624			uint64(in[4])*(uint64(in2[6])<<0) +
   625			uint64(in[5])*(uint64(in2[5])<<1) +
   626			uint64(in[6])*(uint64(in2[4])<<0) +
   627			uint64(in[7])*(uint64(in2[3])<<1) +
   628			uint64(in[8])*(uint64(in2[2])<<0)
   629		tmp[11] = uint64(in[3])*(uint64(in2[8])<<0) +
   630			uint64(in[4])*(uint64(in2[7])<<0) +
   631			uint64(in[5])*(uint64(in2[6])<<0) +
   632			uint64(in[6])*(uint64(in2[5])<<0) +
   633			uint64(in[7])*(uint64(in2[4])<<0) +
   634			uint64(in[8])*(uint64(in2[3])<<0)
   635		tmp[12] = uint64(in[4])*(uint64(in2[8])<<0) +
   636			uint64(in[5])*(uint64(in2[7])<<1) +
   637			uint64(in[6])*(uint64(in2[6])<<0) +
   638			uint64(in[7])*(uint64(in2[5])<<1) +
   639			uint64(in[8])*(uint64(in2[4])<<0)
   640		tmp[13] = uint64(in[5])*(uint64(in2[8])<<0) +
   641			uint64(in[6])*(uint64(in2[7])<<0) +
   642			uint64(in[7])*(uint64(in2[6])<<0) +
   643			uint64(in[8])*(uint64(in2[5])<<0)
   644		tmp[14] = uint64(in[6])*(uint64(in2[8])<<0) +
   645			uint64(in[7])*(uint64(in2[7])<<1) +
   646			uint64(in[8])*(uint64(in2[6])<<0)
   647		tmp[15] = uint64(in[7])*(uint64(in2[8])<<0) +
   648			uint64(in[8])*(uint64(in2[7])<<0)
   649		tmp[16] = uint64(in[8]) * (uint64(in2[8]) << 0)
   650	
   651		p256ReduceDegree(out, tmp)
   652	}
   653	
   654	func p256Assign(out, in *[p256Limbs]uint32) {
   655		*out = *in
   656	}
   657	
   658	// p256Invert calculates |out| = |in|^{-1}
   659	//
   660	// Based on Fermat's Little Theorem:
   661	//   a^p = a (mod p)
   662	//   a^{p-1} = 1 (mod p)
   663	//   a^{p-2} = a^{-1} (mod p)
   664	func p256Invert(out, in *[p256Limbs]uint32) {
   665		var ftmp, ftmp2 [p256Limbs]uint32
   666	
   667		// each e_I will hold |in|^{2^I - 1}
   668		var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
   669	
   670		p256Square(&ftmp, in)     // 2^1
   671		p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
   672		p256Assign(&e2, &ftmp)
   673		p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
   674		p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
   675		p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
   676		p256Assign(&e4, &ftmp)
   677		p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
   678		p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
   679		p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
   680		p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
   681		p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
   682		p256Assign(&e8, &ftmp)
   683		for i := 0; i < 8; i++ {
   684			p256Square(&ftmp, &ftmp)
   685		} // 2^16 - 2^8
   686		p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
   687		p256Assign(&e16, &ftmp)
   688		for i := 0; i < 16; i++ {
   689			p256Square(&ftmp, &ftmp)
   690		} // 2^32 - 2^16
   691		p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
   692		p256Assign(&e32, &ftmp)
   693		for i := 0; i < 32; i++ {
   694			p256Square(&ftmp, &ftmp)
   695		} // 2^64 - 2^32
   696		p256Assign(&e64, &ftmp)
   697		p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
   698		for i := 0; i < 192; i++ {
   699			p256Square(&ftmp, &ftmp)
   700		} // 2^256 - 2^224 + 2^192
   701	
   702		p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
   703		for i := 0; i < 16; i++ {
   704			p256Square(&ftmp2, &ftmp2)
   705		} // 2^80 - 2^16
   706		p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
   707		for i := 0; i < 8; i++ {
   708			p256Square(&ftmp2, &ftmp2)
   709		} // 2^88 - 2^8
   710		p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
   711		for i := 0; i < 4; i++ {
   712			p256Square(&ftmp2, &ftmp2)
   713		} // 2^92 - 2^4
   714		p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
   715		p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
   716		p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
   717		p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
   718		p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
   719		p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
   720		p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
   721	
   722		p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
   723	}
   724	
   725	// p256Scalar3 sets out=3*out.
   726	//
   727	// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   728	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   729	func p256Scalar3(out *[p256Limbs]uint32) {
   730		var carry uint32
   731	
   732		for i := 0; ; i++ {
   733			out[i] *= 3
   734			out[i] += carry
   735			carry = out[i] >> 29
   736			out[i] &= bottom29Bits
   737	
   738			i++
   739			if i == p256Limbs {
   740				break
   741			}
   742	
   743			out[i] *= 3
   744			out[i] += carry
   745			carry = out[i] >> 28
   746			out[i] &= bottom28Bits
   747		}
   748	
   749		p256ReduceCarry(out, carry)
   750	}
   751	
   752	// p256Scalar4 sets out=4*out.
   753	//
   754	// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   755	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   756	func p256Scalar4(out *[p256Limbs]uint32) {
   757		var carry, nextCarry uint32
   758	
   759		for i := 0; ; i++ {
   760			nextCarry = out[i] >> 27
   761			out[i] <<= 2
   762			out[i] &= bottom29Bits
   763			out[i] += carry
   764			carry = nextCarry + (out[i] >> 29)
   765			out[i] &= bottom29Bits
   766	
   767			i++
   768			if i == p256Limbs {
   769				break
   770			}
   771			nextCarry = out[i] >> 26
   772			out[i] <<= 2
   773			out[i] &= bottom28Bits
   774			out[i] += carry
   775			carry = nextCarry + (out[i] >> 28)
   776			out[i] &= bottom28Bits
   777		}
   778	
   779		p256ReduceCarry(out, carry)
   780	}
   781	
   782	// p256Scalar8 sets out=8*out.
   783	//
   784	// On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   785	// On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
   786	func p256Scalar8(out *[p256Limbs]uint32) {
   787		var carry, nextCarry uint32
   788	
   789		for i := 0; ; i++ {
   790			nextCarry = out[i] >> 26
   791			out[i] <<= 3
   792			out[i] &= bottom29Bits
   793			out[i] += carry
   794			carry = nextCarry + (out[i] >> 29)
   795			out[i] &= bottom29Bits
   796	
   797			i++
   798			if i == p256Limbs {
   799				break
   800			}
   801			nextCarry = out[i] >> 25
   802			out[i] <<= 3
   803			out[i] &= bottom28Bits
   804			out[i] += carry
   805			carry = nextCarry + (out[i] >> 28)
   806			out[i] &= bottom28Bits
   807		}
   808	
   809		p256ReduceCarry(out, carry)
   810	}
   811	
   812	// Group operations:
   813	//
   814	// Elements of the elliptic curve group are represented in Jacobian
   815	// coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
   816	// Jacobian form.
   817	
   818	// p256PointDouble sets {xOut,yOut,zOut} = 2*{x,y,z}.
   819	//
   820	// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
   821	func p256PointDouble(xOut, yOut, zOut, x, y, z *[p256Limbs]uint32) {
   822		var delta, gamma, alpha, beta, tmp, tmp2 [p256Limbs]uint32
   823	
   824		p256Square(&delta, z)
   825		p256Square(&gamma, y)
   826		p256Mul(&beta, x, &gamma)
   827	
   828		p256Sum(&tmp, x, &delta)
   829		p256Diff(&tmp2, x, &delta)
   830		p256Mul(&alpha, &tmp, &tmp2)
   831		p256Scalar3(&alpha)
   832	
   833		p256Sum(&tmp, y, z)
   834		p256Square(&tmp, &tmp)
   835		p256Diff(&tmp, &tmp, &gamma)
   836		p256Diff(zOut, &tmp, &delta)
   837	
   838		p256Scalar4(&beta)
   839		p256Square(xOut, &alpha)
   840		p256Diff(xOut, xOut, &beta)
   841		p256Diff(xOut, xOut, &beta)
   842	
   843		p256Diff(&tmp, &beta, xOut)
   844		p256Mul(&tmp, &alpha, &tmp)
   845		p256Square(&tmp2, &gamma)
   846		p256Scalar8(&tmp2)
   847		p256Diff(yOut, &tmp, &tmp2)
   848	}
   849	
   850	// p256PointAddMixed sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,1}.
   851	// (i.e. the second point is affine.)
   852	//
   853	// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   854	//
   855	// Note that this function does not handle P+P, infinity+P nor P+infinity
   856	// correctly.
   857	func p256PointAddMixed(xOut, yOut, zOut, x1, y1, z1, x2, y2 *[p256Limbs]uint32) {
   858		var z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   859	
   860		p256Square(&z1z1, z1)
   861		p256Sum(&tmp, z1, z1)
   862	
   863		p256Mul(&u2, x2, &z1z1)
   864		p256Mul(&z1z1z1, z1, &z1z1)
   865		p256Mul(&s2, y2, &z1z1z1)
   866		p256Diff(&h, &u2, x1)
   867		p256Sum(&i, &h, &h)
   868		p256Square(&i, &i)
   869		p256Mul(&j, &h, &i)
   870		p256Diff(&r, &s2, y1)
   871		p256Sum(&r, &r, &r)
   872		p256Mul(&v, x1, &i)
   873	
   874		p256Mul(zOut, &tmp, &h)
   875		p256Square(&rr, &r)
   876		p256Diff(xOut, &rr, &j)
   877		p256Diff(xOut, xOut, &v)
   878		p256Diff(xOut, xOut, &v)
   879	
   880		p256Diff(&tmp, &v, xOut)
   881		p256Mul(yOut, &tmp, &r)
   882		p256Mul(&tmp, y1, &j)
   883		p256Diff(yOut, yOut, &tmp)
   884		p256Diff(yOut, yOut, &tmp)
   885	}
   886	
   887	// p256PointAdd sets {xOut,yOut,zOut} = {x1,y1,z1} + {x2,y2,z2}.
   888	//
   889	// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   890	//
   891	// Note that this function does not handle P+P, infinity+P nor P+infinity
   892	// correctly.
   893	func p256PointAdd(xOut, yOut, zOut, x1, y1, z1, x2, y2, z2 *[p256Limbs]uint32) {
   894		var z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp [p256Limbs]uint32
   895	
   896		p256Square(&z1z1, z1)
   897		p256Square(&z2z2, z2)
   898		p256Mul(&u1, x1, &z2z2)
   899	
   900		p256Sum(&tmp, z1, z2)
   901		p256Square(&tmp, &tmp)
   902		p256Diff(&tmp, &tmp, &z1z1)
   903		p256Diff(&tmp, &tmp, &z2z2)
   904	
   905		p256Mul(&z2z2z2, z2, &z2z2)
   906		p256Mul(&s1, y1, &z2z2z2)
   907	
   908		p256Mul(&u2, x2, &z1z1)
   909		p256Mul(&z1z1z1, z1, &z1z1)
   910		p256Mul(&s2, y2, &z1z1z1)
   911		p256Diff(&h, &u2, &u1)
   912		p256Sum(&i, &h, &h)
   913		p256Square(&i, &i)
   914		p256Mul(&j, &h, &i)
   915		p256Diff(&r, &s2, &s1)
   916		p256Sum(&r, &r, &r)
   917		p256Mul(&v, &u1, &i)
   918	
   919		p256Mul(zOut, &tmp, &h)
   920		p256Square(&rr, &r)
   921		p256Diff(xOut, &rr, &j)
   922		p256Diff(xOut, xOut, &v)
   923		p256Diff(xOut, xOut, &v)
   924	
   925		p256Diff(&tmp, &v, xOut)
   926		p256Mul(yOut, &tmp, &r)
   927		p256Mul(&tmp, &s1, &j)
   928		p256Diff(yOut, yOut, &tmp)
   929		p256Diff(yOut, yOut, &tmp)
   930	}
   931	
   932	// p256CopyConditional sets out=in if mask = 0xffffffff in constant time.
   933	//
   934	// On entry: mask is either 0 or 0xffffffff.
   935	func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
   936		for i := 0; i < p256Limbs; i++ {
   937			tmp := mask & (in[i] ^ out[i])
   938			out[i] ^= tmp
   939		}
   940	}
   941	
   942	// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
   943	// On entry: index < 16, table[0] must be zero.
   944	func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
   945		for i := range xOut {
   946			xOut[i] = 0
   947		}
   948		for i := range yOut {
   949			yOut[i] = 0
   950		}
   951	
   952		for i := uint32(1); i < 16; i++ {
   953			mask := i ^ index
   954			mask |= mask >> 2
   955			mask |= mask >> 1
   956			mask &= 1
   957			mask--
   958			for j := range xOut {
   959				xOut[j] |= table[0] & mask
   960				table = table[1:]
   961			}
   962			for j := range yOut {
   963				yOut[j] |= table[0] & mask
   964				table = table[1:]
   965			}
   966		}
   967	}
   968	
   969	// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
   970	// table.
   971	// On entry: index < 16, table[0] must be zero.
   972	func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
   973		for i := range xOut {
   974			xOut[i] = 0
   975		}
   976		for i := range yOut {
   977			yOut[i] = 0
   978		}
   979		for i := range zOut {
   980			zOut[i] = 0
   981		}
   982	
   983		// The implicit value at index 0 is all zero. We don't need to perform that
   984		// iteration of the loop because we already set out_* to zero.
   985		for i := uint32(1); i < 16; i++ {
   986			mask := i ^ index
   987			mask |= mask >> 2
   988			mask |= mask >> 1
   989			mask &= 1
   990			mask--
   991			for j := range xOut {
   992				xOut[j] |= table[i][0][j] & mask
   993			}
   994			for j := range yOut {
   995				yOut[j] |= table[i][1][j] & mask
   996			}
   997			for j := range zOut {
   998				zOut[j] |= table[i][2][j] & mask
   999			}
  1000		}
  1001	}
  1002	
  1003	// p256GetBit returns the bit'th bit of scalar.
  1004	func p256GetBit(scalar *[32]uint8, bit uint) uint32 {
  1005		return uint32(((scalar[bit>>3]) >> (bit & 7)) & 1)
  1006	}
  1007	
  1008	// p256ScalarBaseMult sets {xOut,yOut,zOut} = scalar*G where scalar is a
  1009	// little-endian number. Note that the value of scalar must be less than the
  1010	// order of the group.
  1011	func p256ScalarBaseMult(xOut, yOut, zOut *[p256Limbs]uint32, scalar *[32]uint8) {
  1012		nIsInfinityMask := ^uint32(0)
  1013		var pIsNoninfiniteMask, mask, tableOffset uint32
  1014		var px, py, tx, ty, tz [p256Limbs]uint32
  1015	
  1016		for i := range xOut {
  1017			xOut[i] = 0
  1018		}
  1019		for i := range yOut {
  1020			yOut[i] = 0
  1021		}
  1022		for i := range zOut {
  1023			zOut[i] = 0
  1024		}
  1025	
  1026		// The loop adds bits at positions 0, 64, 128 and 192, followed by
  1027		// positions 32,96,160 and 224 and does this 32 times.
  1028		for i := uint(0); i < 32; i++ {
  1029			if i != 0 {
  1030				p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1031			}
  1032			tableOffset = 0
  1033			for j := uint(0); j <= 32; j += 32 {
  1034				bit0 := p256GetBit(scalar, 31-i+j)
  1035				bit1 := p256GetBit(scalar, 95-i+j)
  1036				bit2 := p256GetBit(scalar, 159-i+j)
  1037				bit3 := p256GetBit(scalar, 223-i+j)
  1038				index := bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3)
  1039	
  1040				p256SelectAffinePoint(&px, &py, p256Precomputed[tableOffset:], index)
  1041				tableOffset += 30 * p256Limbs
  1042	
  1043				// Since scalar is less than the order of the group, we know that
  1044				// {xOut,yOut,zOut} != {px,py,1}, unless both are zero, which we handle
  1045				// below.
  1046				p256PointAddMixed(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py)
  1047				// The result of pointAddMixed is incorrect if {xOut,yOut,zOut} is zero
  1048				// (a.k.a.  the point at infinity). We handle that situation by
  1049				// copying the point from the table.
  1050				p256CopyConditional(xOut, &px, nIsInfinityMask)
  1051				p256CopyConditional(yOut, &py, nIsInfinityMask)
  1052				p256CopyConditional(zOut, &p256One, nIsInfinityMask)
  1053	
  1054				// Equally, the result is also wrong if the point from the table is
  1055				// zero, which happens when the index is zero. We handle that by
  1056				// only copying from {tx,ty,tz} to {xOut,yOut,zOut} if index != 0.
  1057				pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1058				mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1059				p256CopyConditional(xOut, &tx, mask)
  1060				p256CopyConditional(yOut, &ty, mask)
  1061				p256CopyConditional(zOut, &tz, mask)
  1062				// If p was not zero, then n is now non-zero.
  1063				nIsInfinityMask &^= pIsNoninfiniteMask
  1064			}
  1065		}
  1066	}
  1067	
  1068	// p256PointToAffine converts a Jacobian point to an affine point. If the input
  1069	// is the point at infinity then it returns (0, 0) in constant time.
  1070	func p256PointToAffine(xOut, yOut, x, y, z *[p256Limbs]uint32) {
  1071		var zInv, zInvSq [p256Limbs]uint32
  1072	
  1073		p256Invert(&zInv, z)
  1074		p256Square(&zInvSq, &zInv)
  1075		p256Mul(xOut, x, &zInvSq)
  1076		p256Mul(&zInv, &zInv, &zInvSq)
  1077		p256Mul(yOut, y, &zInv)
  1078	}
  1079	
  1080	// p256ToAffine returns a pair of *big.Int containing the affine representation
  1081	// of {x,y,z}.
  1082	func p256ToAffine(x, y, z *[p256Limbs]uint32) (xOut, yOut *big.Int) {
  1083		var xx, yy [p256Limbs]uint32
  1084		p256PointToAffine(&xx, &yy, x, y, z)
  1085		return p256ToBig(&xx), p256ToBig(&yy)
  1086	}
  1087	
  1088	// p256ScalarMult sets {xOut,yOut,zOut} = scalar*{x,y}.
  1089	func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8) {
  1090		var px, py, pz, tx, ty, tz [p256Limbs]uint32
  1091		var precomp [16][3][p256Limbs]uint32
  1092		var nIsInfinityMask, index, pIsNoninfiniteMask, mask uint32
  1093	
  1094		// We precompute 0,1,2,... times {x,y}.
  1095		precomp[1][0] = *x
  1096		precomp[1][1] = *y
  1097		precomp[1][2] = p256One
  1098	
  1099		for i := 2; i < 16; i += 2 {
  1100			p256PointDouble(&precomp[i][0], &precomp[i][1], &precomp[i][2], &precomp[i/2][0], &precomp[i/2][1], &precomp[i/2][2])
  1101			p256PointAddMixed(&precomp[i+1][0], &precomp[i+1][1], &precomp[i+1][2], &precomp[i][0], &precomp[i][1], &precomp[i][2], x, y)
  1102		}
  1103	
  1104		for i := range xOut {
  1105			xOut[i] = 0
  1106		}
  1107		for i := range yOut {
  1108			yOut[i] = 0
  1109		}
  1110		for i := range zOut {
  1111			zOut[i] = 0
  1112		}
  1113		nIsInfinityMask = ^uint32(0)
  1114	
  1115		// We add in a window of four bits each iteration and do this 64 times.
  1116		for i := 0; i < 64; i++ {
  1117			if i != 0 {
  1118				p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1119				p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1120				p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1121				p256PointDouble(xOut, yOut, zOut, xOut, yOut, zOut)
  1122			}
  1123	
  1124			index = uint32(scalar[31-i/2])
  1125			if (i & 1) == 1 {
  1126				index &= 15
  1127			} else {
  1128				index >>= 4
  1129			}
  1130	
  1131			// See the comments in scalarBaseMult about handling infinities.
  1132			p256SelectJacobianPoint(&px, &py, &pz, &precomp, index)
  1133			p256PointAdd(&tx, &ty, &tz, xOut, yOut, zOut, &px, &py, &pz)
  1134			p256CopyConditional(xOut, &px, nIsInfinityMask)
  1135			p256CopyConditional(yOut, &py, nIsInfinityMask)
  1136			p256CopyConditional(zOut, &pz, nIsInfinityMask)
  1137	
  1138			pIsNoninfiniteMask = nonZeroToAllOnes(index)
  1139			mask = pIsNoninfiniteMask & ^nIsInfinityMask
  1140			p256CopyConditional(xOut, &tx, mask)
  1141			p256CopyConditional(yOut, &ty, mask)
  1142			p256CopyConditional(zOut, &tz, mask)
  1143			nIsInfinityMask &^= pIsNoninfiniteMask
  1144		}
  1145	}
  1146	
  1147	// p256FromBig sets out = R*in.
  1148	func p256FromBig(out *[p256Limbs]uint32, in *big.Int) {
  1149		tmp := new(big.Int).Lsh(in, 257)
  1150		tmp.Mod(tmp, p256Params.P)
  1151	
  1152		for i := 0; i < p256Limbs; i++ {
  1153			if bits := tmp.Bits(); len(bits) > 0 {
  1154				out[i] = uint32(bits[0]) & bottom29Bits
  1155			} else {
  1156				out[i] = 0
  1157			}
  1158			tmp.Rsh(tmp, 29)
  1159	
  1160			i++
  1161			if i == p256Limbs {
  1162				break
  1163			}
  1164	
  1165			if bits := tmp.Bits(); len(bits) > 0 {
  1166				out[i] = uint32(bits[0]) & bottom28Bits
  1167			} else {
  1168				out[i] = 0
  1169			}
  1170			tmp.Rsh(tmp, 28)
  1171		}
  1172	}
  1173	
  1174	// p256ToBig returns a *big.Int containing the value of in.
  1175	func p256ToBig(in *[p256Limbs]uint32) *big.Int {
  1176		result, tmp := new(big.Int), new(big.Int)
  1177	
  1178		result.SetInt64(int64(in[p256Limbs-1]))
  1179		for i := p256Limbs - 2; i >= 0; i-- {
  1180			if (i & 1) == 0 {
  1181				result.Lsh(result, 29)
  1182			} else {
  1183				result.Lsh(result, 28)
  1184			}
  1185			tmp.SetInt64(int64(in[i]))
  1186			result.Add(result, tmp)
  1187		}
  1188	
  1189		result.Mul(result, p256RInverse)
  1190		result.Mod(result, p256Params.P)
  1191		return result
  1192	}
  1193
View as plain text