Text file src/pkg/crypto/elliptic/p256_asm_arm64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file contains constant-time, 64-bit assembly implementation of
     6	// P256. The optimizations performed here are described in detail in:
     7	// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     8	//                          256-bit primes"
     9	// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    10	// https://eprint.iacr.org/2013/816.pdf
    11	
    12	#include "textflag.h"
    13	
      	// Register aliases used by every routine in this file.
      	// res_ptr/a_ptr/b_ptr hold pointers loaded from the Go argument frame;
      	// p256Sqr and p256OrdSqr also use b_ptr as their iteration counter.
    14	#define res_ptr R0
    15	#define a_ptr R1
    16	#define b_ptr R2
    17	
      	// acc0-acc3: low accumulator words of a product / reduction.
    18	#define acc0 R3
    19	#define acc1 R4
    20	#define acc2 R5
    21	#define acc3 R6
    22	
      	// acc4-acc7: high accumulator words. t0-t3: scratch words.
      	// const0/const1 hold p256const0/p256const1 in the field routines and
      	// the two low words of p256ord in the p256Ord* routines.
    23	#define acc4 R7
    24	#define acc5 R8
    25	#define acc6 R9
    26	#define acc7 R10
    27	#define t0 R11
    28	#define t1 R12
    29	#define t2 R13
    30	#define t3 R14
    31	#define const0 R15
    32	#define const1 R16
    33	
      	// hlp1 is the SAME register as res_ptr (R0): a routine that uses hlp1
      	// must (re)load the result pointer from the frame before storing, as
      	// p256OrdSqr and p256OrdMul do.
    34	#define hlp0 R17
    35	#define hlp1 res_ptr
    36	
      	// x0-x3 / y0-y3: the two four-word operands of the *Internal routines.
    37	#define x0 R19
    38	#define x1 R20
    39	#define x2 R21
    40	#define x3 R22
    41	#define y0 R23
    42	#define y1 R24
    43	#define y2 R25
    44	#define y3 R26
    45	
      	// const2/const3 are the SAME registers as t2/t3. The p256Ord* routines
      	// keep the two high words of p256ord in them, so t2/t3 are not free
      	// scratch there until the final conditional subtraction.
    46	#define const2 t2
    47	#define const3 t3
    48	
      	// Read-only constants. Multi-word values are stored least significant
      	// 64-bit word first.
      	// p256const0 / p256const1: words 1 and 3 of the value p256NegCond calls
      	// "poly"; its words 0 and 2 are the immediates -1 and 0.
      	// p256ordK0: per-step multiplier of the reductions in p256OrdSqr/p256OrdMul.
      	// p256ord: the four-word modulus those two routines reduce by.
      	// p256one: value p256PointAddAffineAsm stores as z3 in its "z3 = 1" case
      	// (NOTE(review): equals 2^256 mod poly, i.e. 1 in Montgomery form — confirm).
    49	DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    50	DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
    51	DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
    52	DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    53	DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    54	DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    55	DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
    56	DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    57	DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    58	DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    59	DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
      	// Symbol flag 8 with sizes of 8 or 32 bytes
      	// (NOTE(review): 8 is RODATA in textflag.h — confirm).
    60	GLOBL p256const0<>(SB), 8, $8
    61	GLOBL p256const1<>(SB), 8, $8
    62	GLOBL p256ordK0<>(SB), 8, $8
    63	GLOBL p256ord<>(SB), 8, $32
    64	GLOBL p256one<>(SB), 8, $32
    65	
    66	/* ---------------------------------------*/
    67	// func p256LittleToBig(res []byte, in []uint64)
      	// Both conversions take two slice headers at the same frame offsets and
      	// perform the same word-reversing byte swap, so this entry point just
      	// branches to p256BigToLittle without setting up anything of its own.
    68	TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    69		B	·p256BigToLittle(SB)
    70	/* ---------------------------------------*/
    71	// func p256BigToLittle(res []uint64, in []byte)
      	// Reads 32 bytes from in as four 64-bit words, byte-reverses each word
      	// and writes the words to res in the opposite order. All four loads
      	// complete before the first store.
    72	TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    73		MOVD	in+24(FP), a_ptr
    74		MOVD	res+0(FP), res_ptr
    75	
    76		LDP	1*16(a_ptr), (acc2, acc3)
    77		LDP	0*16(a_ptr), (acc0, acc1)
    78	
    79		REV	acc3, t0	// last input word becomes output word 0
    80		REV	acc2, t1
    81		REV	acc1, t2
    82		REV	acc0, t3	// first input word becomes output word 3
    83	
    84		STP	(t0, t1), 0*16(res_ptr)
    85		STP	(t2, t3), 1*16(res_ptr)
    86		RET
    87	/* ---------------------------------------*/
    88	// func p256MovCond(res, a, b []uint64, cond int)
    89	// res = a when cond != 0, res = b when cond == 0.
      	// Twelve 64-bit words (96 bytes) are copied in two 48-byte halves. Both
      	// sources are always loaded; the choice is made per word with CSEL.
    90	TEXT ·p256MovCond(SB),NOSPLIT,$0
    91		MOVD	cond+72(FP), R3
    92		MOVD	b+48(FP), b_ptr
    93		MOVD	a+24(FP), a_ptr
    94		MOVD	res+0(FP), res_ptr
    95	
    96		CMP	$0, R3	// the only flag-setting instruction; all CSELs below use it
    97		// Two remarks:
    98		// 1) Will want to revisit NEON, when support is better
    99		// 2) CSEL might not be constant time on all ARM processors
   100		LDP	0*16(b_ptr), (R16, R17)
   101		LDP	1*16(b_ptr), (R19, R20)
   102		LDP	2*16(b_ptr), (R21, R22)
   103		LDP	0*16(a_ptr), (R4, R5)
   104		LDP	1*16(a_ptr), (R6, R7)
   105		LDP	2*16(a_ptr), (R8, R9)
   106		CSEL	NE, R4, R16, R4
   107		CSEL	NE, R5, R17, R5
   108		CSEL	NE, R6, R19, R6
   109		CSEL	NE, R7, R20, R7
   110		CSEL	NE, R8, R21, R8
   111		CSEL	NE, R9, R22, R9
   112		STP	(R4, R5), 0*16(res_ptr)
   113		STP	(R6, R7), 1*16(res_ptr)
   114		STP	(R8, R9), 2*16(res_ptr)
   115	
   116		LDP	3*16(b_ptr), (R16, R17)
   117		LDP	4*16(b_ptr), (R19, R20)
   118		LDP	5*16(b_ptr), (R21, R22)
   119		LDP	3*16(a_ptr), (R4, R5)
   120		LDP	4*16(a_ptr), (R6, R7)
   121		LDP	5*16(a_ptr), (R8, R9)
   122		CSEL	NE, R4, R16, R4
   123		CSEL	NE, R5, R17, R5
   124		CSEL	NE, R6, R19, R6
   125		CSEL	NE, R7, R20, R7
   126		CSEL	NE, R8, R21, R8
   127		CSEL	NE, R9, R22, R9
   128		STP	(R4, R5), 3*16(res_ptr)
   129		STP	(R6, R7), 4*16(res_ptr)
   130		STP	(R8, R9), 5*16(res_ptr)
   131	
   132		RET
   133	/* ---------------------------------------*/
   134	// func p256NegCond(val []uint64, cond int)
      	// Conditionally negates the four-word value in place: when cond != 0,
      	// val is replaced by poly - val; when cond == 0 it is written back
      	// unchanged. poly is (-1, p256const0, 0, p256const1), least significant
      	// word first. The subtraction is always computed; CSEL picks the result.
   135	TEXT ·p256NegCond(SB),NOSPLIT,$0
   136		MOVD	val+0(FP), a_ptr
   137		MOVD	cond+24(FP), hlp0
   138		MOVD	a_ptr, res_ptr	// result overwrites the input
   139		// acc = poly
   140		MOVD	$-1, acc0
   141		MOVD	p256const0<>(SB), acc1
   142		MOVD	$0, acc2
   143		MOVD	p256const1<>(SB), acc3
   144		// Load the original value
   145		LDP	0*16(a_ptr), (t0, t1)
   146		LDP	1*16(a_ptr), (t2, t3)
   147		// Speculatively subtract
   148		SUBS	t0, acc0	// acc = poly - val, borrow propagated through the chain
   149		SBCS	t1, acc1
   150		SBCS	t2, acc2
   151		SBC	t3, acc3
   152		// If condition is 0, keep original value
   153		CMP	$0, hlp0
   154		CSEL	EQ, t0, acc0, acc0
   155		CSEL	EQ, t1, acc1, acc1
   156		CSEL	EQ, t2, acc2, acc2
   157		CSEL	EQ, t3, acc3, acc3
   158		// Store result
   159		STP	(acc0, acc1), 0*16(res_ptr)
   160		STP	(acc2, acc3), 1*16(res_ptr)
   161	
   162		RET
   163	/* ---------------------------------------*/
   164	// func p256Sqr(res, in []uint64, n int)
      	// Runs p256SqrInternal n times, feeding each result back in as the next
      	// input, and stores the final four words to res. The counter is
      	// decremented before it is tested and n == 0 is not special-cased.
   165	TEXT ·p256Sqr(SB),NOSPLIT,$0
   166		MOVD	n+48(FP), b_ptr	// b_ptr counts the remaining squarings
   167		MOVD	in+24(FP), a_ptr
   168		MOVD	res+0(FP), res_ptr
   169	
   170		MOVD	p256const1<>(SB), const1
   171		MOVD	p256const0<>(SB), const0
   172	
   173		LDP	1*16(a_ptr), (x2, x3)
   174		LDP	0*16(a_ptr), (x0, x1)
   175	
   176	sqrAgain:
   177		CALL	p256SqrInternal<>(SB)	// (y0..y3) = square of (x0..x3); leaves b_ptr alone
   178		SUB	$1, b_ptr
   179		MOVD	y3, x3
   180		MOVD	y2, x2
   181		MOVD	y1, x1
   182		MOVD	y0, x0
   183		CBNZ	b_ptr, sqrAgain
   184	
   185		STP	(y2, y3), 1*16(res_ptr)
   186		STP	(y0, y1), 0*16(res_ptr)
   187		RET
   188	/* ---------------------------------------*/
   189	// func p256Mul(res, in1, in2 []uint64)
      	// Loads in1 into x0..x3 and in2 into y0..y3, calls p256MulInternal and
      	// stores the four-word product it leaves in y0..y3 to res.
   190	TEXT ·p256Mul(SB),NOSPLIT,$0
   191		MOVD	in2+48(FP), b_ptr
   192		MOVD	in1+24(FP), a_ptr
   193		MOVD	res+0(FP), res_ptr
   194	
   195		MOVD	p256const1<>(SB), const1
   196		MOVD	p256const0<>(SB), const0
   197	
   198		LDP	0*16(b_ptr), (y0, y1)
   199		LDP	1*16(b_ptr), (y2, y3)
   200	
   201		LDP	0*16(a_ptr), (x0, x1)
   202		LDP	1*16(a_ptr), (x2, x3)
   203	
   204		CALL	p256MulInternal<>(SB)
   205	
   206		STP	(y2, y3), 1*16(res_ptr)
   207		STP	(y0, y1), 0*16(res_ptr)
   208		RET
   209	/* ---------------------------------------*/
   210	// func p256FromMont(res, in []uint64)
      	// Applies the four reduction steps of the field multiplication to the
      	// four input words without multiplying by a second operand, then
      	// subtracts poly once if that does not borrow, and stores the result.
      	// Each step folds the lowest live word into the words above it; the
      	// folded word's register is reused for the new top word.
   211	TEXT ·p256FromMont(SB),NOSPLIT,$0
   212		MOVD	res+0(FP), res_ptr
   213		MOVD	in+24(FP), a_ptr
   214	
   215		MOVD	p256const0<>(SB), const0
   216		MOVD	p256const1<>(SB), const1
   217	
   218		LDP	0*16(a_ptr), (acc0, acc1)
   219		LDP	1*16(a_ptr), (acc2, acc3)
   220		// Only reduce, no multiplications are needed
   221		// First reduction step
   222		ADDS	acc0<<32, acc1, acc1
   223		LSR	$32, acc0, t0
   224		MUL	acc0, const1, t1
   225		UMULH	acc0, const1, acc0
   226		ADCS	t0, acc2
   227		ADCS	t1, acc3
   228		ADC	$0, acc0	// acc0 now holds the new top word
   229		// Second reduction step
   230		ADDS	acc1<<32, acc2, acc2
   231		LSR	$32, acc1, t0
   232		MUL	acc1, const1, t1
   233		UMULH	acc1, const1, acc1
   234		ADCS	t0, acc3
   235		ADCS	t1, acc0
   236		ADC	$0, acc1
   237		// Third reduction step
   238		ADDS	acc2<<32, acc3, acc3
   239		LSR	$32, acc2, t0
   240		MUL	acc2, const1, t1
   241		UMULH	acc2, const1, acc2
   242		ADCS	t0, acc0
   243		ADCS	t1, acc1
   244		ADC	$0, acc2
   245		// Last reduction step
   246		ADDS	acc3<<32, acc0, acc0
   247		LSR	$32, acc3, t0
   248		MUL	acc3, const1, t1
   249		UMULH	acc3, const1, acc3
   250		ADCS	t0, acc1
   251		ADCS	t1, acc2
   252		ADC	$0, acc3
   253	
      		// t = acc - poly; carry set afterwards means no borrow occurred.
   254		SUBS	$-1, acc0, t0
   255		SBCS	const0, acc1, t1
   256		SBCS	$0, acc2, t2
   257		SBCS	const1, acc3, t3
   258	
      		// Keep the subtracted value only when the subtraction did not borrow.
   259		CSEL	CS, t0, acc0, acc0
   260		CSEL	CS, t1, acc1, acc1
   261		CSEL	CS, t2, acc2, acc2
   262		CSEL	CS, t3, acc3, acc3
   263	
   264		STP	(acc0, acc1), 0*16(res_ptr)
   265		STP	(acc2, acc3), 1*16(res_ptr)
   266	
   267		RET
   268	/* ---------------------------------------*/
   269	// Constant time point access to arbitrary point table.
   270	// The loop reads 16 entries of 12 words (96 bytes) each and keeps entry k
   271	// (k = 1..16) when idx == k; idx == 0 matches nothing and yields all zeros.
   272	// func p256Select(point, table []uint64, idx int)
   273	TEXT ·p256Select(SB),NOSPLIT,$0
   274		MOVD	point+0(FP), res_ptr
   275		MOVD	table+24(FP), b_ptr
   276		MOVD	idx+48(FP), const0
   277	
   278		MOVD	$0, x0
   279		MOVD	$0, x1
   280		MOVD	$0, x2
   281		MOVD	$0, x3
   282		MOVD	$0, y0
   283		MOVD	$0, y1
   284		MOVD	$0, y2
   285		MOVD	$0, y3
   286		MOVD	$0, t0
   287		MOVD	$0, t1
   288		MOVD	$0, t2
   289		MOVD	$0, t3
   290	
   291		EOR	const1, const1, const1	// const1 = number of the entry being read
   292	
   293	select_next:
   294			ADD	$1, const1
   295			CMP	const0, const1	// EQ only on the entry whose number is idx
   296			LDP.P	16(b_ptr), (acc0, acc1)
   297			CSEL	NE, x0, acc0, x0
   298			CSEL	NE, x1, acc1, x1
   299			LDP.P	16(b_ptr), (acc2, acc3)
   300			CSEL	NE, x2, acc2, x2
   301			CSEL	NE, x3, acc3, x3
   302			LDP.P	16(b_ptr), (acc4, acc5)
   303			CSEL	NE, y0, acc4, y0
   304			CSEL	NE, y1, acc5, y1
   305			LDP.P	16(b_ptr), (acc6, acc7)
   306			CSEL	NE, y2, acc6, y2
   307			CSEL	NE, y3, acc7, y3
   308			LDP.P	16(b_ptr), (acc0, acc1)
   309			CSEL	NE, t0, acc0, t0
   310			CSEL	NE, t1, acc1, t1
   311			LDP.P	16(b_ptr), (acc2, acc3)
   312			CSEL	NE, t2, acc2, t2
   313			CSEL	NE, t3, acc3, t3
   314	
   315			CMP	$16, const1
   316			BNE	select_next
   317	
   318		STP	(x0, x1), 0*16(res_ptr)
   319		STP	(x2, x3), 1*16(res_ptr)
   320		STP	(y0, y1), 2*16(res_ptr)
   321		STP	(y2, y3), 3*16(res_ptr)
   322		STP	(t0, t1), 4*16(res_ptr)
   323		STP	(t2, t3), 5*16(res_ptr)
   324		RET
   325	/* ---------------------------------------*/
   326	// Constant time point access to base point table.
      	// The loop reads 32 entries of 8 words (64 bytes) each and keeps entry k
      	// (k = 1..32) when idx == k; any other idx yields all zeros.
   327	// func p256SelectBase(point, table []uint64, idx int)
   328	TEXT ·p256SelectBase(SB),NOSPLIT,$0
   329		MOVD	point+0(FP), res_ptr
   330		MOVD	table+24(FP), t1
   331		MOVD	idx+48(FP), t0
   332	
   333		MOVD	$0, x0
   334		MOVD	$0, x1
   335		MOVD	$0, x2
   336		MOVD	$0, x3
   337		MOVD	$0, y0
   338		MOVD	$0, y1
   339		MOVD	$0, y2
   340		MOVD	$0, y3
   341	
   342		EOR	t2, t2, t2	// t2 = number of the entry being read
   343	
   344	selectbase_next:
   345			ADD	$1, t2
   346			CMP	t0, t2	// EQ only on the entry whose number is idx
   347			LDP.P	16(t1), (acc0, acc1)
   348			CSEL	NE, x0, acc0, x0
   349			CSEL	NE, x1, acc1, x1
   350			LDP.P	16(t1), (acc2, acc3)
   351			CSEL	NE, x2, acc2, x2
   352			CSEL	NE, x3, acc3, x3
   353			LDP.P	16(t1), (acc4, acc5)
   354			CSEL	NE, y0, acc4, y0
   355			CSEL	NE, y1, acc5, y1
   356			LDP.P	16(t1), (acc6, acc7)
   357			CSEL	NE, y2, acc6, y2
   358			CSEL	NE, y3, acc7, y3
   359	
   360			CMP	$32, t2
   361			BNE	selectbase_next
   362	
   363		STP	(x0, x1), 0*16(res_ptr)
   364		STP	(x2, x3), 1*16(res_ptr)
   365		STP	(y0, y1), 2*16(res_ptr)
   366		STP	(y2, y3), 3*16(res_ptr)
   367		RET
   368	/* ---------------------------------------*/
   369	// func p256OrdSqr(res, in []uint64, n int)
      	// Repeats n times: square the four-word value in x0..x3 into the eight
      	// words acc0..acc7, run four reduction steps that use p256ordK0 and the
      	// four words of p256ord, add the upper half, and subtract p256ord once
      	// if that does not borrow. The final x0..x3 are stored to res.
      	// Register aliasing to keep in mind while reading:
      	//   hlp1 is res_ptr (R0) and holds p256ordK0, so res is loaded only at the end;
      	//   const2/const3 are t2/t3 and hold the upper words of p256ord;
      	//   y0 is used as a scratch word inside the reduction steps.
      	// The counter is decremented before it is tested; n == 0 is not special-cased.
   370	TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   371		MOVD	in+24(FP), a_ptr
   372		MOVD	n+48(FP), b_ptr
   373	
   374		MOVD	p256ordK0<>(SB), hlp1
   375		LDP	p256ord<>+0x00(SB), (const0, const1)
   376		LDP	p256ord<>+0x10(SB), (const2, const3)
   377	
   378		LDP	0*16(a_ptr), (x0, x1)
   379		LDP	1*16(a_ptr), (x2, x3)
   380	
   381	ordSqrLoop:
   382		SUB	$1, b_ptr
   383	
      		// Cross products x[i]*x[j] for i < j, accumulated in acc1..acc6.
   384		// x[1:] * x[0]
   385		MUL	x0, x1, acc1
   386		UMULH	x0, x1, acc2
   387	
   388		MUL	x0, x2, t0
   389		ADDS	t0, acc2, acc2
   390		UMULH	x0, x2, acc3
   391	
   392		MUL	x0, x3, t0
   393		ADCS	t0, acc3, acc3
   394		UMULH	x0, x3, acc4
   395		ADC	$0, acc4, acc4
   396		// x[2:] * x[1]
   397		MUL	x1, x2, t0
   398		ADDS	t0, acc3
   399		UMULH	x1, x2, t1
   400		ADCS	t1, acc4
   401		ADC	$0, ZR, acc5
   402	
   403		MUL	x1, x3, t0
   404		ADDS	t0, acc4
   405		UMULH	x1, x3, t1
   406		ADC	t1, acc5
   407		// x[3] * x[2]
   408		MUL	x2, x3, t0
   409		ADDS	t0, acc5
   410		UMULH	x2, x3, acc6
   411		ADC	$0, acc6
   412	
   413		MOVD	$0, acc7
   414		// *2
   415		ADDS	acc1, acc1
   416		ADCS	acc2, acc2
   417		ADCS	acc3, acc3
   418		ADCS	acc4, acc4
   419		ADCS	acc5, acc5
   420		ADCS	acc6, acc6
   421		ADC	$0, acc7
   422		// Missing products
      		// (the squares x[i]*x[i], added along the diagonal)
   423		MUL	x0, x0, acc0
   424		UMULH	x0, x0, t0
   425		ADDS	t0, acc1, acc1
   426	
   427		MUL	x1, x1, t0
   428		ADCS	t0, acc2, acc2
   429		UMULH	x1, x1, t1
   430		ADCS	t1, acc3, acc3
   431	
   432		MUL	x2, x2, t0
   433		ADCS	t0, acc4, acc4
   434		UMULH	x2, x2, t1
   435		ADCS	t1, acc5, acc5
   436	
   437		MUL	x3, x3, t0
   438		ADCS	t0, acc6, acc6
   439		UMULH	x3, x3, t1
   440		ADC	t1, acc7, acc7
   441		// First reduction step
      		// hlp0 = low word of acc0 * K0 is this step's multiplier for p256ord.
      		// NOTE(review): the next MUL uses hlp1 (K0), not hlp0. The sum the
      		// following ADDS writes to acc0 is never read (acc0 is overwritten by
      		// a UMULH below), so only its carry-out is consumed — confirm that
      		// the carry matches the hlp0 form. Every reduction step in this
      		// function follows the same pattern.
   442		MUL	acc0, hlp1, hlp0
   443	
   444		MUL	const0, hlp1, t0
   445		ADDS	t0, acc0, acc0
   446		UMULH	const0, hlp0, t1
   447	
   448		MUL	const1, hlp0, t0
   449		ADCS	t0, acc1, acc1
   450		UMULH	const1, hlp0, y0
   451	
   452		MUL	const2, hlp0, t0
   453		ADCS	t0, acc2, acc2
   454		UMULH	const2, hlp0, acc0
   455	
   456		MUL	const3, hlp0, t0
   457		ADCS	t0, acc3, acc3
   458	
   459		UMULH	const3, hlp0, hlp0
   460		ADC	$0, hlp0
   461	
      		// Second pass: add the high halves of the partial products.
   462		ADDS	t1, acc1, acc1
   463		ADCS	y0, acc2, acc2
   464		ADCS	acc0, acc3, acc3
   465		ADC	$0, hlp0, acc0	// acc0 becomes the new top word
   466		// Second reduction step
   467		MUL	acc1, hlp1, hlp0
   468	
   469		MUL	const0, hlp1, t0
   470		ADDS	t0, acc1, acc1
   471		UMULH	const0, hlp0, t1
   472	
   473		MUL	const1, hlp0, t0
   474		ADCS	t0, acc2, acc2
   475		UMULH	const1, hlp0, y0
   476	
   477		MUL	const2, hlp0, t0
   478		ADCS	t0, acc3, acc3
   479		UMULH	const2, hlp0, acc1
   480	
   481		MUL	const3, hlp0, t0
   482		ADCS	t0, acc0, acc0
   483	
   484		UMULH	const3, hlp0, hlp0
   485		ADC	$0, hlp0
   486	
   487		ADDS	t1, acc2, acc2
   488		ADCS	y0, acc3, acc3
   489		ADCS	acc1, acc0, acc0
   490		ADC	$0, hlp0, acc1
   491		// Third reduction step
   492		MUL	acc2, hlp1, hlp0
   493	
   494		MUL	const0, hlp1, t0
   495		ADDS	t0, acc2, acc2
   496		UMULH	const0, hlp0, t1
   497	
   498		MUL	const1, hlp0, t0
   499		ADCS	t0, acc3, acc3
   500		UMULH	const1, hlp0, y0
   501	
   502		MUL	const2, hlp0, t0
   503		ADCS	t0, acc0, acc0
   504		UMULH	const2, hlp0, acc2
   505	
   506		MUL	const3, hlp0, t0
   507		ADCS	t0, acc1, acc1
   508	
   509		UMULH	const3, hlp0, hlp0
   510		ADC	$0, hlp0
   511	
   512		ADDS	t1, acc3, acc3
   513		ADCS	y0, acc0, acc0
   514		ADCS	acc2, acc1, acc1
   515		ADC	$0, hlp0, acc2
   516	
   517		// Last reduction step
   518		MUL	acc3, hlp1, hlp0
   519	
   520		MUL	const0, hlp1, t0
   521		ADDS	t0, acc3, acc3
   522		UMULH	const0, hlp0, t1
   523	
   524		MUL	const1, hlp0, t0
   525		ADCS	t0, acc0, acc0
   526		UMULH	const1, hlp0, y0
   527	
   528		MUL	const2, hlp0, t0
   529		ADCS	t0, acc1, acc1
   530		UMULH	const2, hlp0, acc3
   531	
   532		MUL	const3, hlp0, t0
   533		ADCS	t0, acc2, acc2
   534	
   535		UMULH	const3, hlp0, hlp0
   536		ADC	$0, acc7	// unlike the earlier steps, the carry goes into acc7, which is added to acc3 below
   537	
   538		ADDS	t1, acc0, acc0
   539		ADCS	y0, acc1, acc1
   540		ADCS	acc3, acc2, acc2
   541		ADC	$0, hlp0, acc3
   542	
      		// Add the upper half of the square (acc4..acc7); acc4 receives the carry-out.
   543		ADDS	acc4, acc0, acc0
   544		ADCS	acc5, acc1, acc1
   545		ADCS	acc6, acc2, acc2
   546		ADCS	acc7, acc3, acc3
   547		ADC	$0, ZR, acc4
   548	
      		// y = acc - p256ord across five words; carry set means no borrow.
   549		SUBS	const0, acc0, y0
   550		SBCS	const1, acc1, y1
   551		SBCS	const2, acc2, y2
   552		SBCS	const3, acc3, y3
   553		SBCS	$0, acc4, acc4
   554	
      		// The selected value becomes the input of the next iteration.
   555		CSEL	CS, y0, acc0, x0
   556		CSEL	CS, y1, acc1, x1
   557		CSEL	CS, y2, acc2, x2
   558		CSEL	CS, y3, acc3, x3
   559	
   560		CBNZ	b_ptr, ordSqrLoop
   561	
   562		MOVD	res+0(FP), res_ptr	// R0 held K0 (hlp1) until here
   563		STP	(x0, x1), 0*16(res_ptr)
   564		STP	(x2, x3), 1*16(res_ptr)
   565	
   566		RET
   567	/* ---------------------------------------*/
   568	// func p256OrdMul(res, in1, in2 []uint64)
      	// Multiplies the four-word values in1 (x0..x3) and in2 (y0..y3),
      	// interleaving each row y[i]*x with one reduction step that uses
      	// p256ordK0 and the four words of p256ord; finally adds the upper words
      	// acc4..acc7, subtracts p256ord once if that does not borrow, and stores
      	// four words to res.
      	// Register aliasing to keep in mind while reading:
      	//   hlp1 is res_ptr (R0) and holds p256ordK0, so res is loaded only at the end;
      	//   const2/const3 are t2/t3 and hold the upper words of p256ord;
      	//   y0 and y1 are reused as scratch once their own rows have been consumed.
   569	TEXT ·p256OrdMul(SB),NOSPLIT,$0
   570		MOVD	in1+24(FP), a_ptr
   571		MOVD	in2+48(FP), b_ptr
   572	
   573		MOVD	p256ordK0<>(SB), hlp1
   574		LDP	p256ord<>+0x00(SB), (const0, const1)
   575		LDP	p256ord<>+0x10(SB), (const2, const3)
   576	
   577		LDP	0*16(a_ptr), (x0, x1)
   578		LDP	1*16(a_ptr), (x2, x3)
   579		LDP	0*16(b_ptr), (y0, y1)
   580		LDP	1*16(b_ptr), (y2, y3)
   581	
   582		// y[0] * x
   583		MUL	y0, x0, acc0
   584		UMULH	y0, x0, acc1
   585	
   586		MUL	y0, x1, t0
   587		ADDS	t0, acc1
   588		UMULH	y0, x1, acc2
   589	
   590		MUL	y0, x2, t0
   591		ADCS	t0, acc2
   592		UMULH	y0, x2, acc3
   593	
   594		MUL	y0, x3, t0
   595		ADCS	t0, acc3
   596		UMULH	y0, x3, acc4
   597		ADC	$0, acc4
   598		// First reduction step
      		// hlp0 = low word of acc0 * K0 is this step's multiplier for p256ord.
      		// NOTE(review): the next MUL uses hlp1 (K0), not hlp0. The sum the
      		// following ADDS writes to acc0 is never read (acc0 is overwritten by
      		// a UMULH below), so only its carry-out is consumed — confirm that
      		// the carry matches the hlp0 form. Every reduction step in this
      		// function follows the same pattern.
   599		MUL	acc0, hlp1, hlp0
   600	
   601		MUL	const0, hlp1, t0
   602		ADDS	t0, acc0, acc0
   603		UMULH	const0, hlp0, t1
   604	
   605		MUL	const1, hlp0, t0
   606		ADCS	t0, acc1, acc1
   607		UMULH	const1, hlp0, y0
   608	
   609		MUL	const2, hlp0, t0
   610		ADCS	t0, acc2, acc2
   611		UMULH	const2, hlp0, acc0
   612	
   613		MUL	const3, hlp0, t0
   614		ADCS	t0, acc3, acc3
   615	
   616		UMULH	const3, hlp0, hlp0
   617		ADC	$0, acc4	// carry of the low-half pass goes into the next upper word
   618	
   619		ADDS	t1, acc1, acc1
   620		ADCS	y0, acc2, acc2
   621		ADCS	acc0, acc3, acc3
   622		ADC	$0, hlp0, acc0	// acc0 is reused for this step's top word
   623		// y[1] * x
   624		MUL	y1, x0, t0
   625		ADDS	t0, acc1
   626		UMULH	y1, x0, t1
   627	
   628		MUL	y1, x1, t0
   629		ADCS	t0, acc2
   630		UMULH	y1, x1, hlp0
   631	
   632		MUL	y1, x2, t0
   633		ADCS	t0, acc3
   634		UMULH	y1, x2, y0
   635	
   636		MUL	y1, x3, t0
   637		ADCS	t0, acc4
   638		UMULH	y1, x3, y1	// last read of y1 as an operand; it is scratch from here on
   639		ADC	$0, ZR, acc5
   640	
   641		ADDS	t1, acc2
   642		ADCS	hlp0, acc3
   643		ADCS	y0, acc4
   644		ADC	y1, acc5
   645		// Second reduction step
   646		MUL	acc1, hlp1, hlp0
   647	
   648		MUL	const0, hlp1, t0
   649		ADDS	t0, acc1, acc1
   650		UMULH	const0, hlp0, t1
   651	
   652		MUL	const1, hlp0, t0
   653		ADCS	t0, acc2, acc2
   654		UMULH	const1, hlp0, y0
   655	
   656		MUL	const2, hlp0, t0
   657		ADCS	t0, acc3, acc3
   658		UMULH	const2, hlp0, acc1
   659	
   660		MUL	const3, hlp0, t0
   661		ADCS	t0, acc0, acc0
   662	
   663		UMULH	const3, hlp0, hlp0
   664		ADC	$0, acc5
   665	
   666		ADDS	t1, acc2, acc2
   667		ADCS	y0, acc3, acc3
   668		ADCS	acc1, acc0, acc0
   669		ADC	$0, hlp0, acc1
   670		// y[2] * x
   671		MUL	y2, x0, t0
   672		ADDS	t0, acc2
   673		UMULH	y2, x0, t1
   674	
   675		MUL	y2, x1, t0
   676		ADCS	t0, acc3
   677		UMULH	y2, x1, hlp0
   678	
   679		MUL	y2, x2, t0
   680		ADCS	t0, acc4
   681		UMULH	y2, x2, y0
   682	
   683		MUL	y2, x3, t0
   684		ADCS	t0, acc5
   685		UMULH	y2, x3, y1
   686		ADC	$0, ZR, acc6
   687	
   688		ADDS	t1, acc3
   689		ADCS	hlp0, acc4
   690		ADCS	y0, acc5
   691		ADC	y1, acc6
   692		// Third reduction step
   693		MUL	acc2, hlp1, hlp0
   694	
   695		MUL	const0, hlp1, t0
   696		ADDS	t0, acc2, acc2
   697		UMULH	const0, hlp0, t1
   698	
   699		MUL	const1, hlp0, t0
   700		ADCS	t0, acc3, acc3
   701		UMULH	const1, hlp0, y0
   702	
   703		MUL	const2, hlp0, t0
   704		ADCS	t0, acc0, acc0
   705		UMULH	const2, hlp0, acc2
   706	
   707		MUL	const3, hlp0, t0
   708		ADCS	t0, acc1, acc1
   709	
   710		UMULH	const3, hlp0, hlp0
   711		ADC	$0, acc6
   712	
   713		ADDS	t1, acc3, acc3
   714		ADCS	y0, acc0, acc0
   715		ADCS	acc2, acc1, acc1
   716		ADC	$0, hlp0, acc2
   717		// y[3] * x
   718		MUL	y3, x0, t0
   719		ADDS	t0, acc3
   720		UMULH	y3, x0, t1
   721	
   722		MUL	y3, x1, t0
   723		ADCS	t0, acc4
   724		UMULH	y3, x1, hlp0
   725	
   726		MUL	y3, x2, t0
   727		ADCS	t0, acc5
   728		UMULH	y3, x2, y0
   729	
   730		MUL	y3, x3, t0
   731		ADCS	t0, acc6
   732		UMULH	y3, x3, y1
   733		ADC	$0, ZR, acc7
   734	
   735		ADDS	t1, acc4
   736		ADCS	hlp0, acc5
   737		ADCS	y0, acc6
   738		ADC	y1, acc7
   739		// Last reduction step
   740		MUL	acc3, hlp1, hlp0
   741	
   742		MUL	const0, hlp1, t0
   743		ADDS	t0, acc3, acc3
   744		UMULH	const0, hlp0, t1
   745	
   746		MUL	const1, hlp0, t0
   747		ADCS	t0, acc0, acc0
   748		UMULH	const1, hlp0, y0
   749	
   750		MUL	const2, hlp0, t0
   751		ADCS	t0, acc1, acc1
   752		UMULH	const2, hlp0, acc3
   753	
   754		MUL	const3, hlp0, t0
   755		ADCS	t0, acc2, acc2
   756	
   757		UMULH	const3, hlp0, hlp0
   758		ADC	$0, acc7
   759	
   760		ADDS	t1, acc0, acc0
   761		ADCS	y0, acc1, acc1
   762		ADCS	acc3, acc2, acc2
   763		ADC	$0, hlp0, acc3
   764	
      		// Add the upper words acc4..acc7; acc4 receives the carry-out.
   765		ADDS	acc4, acc0, acc0
   766		ADCS	acc5, acc1, acc1
   767		ADCS	acc6, acc2, acc2
   768		ADCS	acc7, acc3, acc3
   769		ADC	$0, ZR, acc4
   770	
      		// t = acc - p256ord across five words; carry set means no borrow.
      		// t2/t3 are const2/const3: each is read as the subtrahend and
      		// overwritten with the difference by the same instruction.
   771		SUBS	const0, acc0, t0
   772		SBCS	const1, acc1, t1
   773		SBCS	const2, acc2, t2
   774		SBCS	const3, acc3, t3
   775		SBCS	$0, acc4, acc4
   776	
   777		CSEL	CS, t0, acc0, acc0
   778		CSEL	CS, t1, acc1, acc1
   779		CSEL	CS, t2, acc2, acc2
   780		CSEL	CS, t3, acc3, acc3
   781	
   782		MOVD	res+0(FP), res_ptr	// R0 held K0 (hlp1) until here
   783		STP	(acc0, acc1), 0*16(res_ptr)
   784		STP	(acc2, acc3), 1*16(res_ptr)
   785	
   786		RET
   787	/* ---------------------------------------*/
      	// p256SubInternal: register-only helper.
      	// In:  x0..x3, y0..y3, const0 = p256const0, const1 = p256const1.
      	// Out: x0..x3 = y - x, with poly added back when the subtraction borrowed.
      	// Writes acc0..acc7, t0 and the flags; y0..y3 are left unchanged.
   788	TEXT p256SubInternal<>(SB),NOSPLIT,$0
   789		SUBS	x0, y0, acc0	// acc0..acc3 = y - x
   790		SBCS	x1, y1, acc1
   791		SBCS	x2, y2, acc2
   792		SBCS	x3, y3, acc3
   793		SBC	$0, ZR, t0	// t0 = 0 - borrow, so bit 0 records whether a borrow occurred
   794	
      		// acc4..acc7 = (y - x) + poly, computed unconditionally.
   795		ADDS	$-1, acc0, acc4
   796		ADCS	const0, acc1, acc5
   797		ADCS	$0, acc2, acc6
   798		ADC	const1, acc3, acc7
   799	
   800		ANDS	$1, t0	// EQ when there was no borrow
   801		CSEL	EQ, acc0, acc4, x0
   802		CSEL	EQ, acc1, acc5, x1
   803		CSEL	EQ, acc2, acc6, x2
   804		CSEL	EQ, acc3, acc7, x3
   805	
   806		RET
   807	/* ---------------------------------------*/
      	// p256SqrInternal: register-only helper.
      	// In:  x0..x3, const0 = p256const0, const1 = p256const1.
      	// Out: y0..y3 = the square of x after four reduction steps and one
      	//      conditional subtraction of poly.
      	// Writes acc0..acc7, t0..t3 and the flags; x0..x3 are left unchanged.
   808	TEXT p256SqrInternal<>(SB),NOSPLIT,$0
      		// Cross products x[i]*x[j] for i < j, accumulated in acc1..acc6.
   809		// x[1:] * x[0]
   810		MUL	x0, x1, acc1
   811		UMULH	x0, x1, acc2
   812	
   813		MUL	x0, x2, t0
   814		ADDS	t0, acc2, acc2
   815		UMULH	x0, x2, acc3
   816	
   817		MUL	x0, x3, t0
   818		ADCS	t0, acc3, acc3
   819		UMULH	x0, x3, acc4
   820		ADC	$0, acc4, acc4
   821		// x[2:] * x[1]
   822		MUL	x1, x2, t0
   823		ADDS	t0, acc3
   824		UMULH	x1, x2, t1
   825		ADCS	t1, acc4
   826		ADC	$0, ZR, acc5
   827	
   828		MUL	x1, x3, t0
   829		ADDS	t0, acc4
   830		UMULH	x1, x3, t1
   831		ADC	t1, acc5
   832		// x[3] * x[2]
   833		MUL	x2, x3, t0
   834		ADDS	t0, acc5
   835		UMULH	x2, x3, acc6
   836		ADC	$0, acc6
   837	
   838		MOVD	$0, acc7
   839		// *2
   840		ADDS	acc1, acc1
   841		ADCS	acc2, acc2
   842		ADCS	acc3, acc3
   843		ADCS	acc4, acc4
   844		ADCS	acc5, acc5
   845		ADCS	acc6, acc6
   846		ADC	$0, acc7
   847		// Missing products
      		// (the squares x[i]*x[i], added along the diagonal)
   848		MUL	x0, x0, acc0
   849		UMULH	x0, x0, t0
   850		ADDS	t0, acc1, acc1
   851	
   852		MUL	x1, x1, t0
   853		ADCS	t0, acc2, acc2
   854		UMULH	x1, x1, t1
   855		ADCS	t1, acc3, acc3
   856	
   857		MUL	x2, x2, t0
   858		ADCS	t0, acc4, acc4
   859		UMULH	x2, x2, t1
   860		ADCS	t1, acc5, acc5
   861	
   862		MUL	x3, x3, t0
   863		ADCS	t0, acc6, acc6
   864		UMULH	x3, x3, t1
   865		ADCS	t1, acc7, acc7
      		// Each reduction step folds the lowest live word into the words above
      		// it and reuses that word's register for the new top word.
   866		// First reduction step
   867		ADDS	acc0<<32, acc1, acc1
   868		LSR	$32, acc0, t0
   869		MUL	acc0, const1, t1
   870		UMULH	acc0, const1, acc0
   871		ADCS	t0, acc2, acc2
   872		ADCS	t1, acc3, acc3
   873		ADC	$0, acc0, acc0
   874		// Second reduction step
   875		ADDS	acc1<<32, acc2, acc2
   876		LSR	$32, acc1, t0
   877		MUL	acc1, const1, t1
   878		UMULH	acc1, const1, acc1
   879		ADCS	t0, acc3, acc3
   880		ADCS	t1, acc0, acc0
   881		ADC	$0, acc1, acc1
   882		// Third reduction step
   883		ADDS	acc2<<32, acc3, acc3
   884		LSR	$32, acc2, t0
   885		MUL	acc2, const1, t1
   886		UMULH	acc2, const1, acc2
   887		ADCS	t0, acc0, acc0
   888		ADCS	t1, acc1, acc1
   889		ADC	$0, acc2, acc2
   890		// Last reduction step
   891		ADDS	acc3<<32, acc0, acc0
   892		LSR	$32, acc3, t0
   893		MUL	acc3, const1, t1
   894		UMULH	acc3, const1, acc3
   895		ADCS	t0, acc1, acc1
   896		ADCS	t1, acc2, acc2
   897		ADC	$0, acc3, acc3
   898		// Add bits [511:256] of the sqr result
   899		ADDS	acc4, acc0, acc0
   900		ADCS	acc5, acc1, acc1
   901		ADCS	acc6, acc2, acc2
   902		ADCS	acc7, acc3, acc3
   903		ADC	$0, ZR, acc4	// acc4 = carry-out, used as a fifth word below
   904	
      		// t = acc - poly across five words; carry set means no borrow.
   905		SUBS	$-1, acc0, t0
   906		SBCS	const0, acc1, t1
   907		SBCS	$0, acc2, t2
   908		SBCS	const1, acc3, t3
   909		SBCS	$0, acc4, acc4
   910	
   911		CSEL	CS, t0, acc0, y0
   912		CSEL	CS, t1, acc1, y1
   913		CSEL	CS, t2, acc2, y2
   914		CSEL	CS, t3, acc3, y3
   915		RET
   916	/* ---------------------------------------*/
      	// p256MulInternal: register-only helper.
      	// In:  x0..x3, y0..y3, const0 = p256const0, const1 = p256const1.
      	// Out: y0..y3 = the product of x and y after four reduction steps
      	//      (interleaved with the rows y[i]*x) and one conditional
      	//      subtraction of poly.
      	// Writes acc0..acc7, t0..t3, hlp0 and the flags; x0..x3 are left unchanged.
   917	TEXT p256MulInternal<>(SB),NOSPLIT,$0
   918		// y[0] * x
   919		MUL	y0, x0, acc0
   920		UMULH	y0, x0, acc1
   921	
   922		MUL	y0, x1, t0
   923		ADDS	t0, acc1
   924		UMULH	y0, x1, acc2
   925	
   926		MUL	y0, x2, t0
   927		ADCS	t0, acc2
   928		UMULH	y0, x2, acc3
   929	
   930		MUL	y0, x3, t0
   931		ADCS	t0, acc3
   932		UMULH	y0, x3, acc4
   933		ADC	$0, acc4
   934		// First reduction step
   935		ADDS	acc0<<32, acc1, acc1
   936		LSR	$32, acc0, t0
   937		MUL	acc0, const1, t1
   938		UMULH	acc0, const1, acc0
   939		ADCS	t0, acc2
   940		ADCS	t1, acc3
   941		ADC	$0, acc0	// acc0 is reused for this step's top word
   942		// y[1] * x
      		// Low halves are added in the first carry chain, high halves
      		// (t1, t2, t3, hlp0) in the second one below.
   943		MUL	y1, x0, t0
   944		ADDS	t0, acc1
   945		UMULH	y1, x0, t1
   946	
   947		MUL	y1, x1, t0
   948		ADCS	t0, acc2
   949		UMULH	y1, x1, t2
   950	
   951		MUL	y1, x2, t0
   952		ADCS	t0, acc3
   953		UMULH	y1, x2, t3
   954	
   955		MUL	y1, x3, t0
   956		ADCS	t0, acc4
   957		UMULH	y1, x3, hlp0
   958		ADC	$0, ZR, acc5
   959	
   960		ADDS	t1, acc2
   961		ADCS	t2, acc3
   962		ADCS	t3, acc4
   963		ADC	hlp0, acc5
   964		// Second reduction step
   965		ADDS	acc1<<32, acc2, acc2
   966		LSR	$32, acc1, t0
   967		MUL	acc1, const1, t1
   968		UMULH	acc1, const1, acc1
   969		ADCS	t0, acc3
   970		ADCS	t1, acc0
   971		ADC	$0, acc1
   972		// y[2] * x
   973		MUL	y2, x0, t0
   974		ADDS	t0, acc2
   975		UMULH	y2, x0, t1
   976	
   977		MUL	y2, x1, t0
   978		ADCS	t0, acc3
   979		UMULH	y2, x1, t2
   980	
   981		MUL	y2, x2, t0
   982		ADCS	t0, acc4
   983		UMULH	y2, x2, t3
   984	
   985		MUL	y2, x3, t0
   986		ADCS	t0, acc5
   987		UMULH	y2, x3, hlp0
   988		ADC	$0, ZR, acc6
   989	
   990		ADDS	t1, acc3
   991		ADCS	t2, acc4
   992		ADCS	t3, acc5
   993		ADC	hlp0, acc6
   994		// Third reduction step
   995		ADDS	acc2<<32, acc3, acc3
   996		LSR	$32, acc2, t0
   997		MUL	acc2, const1, t1
   998		UMULH	acc2, const1, acc2
   999		ADCS	t0, acc0
  1000		ADCS	t1, acc1
  1001		ADC	$0, acc2
  1002		// y[3] * x
  1003		MUL	y3, x0, t0
  1004		ADDS	t0, acc3
  1005		UMULH	y3, x0, t1
  1006	
  1007		MUL	y3, x1, t0
  1008		ADCS	t0, acc4
  1009		UMULH	y3, x1, t2
  1010	
  1011		MUL	y3, x2, t0
  1012		ADCS	t0, acc5
  1013		UMULH	y3, x2, t3
  1014	
  1015		MUL	y3, x3, t0
  1016		ADCS	t0, acc6
  1017		UMULH	y3, x3, hlp0
  1018		ADC	$0, ZR, acc7
  1019	
  1020		ADDS	t1, acc4
  1021		ADCS	t2, acc5
  1022		ADCS	t3, acc6
  1023		ADC	hlp0, acc7
  1024		// Last reduction step
  1025		ADDS	acc3<<32, acc0, acc0
  1026		LSR	$32, acc3, t0
  1027		MUL	acc3, const1, t1
  1028		UMULH	acc3, const1, acc3
  1029		ADCS	t0, acc1
  1030		ADCS	t1, acc2
  1031		ADC	$0, acc3
  1032		// Add bits [511:256] of the mul result
  1033		ADDS	acc4, acc0, acc0
  1034		ADCS	acc5, acc1, acc1
  1035		ADCS	acc6, acc2, acc2
  1036		ADCS	acc7, acc3, acc3
  1037		ADC	$0, ZR, acc4	// acc4 = carry-out, used as a fifth word below
  1038	
      		// t = acc - poly across five words; carry set means no borrow.
  1039		SUBS	$-1, acc0, t0
  1040		SBCS	const0, acc1, t1
  1041		SBCS	$0, acc2, t2
  1042		SBCS	const1, acc3, t3
  1043		SBCS	$0, acc4, acc4
  1044	
  1045		CSEL	CS, t0, acc0, y0
  1046		CSEL	CS, t1, acc1, y1
  1047		CSEL	CS, t2, acc2, y2
  1048		CSEL	CS, t3, acc3, y3
  1049		RET
  1050	/* ---------------------------------------*/
      	// p256MulBy2Inline: inline doubling, x = 2*y with one conditional
      	// subtraction of poly.
      	//   1. x0..x3 = y + y, carry-out captured in hlp0.
      	//   2. t0..t3 = x - poly, with the borrow continued through hlp0.
      	//   3. CC (borrow) keeps x; otherwise x takes the subtracted value.
      	// Expects const0 = p256const0 and const1 = p256const1.
      	// Writes x0..x3, t0..t3, hlp0 and the flags; y0..y3 are left unchanged.
  1051	#define p256MulBy2Inline       \
  1052		ADDS	y0, y0, x0;    \
  1053		ADCS	y1, y1, x1;    \
  1054		ADCS	y2, y2, x2;    \
  1055		ADCS	y3, y3, x3;    \
  1056		ADC	$0, ZR, hlp0;  \
  1057		SUBS	$-1, x0, t0;   \
  1058		SBCS	const0, x1, t1;\
  1059		SBCS	$0, x2, t2;    \
  1060		SBCS	const1, x3, t3;\
  1061		SBCS	$0, hlp0, hlp0;\
  1062		CSEL	CC, x0, t0, x0;\
  1063		CSEL	CC, x1, t1, x1;\
  1064		CSEL	CC, x2, t2, x2;\
  1065		CSEL	CC, x3, t3, x3;
  1066	/* ---------------------------------------*/
      	// Operand macros for the point routines. A point occupies 96 bytes: three
      	// 32-byte coordinates at byte offsets 0, 32 and 64.
      	// x1in/y1in/z1in address the point at a_ptr, x2in/z2in the point at b_ptr
      	// and x3out/y3out/z3out the point at res_ptr; off is a byte offset
      	// inside the coordinate.
  1067	#define x1in(off) (off)(a_ptr)
  1068	#define y1in(off) (off + 32)(a_ptr)
  1069	#define z1in(off) (off + 64)(a_ptr)
  1070	#define x2in(off) (off)(b_ptr)
  1071	#define z2in(off) (off + 64)(b_ptr)
  1072	#define x3out(off) (off)(res_ptr)
  1073	#define y3out(off) (off + 32)(res_ptr)
  1074	#define z3out(off) (off + 64)(res_ptr)
      	// LDx/LDy load and STx/STy store four words between the x0..x3 or y0..y3
      	// registers and the location named by src; src is the NAME of one of the
      	// one-argument address macros (it is expanded as src(0) and src(16)).
  1075	#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1076	#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1077	#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1078	#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1079	/* ---------------------------------------*/
      	// Stack temporaries for the point routines: consecutive 32-byte slots
      	// starting 8 bytes above RSP. Slot k lives at 32*k + 8 + off.
      	// Slots 0-7 end at byte 264, which matches the $264 frame declared by
      	// p256PointAddAffineAsm.
  1080	#define y2in(off)  (32*0 + 8 + off)(RSP)
  1081	#define s2(off)    (32*1 + 8 + off)(RSP)
  1082	#define z1sqr(off) (32*2 + 8 + off)(RSP)
  1083	#define h(off)	   (32*3 + 8 + off)(RSP)
  1084	#define r(off)	   (32*4 + 8 + off)(RSP)
  1085	#define hsqr(off)  (32*5 + 8 + off)(RSP)
  1086	#define rsqr(off)  (32*6 + 8 + off)(RSP)
  1087	#define hcub(off)  (32*7 + 8 + off)(RSP)
  1088	
      	// Slots 8-11 lie beyond a 264-byte frame.
      	// NOTE(review): presumably used by a routine with a larger frame that
      	// is defined further down the file — confirm.
  1089	#define z2sqr(off) (32*8 + 8 + off)(RSP)
  1090	#define s1(off) (32*9 + 8 + off)(RSP)
  1091	#define u1(off) (32*10 + 8 + off)(RSP)
  1092	#define u2(off) (32*11 + 8 + off)(RSP)
  1093	
  1094	// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
  1095	TEXT ·p256PointAddAffineAsm(SB),0,$264-96
  1096		MOVD	in1+24(FP), a_ptr
  1097		MOVD	in2+48(FP), b_ptr
  1098		MOVD	sign+72(FP), hlp0
  1099		MOVD	sel+80(FP), hlp1
  1100		MOVD	zero+88(FP), t2
  1101	
  1102		MOVD	$1, t0
  1103		CMP	$0, t2
  1104		CSEL	EQ, ZR, t0, t2
  1105		CMP	$0, hlp1
  1106		CSEL	EQ, ZR, t0, hlp1
  1107	
  1108		MOVD	p256const0<>(SB), const0
  1109		MOVD	p256const1<>(SB), const1
  1110		EOR	t2<<1, hlp1
  1111	
  1112		// Negate y2in based on sign
  1113		LDP	2*16(b_ptr), (y0, y1)
  1114		LDP	3*16(b_ptr), (y2, y3)
  1115		MOVD	$-1, acc0
  1116	
  1117		SUBS	y0, acc0, acc0
  1118		SBCS	y1, const0, acc1
  1119		SBCS	y2, ZR, acc2
  1120		SBCS	y3, const1, acc3
  1121		SBC	$0, ZR, t0
  1122	
  1123		ADDS	$-1, acc0, acc4
  1124		ADCS	const0, acc1, acc5
  1125		ADCS	$0, acc2, acc6
  1126		ADCS	const1, acc3, acc7
  1127		ADC	$0, t0, t0
  1128	
  1129		CMP	$0, t0
  1130		CSEL	EQ, acc4, acc0, acc0
  1131		CSEL	EQ, acc5, acc1, acc1
  1132		CSEL	EQ, acc6, acc2, acc2
  1133		CSEL	EQ, acc7, acc3, acc3
  1134		// If condition is 0, keep original value
  1135		CMP	$0, hlp0
  1136		CSEL	EQ, y0, acc0, y0
  1137		CSEL	EQ, y1, acc1, y1
  1138		CSEL	EQ, y2, acc2, y2
  1139		CSEL	EQ, y3, acc3, y3
  1140		// Store result
  1141		STy(y2in)
  1142		// Begin point add
  1143		LDx(z1in)
  1144		CALL	p256SqrInternal<>(SB)    // z1ˆ2
  1145		STy(z1sqr)
  1146	
  1147		LDx(x2in)
  1148		CALL	p256MulInternal<>(SB)    // x2 * z1ˆ2
  1149	
  1150		LDx(x1in)
  1151		CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1152		STx(h)
  1153	
  1154		LDy(z1in)
  1155		CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1156	
  1157		LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1158		LDP	5*16(a_ptr), (acc2, acc3)
  1159		ANDS	$1, hlp1, ZR
  1160		CSEL	EQ, acc0, y0, y0
  1161		CSEL	EQ, acc1, y1, y1
  1162		CSEL	EQ, acc2, y2, y2
  1163		CSEL	EQ, acc3, y3, y3
  1164		LDP	p256one<>+0x00(SB), (acc0, acc1)
  1165		LDP	p256one<>+0x10(SB), (acc2, acc3)
  1166		ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1167		CSEL	EQ, acc0, y0, y0
  1168		CSEL	EQ, acc1, y1, y1
  1169		CSEL	EQ, acc2, y2, y2
  1170		CSEL	EQ, acc3, y3, y3
  1171		LDx(z1in)
  1172		MOVD	res+0(FP), t0
  1173		STP	(y0, y1), 4*16(t0)
  1174		STP	(y2, y3), 5*16(t0)
  1175	
  1176		LDy(z1sqr)
  1177		CALL	p256MulInternal<>(SB)    // z1 ^ 3
  1178	
  1179		LDx(y2in)
  1180		CALL	p256MulInternal<>(SB)    // s2 = y2 * z1ˆ3
  1181		STy(s2)
  1182	
  1183		LDx(y1in)
  1184		CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1185		STx(r)
  1186	
  1187		CALL	p256SqrInternal<>(SB)    // rsqr = rˆ2
  1188		STy	(rsqr)
  1189	
  1190		LDx(h)
  1191		CALL	p256SqrInternal<>(SB)    // hsqr = hˆ2
  1192		STy(hsqr)
  1193	
  1194		CALL	p256MulInternal<>(SB)    // hcub = hˆ3
  1195		STy(hcub)
  1196	
  1197		LDx(y1in)
  1198		CALL	p256MulInternal<>(SB)    // y1 * hˆ3
  1199		STy(s2)
  1200	
  1201		LDP	hsqr(0*8), (x0, x1)
  1202		LDP	hsqr(2*8), (x2, x3)
  1203		LDP	0*16(a_ptr), (y0, y1)
  1204		LDP	1*16(a_ptr), (y2, y3)
  1205		CALL	p256MulInternal<>(SB)    // u1 * hˆ2
  1206		STP	(y0, y1), h(0*8)
  1207		STP	(y2, y3), h(2*8)
  1208	
  1209		p256MulBy2Inline               // u1 * hˆ2 * 2, inline
  1210	
  1211		LDy(rsqr)
  1212		CALL	p256SubInternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
  1213	
  1214		MOVD	x0, y0
  1215		MOVD	x1, y1
  1216		MOVD	x2, y2
  1217		MOVD	x3, y3
  1218		LDx(hcub)
  1219		CALL	p256SubInternal<>(SB)
  1220	
  1221		LDP	0*16(a_ptr), (acc0, acc1)
  1222		LDP	1*16(a_ptr), (acc2, acc3)
  1223		ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1224		CSEL	EQ, acc0, x0, x0
  1225		CSEL	EQ, acc1, x1, x1
  1226		CSEL	EQ, acc2, x2, x2
  1227		CSEL	EQ, acc3, x3, x3
  1228		LDP	0*16(b_ptr), (acc0, acc1)
  1229		LDP	1*16(b_ptr), (acc2, acc3)
  1230		ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1231		CSEL	EQ, acc0, x0, x0
  1232		CSEL	EQ, acc1, x1, x1
  1233		CSEL	EQ, acc2, x2, x2
  1234		CSEL	EQ, acc3, x3, x3
  1235		MOVD	res+0(FP), t0
  1236		STP	(x0, x1), 0*16(t0)
  1237		STP	(x2, x3), 1*16(t0)
  1238	
  1239		LDP	h(0*8), (y0, y1)
  1240		LDP	h(2*8), (y2, y3)
  1241		CALL	p256SubInternal<>(SB)
  1242	
  1243		LDP	r(0*8), (y0, y1)
  1244		LDP	r(2*8), (y2, y3)
  1245		CALL	p256MulInternal<>(SB)
  1246	
  1247		LDP	s2(0*8), (x0, x1)
  1248		LDP	s2(2*8), (x2, x3)
  1249		CALL	p256SubInternal<>(SB)
  1250		LDP	2*16(a_ptr), (acc0, acc1)
  1251		LDP	3*16(a_ptr), (acc2, acc3)
  1252		ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1253		CSEL	EQ, acc0, x0, x0
  1254		CSEL	EQ, acc1, x1, x1
  1255		CSEL	EQ, acc2, x2, x2
  1256		CSEL	EQ, acc3, x3, x3
  1257		LDP	y2in(0*8), (acc0, acc1)
  1258		LDP	y2in(2*8), (acc2, acc3)
  1259		ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1260		CSEL	EQ, acc0, x0, x0
  1261		CSEL	EQ, acc1, x1, x1
  1262		CSEL	EQ, acc2, x2, x2
  1263		CSEL	EQ, acc3, x3, x3
  1264		MOVD	res+0(FP), t0
  1265		STP	(x0, x1), 2*16(t0)
  1266		STP	(x2, x3), 3*16(t0)
  1267	
  1268		RET
  1269	
      	// p256AddInline: modular addition, x = x + y, followed by one conditional
      	// subtraction of the 256-bit modulus whose little-endian limbs are
      	// (-1, const0, 0, const1). With const0/const1 loaded from
      	// p256const0<>/p256const1<> (as the TEXT blocks in this file do), that
      	// modulus is the P-256 prime.
      	//   In:       x0..x3 and y0..y3, four 64-bit limbs each, x0/y0 least significant.
      	//   Out:      x0..x3.
      	//   Clobbers: t0..t3 (t2/t3 are the same registers as const2/const3),
      	//             hlp0 and the condition flags.
      	// Steps:
      	//   1. ADDS/ADCS add the limbs; ADC captures the carry-out of the 256-bit
      	//      sum in hlp0 (0 or 1).
      	//   2. SUBS/SBCS compute t = sum - modulus; the last SBCS extends the
      	//      subtraction through the carry word in hlp0.
      	//   3. If that subtraction borrowed (carry flag clear, CC) the sum was
      	//      already below the modulus and x is kept; otherwise x = t.
      	//      The choice is made with CSEL, so no branch is taken.
      	// NOTE(review): a single subtraction only fully reduces the result when
      	// both inputs are already below the modulus - confirm callers guarantee it.
  1270	#define p256AddInline          \
  1271		ADDS	y0, x0, x0;    \
  1272		ADCS	y1, x1, x1;    \
  1273		ADCS	y2, x2, x2;    \
  1274		ADCS	y3, x3, x3;    \
  1275		ADC	$0, ZR, hlp0;  \
  1276		SUBS	$-1, x0, t0;   \
  1277		SBCS	const0, x1, t1;\
  1278		SBCS	$0, x2, t2;    \
  1279		SBCS	const1, x3, t3;\
  1280		SBCS	$0, hlp0, hlp0;\
  1281		CSEL	CC, x0, t0, x0;\
  1282		CSEL	CC, x1, t1, x1;\
  1283		CSEL	CC, x2, t2, x2;\
  1284		CSEL	CC, x3, t3, x3;
  1285	
      	// Stack temporaries used by p256PointDoubleAsm below: four consecutive
      	// 32-byte slots (one 4 x 64-bit-limb value each) addressed off RSP,
      	// starting at offset 8. The last slot ends at 8 + 4*32 = 136, which is
      	// the frame size declared by that function. `off` is a byte offset
      	// inside the slot.
  1286	#define s(off)	(32*0 + 8 + off)(RSP)
  1287	#define m(off)	(32*1 + 8 + off)(RSP)
  1288	#define zsqr(off) (32*2 + 8 + off)(RSP)
  1289	#define tmp(off)  (32*3 + 8 + off)(RSP)
  1290	
  1291	//func p256PointDoubleAsm(res, in []uint64)
      	//
      	// Doubles the point read through `in` and stores the result through `res`.
      	// `in` is read as three 256-bit values (four 64-bit limbs each) at byte
      	// offsets 0, 32 and 64; they are called X, Y and Z in the comments below
      	// (presumably Jacobian coordinates - confirm against the Go caller).
      	//
      	// Frame: 136 bytes of locals (the s, m, zsqr and tmp slots), 48 bytes of
      	// arguments (two slice headers; only the data pointers are read).
      	//
      	// Helper conventions assumed by the step comments. They are taken from
      	// the annotated call sites elsewhere in this file; the helper bodies are
      	// not visible here - TODO confirm:
      	//   p256SqrInternal:   y = x^2
      	//   p256MulInternal:   y = x * y
      	//   p256SubInternal:   x = y - x
      	//   p256MulBy2Inline:  x = 2 * y
      	//   p256AddInline:     x = x + y
      	// LDx/LDy/STx/STy and x1in/y1in/z1in/x3out/y3out/z3out are macros defined
      	// earlier in the file (the *out macros presumably address off res_ptr at
      	// this point, since they are only redefined after this function).
  1292	TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
  1293		MOVD	res+0(FP), res_ptr            // data pointer of res
  1294		MOVD	in+24(FP), a_ptr              // data pointer of in (second slice header)
  1295	
  1296		MOVD	p256const0<>(SB), const0      // modulus limb 1
  1297		MOVD	p256const1<>(SB), const1      // modulus limb 3
  1298	
  1299		// Begin point double
  1300		LDP	4*16(a_ptr), (x0, x1)         // x = Z (byte offset 64)
  1301		LDP	5*16(a_ptr), (x2, x3)
  1302		CALL	p256SqrInternal<>(SB)         // y = Z^2
  1303		STP	(y0, y1), zsqr(0*8)           // zsqr = Z^2
  1304		STP	(y2, y3), zsqr(2*8)
  1305	
  1306		LDP	0*16(a_ptr), (x0, x1)         // x = X (byte offset 0)
  1307		LDP	1*16(a_ptr), (x2, x3)
  1308		p256AddInline                         // x = X + Z^2
  1309		STx(m)                                // m = X + Z^2 (temporary use of the slot)
  1310	
  1311		LDx(z1in)
  1312		LDy(y1in)
  1313		CALL	p256MulInternal<>(SB)         // y = Y * Z
  1314		p256MulBy2Inline                      // x = 2 * Y * Z
  1315		STx(z3out)                            // Z3 = 2 * Y * Z
  1316	
  1317		LDy(x1in)
  1318		LDx(zsqr)
  1319		CALL	p256SubInternal<>(SB)         // x = X - Z^2
  1320		LDy(m)
  1321		CALL	p256MulInternal<>(SB)         // y = (X - Z^2) * (X + Z^2)
  1322	
  1323		// Multiply by 3
  1324		p256MulBy2Inline                      // x = 2 * y
  1325		p256AddInline                         // x = 2 * y + y
  1326		STx(m)                                // m = 3 * (X - Z^2) * (X + Z^2)
  1327	
  1328		LDy(y1in)
  1329		p256MulBy2Inline                      // x = 2 * Y
  1330		CALL	p256SqrInternal<>(SB)         // y = 4 * Y^2
  1331		STy(s)                                // s = 4 * Y^2 (multiplied by X further down)
  1332		MOVD	y0, x0                        // x = y, to square it again
  1333		MOVD	y1, x1
  1334		MOVD	y2, x2
  1335		MOVD	y3, x3
  1336		CALL	p256SqrInternal<>(SB)         // y = 16 * Y^4
  1337	
  1338		// Divide by 2
      		// t = y + modulus, with the carry-out of the 256-bit sum kept in hlp0.
  1339		ADDS	$-1, y0, t0
  1340		ADCS	const0, y1, t1
  1341		ADCS	$0, y2, t2
  1342		ADCS	const1, y3, t3
  1343		ADC	$0, ZR, hlp0
  1344	
      		// If y is even (bit 0 clear -> EQ) halve y itself; otherwise halve
      		// y + modulus, which is even because the modulus is odd.
  1345		ANDS	$1, y0, ZR
  1346		CSEL	EQ, y0, t0, t0
  1347		CSEL	EQ, y1, t1, t1
  1348		CSEL	EQ, y2, t2, t2
  1349		CSEL	EQ, y3, t3, t3
  1350		AND	y0, hlp0, hlp0                // hlp0 is 0 or 1: keep the carry only when y is odd
  1351	
      		// Shift the value hlp0:t3:t2:t1:t0 right by one bit into y3..y0.
  1352		EXTR	$1, t0, t1, y0
  1353		EXTR	$1, t1, t2, y1
  1354		EXTR	$1, t2, t3, y2
  1355		EXTR	$1, t3, hlp0, y3
  1356		STy(y3out)                            // y3out = 8 * Y^4 (held here until the final step)
  1357	
  1358		LDx(x1in)
  1359		LDy(s)
  1360		CALL	p256MulInternal<>(SB)         // y = 4 * X * Y^2
  1361		STy(s)                                // s = 4 * X * Y^2
  1362		p256MulBy2Inline                      // x = 2 * s
  1363		STx(tmp)                              // tmp = 2 * s
  1364	
  1365		LDx(m)
  1366		CALL	p256SqrInternal<>(SB)         // y = m^2
  1367		LDx(tmp)
  1368		CALL	p256SubInternal<>(SB)         // x = m^2 - 2 * s
  1369	
  1370		STx(x3out)                            // X3 = m^2 - 2 * s
  1371	
  1372		LDy(s)
  1373		CALL	p256SubInternal<>(SB)         // x = s - X3
  1374	
  1375		LDy(m)
  1376		CALL	p256MulInternal<>(SB)         // y = m * (s - X3)
  1377	
  1378		LDx(y3out)                            // x = 8 * Y^4 stored above
  1379		CALL	p256SubInternal<>(SB)         // x = m * (s - X3) - 8 * Y^4
  1380		STx(y3out)                            // Y3
  1381		RET
  1382	/* ---------------------------------------*/
      	// Re-point the second-input and output addressing macros for
      	// p256PointAddAsm below. That function keeps the in2 data pointer in
      	// b_ptr and later reloads b_ptr with the res data pointer, so y2in and
      	// the x3out/y3out/z3out macros are all expressed relative to b_ptr:
      	// byte offsets 0, 32 and 64 select the first, second and third 256-bit
      	// value. The previous definitions of these names are made earlier in the
      	// file and are dropped here.
  1383	#undef y2in
  1384	#undef x3out
  1385	#undef y3out
  1386	#undef z3out
  1387	#define y2in(off) (off + 32)(b_ptr)
  1388	#define x3out(off) (off)(b_ptr)
  1389	#define y3out(off) (off + 32)(b_ptr)
  1390	#define z3out(off) (off + 64)(b_ptr)
  1391	//func p256PointAddAsm(res, in1, in2 []uint64) int
      	//
      	// Adds the points read through in1 and in2 and stores the sum through res.
      	// Each point is read/written as three 256-bit values (x, y, z) at byte
      	// offsets 0, 32 and 64.
      	//
      	// Returns 1 when both r = s2 - s1 and h = u2 - u1 are zero modulo the
      	// field prime (each is tested for being exactly 0 or exactly the
      	// modulus), and 0 otherwise. The stores to res are performed in either
      	// case.
      	//
      	// Frame: 392 bytes of locals, 80 bytes of arguments (three slice headers
      	// plus the int result at ret+72).
      	//
      	// Register notes:
      	//   - hlp1 is the same register as res_ptr (R0). It carries the result
      	//     flag, which is why res is later loaded into b_ptr instead.
      	//     NOTE(review): hlp1 must survive the helper calls made after it is
      	//     set - presumably none of the helpers write R0; confirm.
      	//   - t2 is the same register as const2; it is used as the constant 1
      	//     in the zero tests.
      	//
      	// Helper conventions assumed by the step comments. They follow the
      	// annotations already present on the call sites; the helper bodies are
      	// not visible here - TODO confirm:
      	//   p256SqrInternal: y = x^2      p256MulInternal:  y = x * y
      	//   p256SubInternal: x = y - x    p256MulBy2Inline: x = 2 * y
      	// z2sqr, s1, z1sqr, r, u1, u2, h, rsqr, hsqr, hcub and s2 are stack-slot
      	// macros defined earlier in the file.
  1392	TEXT ·p256PointAddAsm(SB),0,$392-80
  1393		// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1394		// Load the data pointers of the two input slices.
  1395		MOVD	in1+24(FP), a_ptr
  1396		MOVD	in2+48(FP), b_ptr
  1397	
  1398		MOVD	p256const0<>(SB), const0      // modulus limb 1
  1399		MOVD	p256const1<>(SB), const1      // modulus limb 3
  1400	
  1401		// Begin point add
  1402		LDx(z2in)
  1403		CALL	p256SqrInternal<>(SB)    // z2^2
  1404		STy(z2sqr)
  1405	
  1406		CALL	p256MulInternal<>(SB)    // z2^3
  1407	
  1408		LDx(y1in)
  1409		CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
  1410		STy(s1)
  1411	
  1412		LDx(z1in)
  1413		CALL	p256SqrInternal<>(SB)    // z1^2
  1414		STy(z1sqr)
  1415	
  1416		CALL	p256MulInternal<>(SB)    // z1^3
  1417	
  1418		LDx(y2in)
  1419		CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2
  1420	
  1421		LDx(s1)
  1422		CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1423		STx(r)
  1424	
      		// hlp1 = 1 if r (still in x0..x3) is all-zero, else 0.
  1425		MOVD	$1, t2
  1426		ORR	x0, x1, t0             // Check if zero mod p256
  1427		ORR	x2, x3, t1
  1428		ORR	t1, t0, t0
  1429		CMP	$0, t0
  1430		CSEL	EQ, t2, ZR, hlp1
  1431	
      		// Also accept r == modulus: XOR each limb with the matching modulus
      		// limb (-1, const0, 0, const1; limb 2 is 0 so x2 is used as is) and
      		// OR the differences together.
  1432		EOR	$-1, x0, t0
  1433		EOR	const0, x1, t1
  1434		EOR	const1, x3, t3
  1435	
  1436		ORR	t0, t1, t0
  1437		ORR	x2, t3, t1
  1438		ORR	t1, t0, t0
  1439		CMP	$0, t0
  1440		CSEL	EQ, t2, hlp1, hlp1     // hlp1 = 1 if r == modulus, else unchanged
  1441	
  1442		LDx(z2sqr)
  1443		LDy(x1in)
  1444		CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
  1445		STy(u1)
  1446	
  1447		LDx(z1sqr)
  1448		LDy(x2in)
  1449		CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1450		STy(u2)
  1451	
  1452		LDx(u1)
  1453		CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1454		STx(h)
  1455	
      		// Same zero-mod-p test as above, now on h; the flag goes to hlp0.
  1456		MOVD	$1, t2
  1457		ORR	x0, x1, t0             // Check if zero mod p256
  1458		ORR	x2, x3, t1
  1459		ORR	t1, t0, t0
  1460		CMP	$0, t0
  1461		CSEL	EQ, t2, ZR, hlp0
  1462	
  1463		EOR	$-1, x0, t0
  1464		EOR	const0, x1, t1
  1465		EOR	const1, x3, t3
  1466	
  1467		ORR	t0, t1, t0
  1468		ORR	x2, t3, t1
  1469		ORR	t1, t0, t0
  1470		CMP	$0, t0
  1471		CSEL	EQ, t2, hlp0, hlp0
  1472	
  1473		AND	hlp0, hlp1, hlp1       // hlp1 = (r is zero mod p) AND (h is zero mod p)
  1474	
  1475		LDx(r)
  1476		CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1477		STy(rsqr)
  1478	
  1479		LDx(h)
  1480		CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1481		STy(hsqr)
  1482	
  1483		LDx(h)
  1484		CALL	p256MulInternal<>(SB)    // hcub = h^3
  1485		STy(hcub)
  1486	
  1487		LDx(s1)
  1488		CALL	p256MulInternal<>(SB)    // s1 * h^3
  1489		STy(s2)                        // the s2 slot now holds s1 * h^3
  1490	
  1491		LDx(z1in)
  1492		LDy(z2in)
  1493		CALL	p256MulInternal<>(SB)    // z1 * z2
  1494		LDx(h)
  1495		CALL	p256MulInternal<>(SB)    // z1 * z2 * h
  1496		MOVD	res+0(FP), b_ptr       // in2 is not read again below; b_ptr now addresses res
  1497		STy(z3out)                     // z3 = z1 * z2 * h
  1498	
  1499		LDx(hsqr)
  1500		LDy(u1)
  1501		CALL	p256MulInternal<>(SB)    // h^2 * u1
  1502		STy(u2)                        // the u2 slot now holds u1 * h^2
  1503	
  1504		p256MulBy2Inline               // u1 * h^2 * 2, inline
  1505		LDy(rsqr)
  1506		CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1507	
  1508		MOVD	x0, y0                 // move the difference into y for the next subtraction
  1509		MOVD	x1, y1
  1510		MOVD	x2, y2
  1511		MOVD	x3, y3
  1512		LDx(hcub)
  1513		CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
  1514		STx(x3out)
  1515	
  1516		LDy(u2)
  1517		CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3
  1518	
  1519		LDy(r)
  1520		CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1521	
  1522		LDx(s2)
  1523		CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1524		STx(y3out)
  1525	
  1526		MOVD	hlp1, R0               // hlp1 is R0, so this is a register self-move
  1527		MOVD	R0, ret+72(FP)         // return the flag computed above
  1528	
  1529		RET
View as plain text