...

Text file src/crypto/elliptic/p256_asm_s390x.s

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	#include "go_asm.h"
     7	
     8	
     9	DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
    10	DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
    11	DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
    12	DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
    13	DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
    14	DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    15	DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    16	DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    17	DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    18	DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    19	DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    20	DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    21	DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    22	DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    23	DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    24	DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
    25	DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    26	DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
    27	DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    28	DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    29	DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    30	DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    31	DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    32	DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    33	DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    34	DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    35	DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    36	DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    37	DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    38	DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    39	DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    40	DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    41	DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    42	DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    43	DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    44	GLOBL p256ordK0<>(SB), 8, $4
    45	GLOBL p256ord<>(SB), 8, $32
    46	GLOBL p256<>(SB), 8, $80
    47	GLOBL p256mul<>(SB), 8, $160
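// Commentary on the pools above: p256<> and p256mul<> hold the P-256 prime
// P = 2^256 - 2^224 + 2^192 + 2^96 - 1 as two 128-bit halves (high half at
// +0x00, low half at +0x10).  The SEL entries are VPERM byte-index vectors
// (each byte picks one byte of the concatenated 32-byte source), used below
// to splice 32-bit words during reduction.  The value at p256mul<>+0x80 is
// 2^256 mod P = 2^224 - 2^192 - 2^96 + 1, i.e. the Montgomery form of 1.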
    48	
    49	DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718
    50	DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f
    51	DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718
    52	DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011
    53	DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f
    54	DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718
    55	DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011
    56	DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718
    57	DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a
    58	DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011
    59	DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011
    60	DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a
    61	DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203
    62	DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a
    63	DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a
    64	DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203
    65	GLOBL p256vmsl<>(SB), 8, $128
    66	
    67	// ---------------------------------------
    68	// iff cond == 1  val <- -val
    69	// func p256NegCond(val *p256Point, cond int)
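// Illustrative Go-style sketch of the contract above (commentary only, not
// the package's actual code): point negation replaces y with P-y, and the
// choice is made with a mask instead of a branch:
//
//	if cond == 1 {          // mask built by VCEQG below
//		y = P - y       // VSCBIQ/VSQ/VSBIQ compute the borrow chain
//	}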
    70	#define P1ptr   R1
    71	#define CPOOL   R4
    72	
    73	#define Y1L   V0
    74	#define Y1H   V1
    75	#define T1L   V2
    76	#define T1H   V3
    77	
    78	#define PL    V30
    79	#define PH    V31
    80	
    81	#define ZER   V4
    82	#define SEL1  V5
    83	#define CAR1  V6
    84	TEXT ·p256NegCond(SB), NOSPLIT, $0
    85		MOVD val+0(FP), P1ptr
    86	
    87		MOVD $p256mul<>+0x00(SB), CPOOL
    88		VL   16(CPOOL), PL
    89		VL   0(CPOOL), PH
    90	
    91		VL 32(P1ptr), Y1H
    92		VL 48(P1ptr), Y1L
    93	
    94		VLREPG cond+8(FP), SEL1
    95		VZERO  ZER
    96		VCEQG  SEL1, ZER, SEL1
    97	
    98		VSCBIQ Y1L, PL, CAR1
    99		VSQ    Y1L, PL, T1L
   100		VSBIQ  PH, Y1H, CAR1, T1H
   101	
   102		VSEL Y1L, T1L, SEL1, Y1L
   103		VSEL Y1H, T1H, SEL1, Y1H
   104	
   105		VST Y1H, 32(P1ptr)
   106		VST Y1L, 48(P1ptr)
   107		RET
   108	
   109	#undef P1ptr
   110	#undef CPOOL
   111	#undef Y1L
   112	#undef Y1H
   113	#undef T1L
   114	#undef T1H
   115	#undef PL
   116	#undef PH
   117	#undef ZER
   118	#undef SEL1
   119	#undef CAR1
   120	
   121	// ---------------------------------------
   122	// if cond == 0 res <- b; else res <- a
   123	// func p256MovCond(res, a, b *p256Point, cond int)
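// Illustrative sketch (commentary only): cond is broadcast and compared with
// zero to form an all-ones/all-zeros mask, and every limb of the result is
// picked with VSEL, so the move is branch-free and constant-time:
//
//	mask := allOnesIf(cond == 0)            // hypothetical helper
//	res   = (b AND mask) OR (a ANDNOT mask)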
   124	#define P3ptr   R1
   125	#define P1ptr   R2
   126	#define P2ptr   R3
   127	
   128	#define X1L    V0
   129	#define X1H    V1
   130	#define Y1L    V2
   131	#define Y1H    V3
   132	#define Z1L    V4
   133	#define Z1H    V5
   134	#define X2L    V6
   135	#define X2H    V7
   136	#define Y2L    V8
   137	#define Y2H    V9
   138	#define Z2L    V10
   139	#define Z2H    V11
   140	
   141	#define ZER   V18
   142	#define SEL1  V19
   143	TEXT ·p256MovCond(SB), NOSPLIT, $0
   144		MOVD   res+0(FP), P3ptr
   145		MOVD   a+8(FP), P1ptr
   146		MOVD   b+16(FP), P2ptr
   147		VLREPG cond+24(FP), SEL1
   148		VZERO  ZER
   149		VCEQG  SEL1, ZER, SEL1
   150	
   151		VL 0(P1ptr), X1H
   152		VL 16(P1ptr), X1L
   153		VL 32(P1ptr), Y1H
   154		VL 48(P1ptr), Y1L
   155		VL 64(P1ptr), Z1H
   156		VL 80(P1ptr), Z1L
   157	
   158		VL 0(P2ptr), X2H
   159		VL 16(P2ptr), X2L
   160		VL 32(P2ptr), Y2H
   161		VL 48(P2ptr), Y2L
   162		VL 64(P2ptr), Z2H
   163		VL 80(P2ptr), Z2L
   164	
   165		VSEL X2L, X1L, SEL1, X1L
   166		VSEL X2H, X1H, SEL1, X1H
   167		VSEL Y2L, Y1L, SEL1, Y1L
   168		VSEL Y2H, Y1H, SEL1, Y1H
   169		VSEL Z2L, Z1L, SEL1, Z1L
   170		VSEL Z2H, Z1H, SEL1, Z1H
   171	
   172		VST X1H, 0(P3ptr)
   173		VST X1L, 16(P3ptr)
   174		VST Y1H, 32(P3ptr)
   175		VST Y1L, 48(P3ptr)
   176		VST Z1H, 64(P3ptr)
   177		VST Z1L, 80(P3ptr)
   178	
   179		RET
   180	
   181	#undef P3ptr
   182	#undef P1ptr
   183	#undef P2ptr
   184	#undef X1L
   185	#undef X1H
   186	#undef Y1L
   187	#undef Y1H
   188	#undef Z1L
   189	#undef Z1H
   190	#undef X2L
   191	#undef X2H
   192	#undef Y2L
   193	#undef Y2H
   194	#undef Z2L
   195	#undef Z2H
   196	#undef ZER
   197	#undef SEL1
   198	
   199	// ---------------------------------------
   200	// Constant time table access
   201	// Indexed from 1 to 16, with -1 offset
   202	// (index 0 is implicitly point at infinity)
   203	// func p256Select(point *p256Point, table []p256Point, idx int)
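// Illustrative sketch (commentary only): every table entry is read and the
// matching one is folded in through a mask, so the memory access pattern is
// independent of idx:
//
//	for i := 1; i <= 16; i++ {
//		mask := allOnesIf(i == idx)      // hypothetical helper
//		point = (table[i-1] AND mask) OR (point ANDNOT mask)
//	}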
   204	#define P3ptr   R1
   205	#define P1ptr   R2
   206	#define COUNT   R4
   207	
   208	#define X1L    V0
   209	#define X1H    V1
   210	#define Y1L    V2
   211	#define Y1H    V3
   212	#define Z1L    V4
   213	#define Z1H    V5
   214	#define X2L    V6
   215	#define X2H    V7
   216	#define Y2L    V8
   217	#define Y2H    V9
   218	#define Z2L    V10
   219	#define Z2H    V11
   220	
   221	#define ONE   V18
   222	#define IDX   V19
   223	#define SEL1  V20
   224	#define SEL2  V21
   225	TEXT ·p256Select(SB), NOSPLIT, $0
   226		MOVD   point+0(FP), P3ptr
   227		MOVD   table+8(FP), P1ptr
   228		VLREPB idx+(32+7)(FP), IDX
   229		VREPIB $1, ONE
   230		VREPIB $1, SEL2
   231		MOVD   $1, COUNT
   232	
   233		VZERO X1H
   234		VZERO X1L
   235		VZERO Y1H
   236		VZERO Y1L
   237		VZERO Z1H
   238		VZERO Z1L
   239	
   240	loop_select:
   241		VL 0(P1ptr), X2H
   242		VL 16(P1ptr), X2L
   243		VL 32(P1ptr), Y2H
   244		VL 48(P1ptr), Y2L
   245		VL 64(P1ptr), Z2H
   246		VL 80(P1ptr), Z2L
   247	
   248		VCEQG SEL2, IDX, SEL1
   249	
   250		VSEL X2L, X1L, SEL1, X1L
   251		VSEL X2H, X1H, SEL1, X1H
   252		VSEL Y2L, Y1L, SEL1, Y1L
   253		VSEL Y2H, Y1H, SEL1, Y1H
   254		VSEL Z2L, Z1L, SEL1, Z1L
   255		VSEL Z2H, Z1H, SEL1, Z1H
   256	
   257		VAB  SEL2, ONE, SEL2
   258		ADDW $1, COUNT
   259		ADD  $96, P1ptr
   260		CMPW COUNT, $17
   261		BLT  loop_select
   262	
   263		VST X1H, 0(P3ptr)
   264		VST X1L, 16(P3ptr)
   265		VST Y1H, 32(P3ptr)
   266		VST Y1L, 48(P3ptr)
   267		VST Z1H, 64(P3ptr)
   268		VST Z1L, 80(P3ptr)
   269		RET
   270	
   271	#undef P3ptr
   272	#undef P1ptr
   273	#undef COUNT
   274	#undef X1L
   275	#undef X1H
   276	#undef Y1L
   277	#undef Y1H
   278	#undef Z1L
   279	#undef Z1H
   280	#undef X2L
   281	#undef X2H
   282	#undef Y2L
   283	#undef Y2H
   284	#undef Z2L
   285	#undef Z2H
   286	#undef ONE
   287	#undef IDX
   288	#undef SEL1
   289	#undef SEL2
   290	
   291	// ---------------------------------------
   292	// Constant time table access
   293	// Indexed from 1 to 64, with -1 offset
   294	// (index 0 is implicitly point at infinity)
   295	// func p256SelectBase(point *p256Point, table []p256Point, idx int)
   296	#define P3ptr   R1
   297	#define P1ptr   R2
   298	#define COUNT   R4
   299	
   300	#define X1L    V0
   301	#define X1H    V1
   302	#define Y1L    V2
   303	#define Y1H    V3
   304	#define Z1L    V4
   305	#define Z1H    V5
   306	#define X2L    V6
   307	#define X2H    V7
   308	#define Y2L    V8
   309	#define Y2H    V9
   310	#define Z2L    V10
   311	#define Z2H    V11
   312	
   313	#define ONE   V18
   314	#define IDX   V19
   315	#define SEL1  V20
   316	#define SEL2  V21
   317	TEXT ·p256SelectBase(SB), NOSPLIT, $0
   318		MOVD   point+0(FP), P3ptr
   319		MOVD   table+8(FP), P1ptr
   320		VLREPB idx+(32+7)(FP), IDX
   321		VREPIB $1, ONE
   322		VREPIB $1, SEL2
   323		MOVD   $1, COUNT
   324	
   325		VZERO X1H
   326		VZERO X1L
   327		VZERO Y1H
   328		VZERO Y1L
   329		VZERO Z1H
   330		VZERO Z1L
   331	
   332	loop_select:
   333		VL 0(P1ptr), X2H
   334		VL 16(P1ptr), X2L
   335		VL 32(P1ptr), Y2H
   336		VL 48(P1ptr), Y2L
   337		VL 64(P1ptr), Z2H
   338		VL 80(P1ptr), Z2L
   339	
   340		VCEQG SEL2, IDX, SEL1
   341	
   342		VSEL X2L, X1L, SEL1, X1L
   343		VSEL X2H, X1H, SEL1, X1H
   344		VSEL Y2L, Y1L, SEL1, Y1L
   345		VSEL Y2H, Y1H, SEL1, Y1H
   346		VSEL Z2L, Z1L, SEL1, Z1L
   347		VSEL Z2H, Z1H, SEL1, Z1H
   348	
   349		VAB  SEL2, ONE, SEL2
   350		ADDW $1, COUNT
   351		ADD  $96, P1ptr
   352		CMPW COUNT, $65
   353		BLT  loop_select
   354	
   355		VST X1H, 0(P3ptr)
   356		VST X1L, 16(P3ptr)
   357		VST Y1H, 32(P3ptr)
   358		VST Y1L, 48(P3ptr)
   359		VST Z1H, 64(P3ptr)
   360		VST Z1L, 80(P3ptr)
   361		RET
   362	
   363	#undef P3ptr
   364	#undef P1ptr
   365	#undef COUNT
   366	#undef X1L
   367	#undef X1H
   368	#undef Y1L
   369	#undef Y1H
   370	#undef Z1L
   371	#undef Z1H
   372	#undef X2L
   373	#undef X2H
   374	#undef Y2L
   375	#undef Y2H
   376	#undef Z2L
   377	#undef Z2H
   378	#undef ONE
   379	#undef IDX
   380	#undef SEL1
   381	#undef SEL2
   382	
   383	// ---------------------------------------
   384	// func p256FromMont(res, in []byte)
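// p256FromMont converts out of the Montgomery domain: res = in * 2^-256 mod P.
// Rough sketch of the four rounds below (commentary, not the actual code):
//
//	for i := 0; i < 4; i++ {
//		m := t mod 2^64         // works because P ≡ -1 (mod 2^64), so -P^-1 ≡ 1
//		t  = (t + m*P) / 2^64   // low 64 bits cancel; m*P is built with VPERM selects
//	}
//	if t >= P { t -= P }            // final conditional subtraction (VSCBIQ/VSEL)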
   385	#define res_ptr R1
   386	#define x_ptr   R2
   387	#define CPOOL   R4
   388	
   389	#define T0   V0
   390	#define T1   V1
   391	#define T2   V2
   392	#define TT0  V3
   393	#define TT1  V4
   394	
   395	#define ZER   V6
   396	#define SEL1  V7
   397	#define SEL2  V8
   398	#define CAR1  V9
   399	#define CAR2  V10
   400	#define RED1  V11
   401	#define RED2  V12
   402	#define PL    V13
   403	#define PH    V14
   404	
   405	TEXT ·p256FromMont(SB), NOSPLIT, $0
   406		MOVD res+0(FP), res_ptr
   407		MOVD in+24(FP), x_ptr
   408	
   409		VZERO T2
   410		VZERO ZER
   411		MOVD  $p256<>+0x00(SB), CPOOL
   412		VL    16(CPOOL), PL
   413		VL    0(CPOOL), PH
   414		VL    48(CPOOL), SEL2
   415		VL    64(CPOOL), SEL1
   416	
   417		VL (1*16)(x_ptr), T0
   418		VL (0*16)(x_ptr), T1
   419	
   420		// First round
   421		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   422		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   423		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   424	
   425		VSLDB $8, T1, T0, T0
   426		VSLDB $8, T2, T1, T1
   427	
   428		VACCQ  T0, RED1, CAR1
   429		VAQ    T0, RED1, T0
   430		VACCCQ T1, RED2, CAR1, CAR2
   431		VACQ   T1, RED2, CAR1, T1
   432		VAQ    T2, CAR2, T2
   433	
   434		// Second round
   435		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   436		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   437		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   438	
   439		VSLDB $8, T1, T0, T0
   440		VSLDB $8, T2, T1, T1
   441	
   442		VACCQ  T0, RED1, CAR1
   443		VAQ    T0, RED1, T0
   444		VACCCQ T1, RED2, CAR1, CAR2
   445		VACQ   T1, RED2, CAR1, T1
   446		VAQ    T2, CAR2, T2
   447	
   448		// Third round
   449		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   450		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   451		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   452	
   453		VSLDB $8, T1, T0, T0
   454		VSLDB $8, T2, T1, T1
   455	
   456		VACCQ  T0, RED1, CAR1
   457		VAQ    T0, RED1, T0
   458		VACCCQ T1, RED2, CAR1, CAR2
   459		VACQ   T1, RED2, CAR1, T1
   460		VAQ    T2, CAR2, T2
   461	
   462		// Last round
   463		VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   464		VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   465		VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   466	
   467		VSLDB $8, T1, T0, T0
   468		VSLDB $8, T2, T1, T1
   469	
   470		VACCQ  T0, RED1, CAR1
   471		VAQ    T0, RED1, T0
   472		VACCCQ T1, RED2, CAR1, CAR2
   473		VACQ   T1, RED2, CAR1, T1
   474		VAQ    T2, CAR2, T2
   475	
   476		// ---------------------------------------------------
   477	
   478		VSCBIQ  PL, T0, CAR1
   479		VSQ     PL, T0, TT0
   480		VSBCBIQ T1, PH, CAR1, CAR2
   481		VSBIQ   T1, PH, CAR1, TT1
   482		VSBIQ   T2, ZER, CAR2, T2
   483	
   484		// what output to use, TT1||TT0 or T1||T0?
   485		VSEL T0, TT0, T2, T0
   486		VSEL T1, TT1, T2, T1
   487	
   488		VST T0, (1*16)(res_ptr)
   489		VST T1, (0*16)(res_ptr)
   490		RET
   491	
   492	#undef res_ptr
   493	#undef x_ptr
   494	#undef CPOOL
   495	#undef T0
   496	#undef T1
   497	#undef T2
   498	#undef TT0
   499	#undef TT1
   500	#undef ZER
   501	#undef SEL1
   502	#undef SEL2
   503	#undef CAR1
   504	#undef CAR2
   505	#undef RED1
   506	#undef RED2
   507	#undef PL
   508	#undef PH
   509	
   510	// ---------------------------------------
   511	// func p256OrdMul(res, in1, in2 []byte)
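// p256OrdMul is Montgomery multiplication modulo the group order N:
// res = in1 * in2 * 2^-256 mod N.  N has no convenient structure, so the code
// relies on the precomputed constant p256ordK0 = -N^-1 mod 2^32 and consumes
// one 32-bit digit of in2 per block below.  Rough sketch (commentary only):
//
//	for each 32-bit digit yd of y {          // eight blocks below
//		t += x * yd
//		m := (t * K0) mod 2^32           // the VMLF ADD1, K0, MK0 step
//		t  = (t + m*N) / 2^32
//	}
//	if t >= N { t -= N }                     // final conditional subtraction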
   512	#define res_ptr R1
   513	#define x_ptr R2
   514	#define y_ptr R3
   515	#define X0    V0
   516	#define X1    V1
   517	#define Y0    V2
   518	#define Y1    V3
   519	#define M0    V4
   520	#define M1    V5
   521	#define T0    V6
   522	#define T1    V7
   523	#define T2    V8
   524	#define YDIG  V9
   525	
   526	#define ADD1  V16
   527	#define ADD1H V17
   528	#define ADD2  V18
   529	#define ADD2H V19
   530	#define RED1  V20
   531	#define RED1H V21
   532	#define RED2  V22
   533	#define RED2H V23
   534	#define CAR1  V24
   535	#define CAR1M V25
   536	
   537	#define MK0   V30
   538	#define K0    V31
   539	TEXT ·p256OrdMul(SB), NOSPLIT, $0
   540		MOVD res+0(FP), res_ptr
   541		MOVD in1+24(FP), x_ptr
   542		MOVD in2+48(FP), y_ptr
   543	
   544		VZERO T2
   545		MOVD  $p256ordK0<>+0x00(SB), R4
   546	
   547		// VLEF    $3, 0(R4), K0
   548		WORD $0xE7F40000
   549		BYTE $0x38
   550		BYTE $0x03
   551		MOVD $p256ord<>+0x00(SB), R4
   552		VL   16(R4), M0
   553		VL   0(R4), M1
   554	
   555		VL (1*16)(x_ptr), X0
   556		VL (0*16)(x_ptr), X1
   557		VL (1*16)(y_ptr), Y0
   558		VL (0*16)(y_ptr), Y1
   559	
   560		// ---------------------------------------------------------------------------/
   561		VREPF $3, Y0, YDIG
   562		VMLF  X0, YDIG, ADD1
   563		VMLF  ADD1, K0, MK0
   564		VREPF $3, MK0, MK0
   565	
   566		VMLF  X1, YDIG, ADD2
   567		VMLHF X0, YDIG, ADD1H
   568		VMLHF X1, YDIG, ADD2H
   569	
   570		VMALF  M0, MK0, ADD1, RED1
   571		VMALHF M0, MK0, ADD1, RED1H
   572		VMALF  M1, MK0, ADD2, RED2
   573		VMALHF M1, MK0, ADD2, RED2H
   574	
   575		VSLDB $12, RED2, RED1, RED1
   576		VSLDB $12, T2, RED2, RED2
   577	
   578		VACCQ RED1, ADD1H, CAR1
   579		VAQ   RED1, ADD1H, T0
   580		VACCQ RED1H, T0, CAR1M
   581		VAQ   RED1H, T0, T0
   582	
   583		// << ready for next MK0
   584	
   585		VACQ   RED2, ADD2H, CAR1, T1
   586		VACCCQ RED2, ADD2H, CAR1, CAR1
   587		VACCCQ RED2H, T1, CAR1M, T2
   588		VACQ   RED2H, T1, CAR1M, T1
   589		VAQ    CAR1, T2, T2
   590	
   591		// ---------------------------------------------------
   592	/* *
   593	 * ---+--------+--------+
   594	 *  T2|   T1   |   T0   |
   595	 * ---+--------+--------+
   596	 *           *(add)*
   597	 *    +--------+--------+
   598	 *    |   X1   |   X0   |
   599	 *    +--------+--------+
   600	 *           *(mul)*
   601	 *    +--------+--------+
   602	 *    |  YDIG  |  YDIG  |
   603	 *    +--------+--------+
   604	 *           *(add)*
   605	 *    +--------+--------+
   606	 *    |   M1   |   M0   |
   607	 *    +--------+--------+
   608	 *           *(mul)*
   609	 *    +--------+--------+
   610	 *    |   MK0  |   MK0  |
   611	 *    +--------+--------+
   612	 *
   613	 *   ---------------------
   614	 *
   615	 *    +--------+--------+
   616	 *    |  ADD2  |  ADD1  |
   617	 *    +--------+--------+
   618	 *  +--------+--------+
   619	 *  | ADD2H  | ADD1H  |
   620	 *  +--------+--------+
   621	 *    +--------+--------+
   622	 *    |  RED2  |  RED1  |
   623	 *    +--------+--------+
   624	 *  +--------+--------+
   625	 *  | RED2H  | RED1H  |
   626	 *  +--------+--------+
   627	 */
   628		VREPF $2, Y0, YDIG
   629		VMALF X0, YDIG, T0, ADD1
   630		VMLF  ADD1, K0, MK0
   631		VREPF $3, MK0, MK0
   632	
   633		VMALF  X1, YDIG, T1, ADD2
   634		VMALHF X0, YDIG, T0, ADD1H
   635		VMALHF X1, YDIG, T1, ADD2H
   636	
   637		VMALF  M0, MK0, ADD1, RED1
   638		VMALHF M0, MK0, ADD1, RED1H
   639		VMALF  M1, MK0, ADD2, RED2
   640		VMALHF M1, MK0, ADD2, RED2H
   641	
   642		VSLDB $12, RED2, RED1, RED1
   643		VSLDB $12, T2, RED2, RED2
   644	
   645		VACCQ RED1, ADD1H, CAR1
   646		VAQ   RED1, ADD1H, T0
   647		VACCQ RED1H, T0, CAR1M
   648		VAQ   RED1H, T0, T0
   649	
   650		// << ready for next MK0
   651	
   652		VACQ   RED2, ADD2H, CAR1, T1
   653		VACCCQ RED2, ADD2H, CAR1, CAR1
   654		VACCCQ RED2H, T1, CAR1M, T2
   655		VACQ   RED2H, T1, CAR1M, T1
   656		VAQ    CAR1, T2, T2
   657	
   658		// ---------------------------------------------------
   659		VREPF $1, Y0, YDIG
   660		VMALF X0, YDIG, T0, ADD1
   661		VMLF  ADD1, K0, MK0
   662		VREPF $3, MK0, MK0
   663	
   664		VMALF  X1, YDIG, T1, ADD2
   665		VMALHF X0, YDIG, T0, ADD1H
   666		VMALHF X1, YDIG, T1, ADD2H
   667	
   668		VMALF  M0, MK0, ADD1, RED1
   669		VMALHF M0, MK0, ADD1, RED1H
   670		VMALF  M1, MK0, ADD2, RED2
   671		VMALHF M1, MK0, ADD2, RED2H
   672	
   673		VSLDB $12, RED2, RED1, RED1
   674		VSLDB $12, T2, RED2, RED2
   675	
   676		VACCQ RED1, ADD1H, CAR1
   677		VAQ   RED1, ADD1H, T0
   678		VACCQ RED1H, T0, CAR1M
   679		VAQ   RED1H, T0, T0
   680	
   681		// << ready for next MK0
   682	
   683		VACQ   RED2, ADD2H, CAR1, T1
   684		VACCCQ RED2, ADD2H, CAR1, CAR1
   685		VACCCQ RED2H, T1, CAR1M, T2
   686		VACQ   RED2H, T1, CAR1M, T1
   687		VAQ    CAR1, T2, T2
   688	
   689		// ---------------------------------------------------
   690		VREPF $0, Y0, YDIG
   691		VMALF X0, YDIG, T0, ADD1
   692		VMLF  ADD1, K0, MK0
   693		VREPF $3, MK0, MK0
   694	
   695		VMALF  X1, YDIG, T1, ADD2
   696		VMALHF X0, YDIG, T0, ADD1H
   697		VMALHF X1, YDIG, T1, ADD2H
   698	
   699		VMALF  M0, MK0, ADD1, RED1
   700		VMALHF M0, MK0, ADD1, RED1H
   701		VMALF  M1, MK0, ADD2, RED2
   702		VMALHF M1, MK0, ADD2, RED2H
   703	
   704		VSLDB $12, RED2, RED1, RED1
   705		VSLDB $12, T2, RED2, RED2
   706	
   707		VACCQ RED1, ADD1H, CAR1
   708		VAQ   RED1, ADD1H, T0
   709		VACCQ RED1H, T0, CAR1M
   710		VAQ   RED1H, T0, T0
   711	
   712		// << ready for next MK0
   713	
   714		VACQ   RED2, ADD2H, CAR1, T1
   715		VACCCQ RED2, ADD2H, CAR1, CAR1
   716		VACCCQ RED2H, T1, CAR1M, T2
   717		VACQ   RED2H, T1, CAR1M, T1
   718		VAQ    CAR1, T2, T2
   719	
   720		// ---------------------------------------------------
   721		VREPF $3, Y1, YDIG
   722		VMALF X0, YDIG, T0, ADD1
   723		VMLF  ADD1, K0, MK0
   724		VREPF $3, MK0, MK0
   725	
   726		VMALF  X1, YDIG, T1, ADD2
   727		VMALHF X0, YDIG, T0, ADD1H
   728		VMALHF X1, YDIG, T1, ADD2H
   729	
   730		VMALF  M0, MK0, ADD1, RED1
   731		VMALHF M0, MK0, ADD1, RED1H
   732		VMALF  M1, MK0, ADD2, RED2
   733		VMALHF M1, MK0, ADD2, RED2H
   734	
   735		VSLDB $12, RED2, RED1, RED1
   736		VSLDB $12, T2, RED2, RED2
   737	
   738		VACCQ RED1, ADD1H, CAR1
   739		VAQ   RED1, ADD1H, T0
   740		VACCQ RED1H, T0, CAR1M
   741		VAQ   RED1H, T0, T0
   742	
   743		// << ready for next MK0
   744	
   745		VACQ   RED2, ADD2H, CAR1, T1
   746		VACCCQ RED2, ADD2H, CAR1, CAR1
   747		VACCCQ RED2H, T1, CAR1M, T2
   748		VACQ   RED2H, T1, CAR1M, T1
   749		VAQ    CAR1, T2, T2
   750	
   751		// ---------------------------------------------------
   752		VREPF $2, Y1, YDIG
   753		VMALF X0, YDIG, T0, ADD1
   754		VMLF  ADD1, K0, MK0
   755		VREPF $3, MK0, MK0
   756	
   757		VMALF  X1, YDIG, T1, ADD2
   758		VMALHF X0, YDIG, T0, ADD1H
   759		VMALHF X1, YDIG, T1, ADD2H
   760	
   761		VMALF  M0, MK0, ADD1, RED1
   762		VMALHF M0, MK0, ADD1, RED1H
   763		VMALF  M1, MK0, ADD2, RED2
   764		VMALHF M1, MK0, ADD2, RED2H
   765	
   766		VSLDB $12, RED2, RED1, RED1
   767		VSLDB $12, T2, RED2, RED2
   768	
   769		VACCQ RED1, ADD1H, CAR1
   770		VAQ   RED1, ADD1H, T0
   771		VACCQ RED1H, T0, CAR1M
   772		VAQ   RED1H, T0, T0
   773	
   774		// << ready for next MK0
   775	
   776		VACQ   RED2, ADD2H, CAR1, T1
   777		VACCCQ RED2, ADD2H, CAR1, CAR1
   778		VACCCQ RED2H, T1, CAR1M, T2
   779		VACQ   RED2H, T1, CAR1M, T1
   780		VAQ    CAR1, T2, T2
   781	
   782		// ---------------------------------------------------
   783		VREPF $1, Y1, YDIG
   784		VMALF X0, YDIG, T0, ADD1
   785		VMLF  ADD1, K0, MK0
   786		VREPF $3, MK0, MK0
   787	
   788		VMALF  X1, YDIG, T1, ADD2
   789		VMALHF X0, YDIG, T0, ADD1H
   790		VMALHF X1, YDIG, T1, ADD2H
   791	
   792		VMALF  M0, MK0, ADD1, RED1
   793		VMALHF M0, MK0, ADD1, RED1H
   794		VMALF  M1, MK0, ADD2, RED2
   795		VMALHF M1, MK0, ADD2, RED2H
   796	
   797		VSLDB $12, RED2, RED1, RED1
   798		VSLDB $12, T2, RED2, RED2
   799	
   800		VACCQ RED1, ADD1H, CAR1
   801		VAQ   RED1, ADD1H, T0
   802		VACCQ RED1H, T0, CAR1M
   803		VAQ   RED1H, T0, T0
   804	
   805		// << ready for next MK0
   806	
   807		VACQ   RED2, ADD2H, CAR1, T1
   808		VACCCQ RED2, ADD2H, CAR1, CAR1
   809		VACCCQ RED2H, T1, CAR1M, T2
   810		VACQ   RED2H, T1, CAR1M, T1
   811		VAQ    CAR1, T2, T2
   812	
   813		// ---------------------------------------------------
   814		VREPF $0, Y1, YDIG
   815		VMALF X0, YDIG, T0, ADD1
   816		VMLF  ADD1, K0, MK0
   817		VREPF $3, MK0, MK0
   818	
   819		VMALF  X1, YDIG, T1, ADD2
   820		VMALHF X0, YDIG, T0, ADD1H
   821		VMALHF X1, YDIG, T1, ADD2H
   822	
   823		VMALF  M0, MK0, ADD1, RED1
   824		VMALHF M0, MK0, ADD1, RED1H
   825		VMALF  M1, MK0, ADD2, RED2
   826		VMALHF M1, MK0, ADD2, RED2H
   827	
   828		VSLDB $12, RED2, RED1, RED1
   829		VSLDB $12, T2, RED2, RED2
   830	
   831		VACCQ RED1, ADD1H, CAR1
   832		VAQ   RED1, ADD1H, T0
   833		VACCQ RED1H, T0, CAR1M
   834		VAQ   RED1H, T0, T0
   835	
   836		// << ready for next MK0
   837	
   838		VACQ   RED2, ADD2H, CAR1, T1
   839		VACCCQ RED2, ADD2H, CAR1, CAR1
   840		VACCCQ RED2H, T1, CAR1M, T2
   841		VACQ   RED2H, T1, CAR1M, T1
   842		VAQ    CAR1, T2, T2
   843	
   844		// ---------------------------------------------------
   845	
   846		VZERO   RED1
   847		VSCBIQ  M0, T0, CAR1
   848		VSQ     M0, T0, ADD1
   849		VSBCBIQ T1, M1, CAR1, CAR1M
   850		VSBIQ   T1, M1, CAR1, ADD2
   851		VSBIQ   T2, RED1, CAR1M, T2
   852	
   853		// what output to use, ADD2||ADD1 or T1||T0?
   854		VSEL T0, ADD1, T2, T0
   855		VSEL T1, ADD2, T2, T1
   856	
   857		VST T0, (1*16)(res_ptr)
   858		VST T1, (0*16)(res_ptr)
   859		RET
   860	
   861	#undef res_ptr
   862	#undef x_ptr
   863	#undef y_ptr
   864	#undef X0
   865	#undef X1
   866	#undef Y0
   867	#undef Y1
   868	#undef M0
   869	#undef M1
   870	#undef T0
   871	#undef T1
   872	#undef T2
   873	#undef YDIG
   874	
   875	#undef ADD1
   876	#undef ADD1H
   877	#undef ADD2
   878	#undef ADD2H
   879	#undef RED1
   880	#undef RED1H
   881	#undef RED2
   882	#undef RED2H
   883	#undef CAR1
   884	#undef CAR1M
   885	
   886	#undef MK0
   887	#undef K0
   888	
   889	// ---------------------------------------
   890	// p256MulInternalVX
   891	// V0-V3,V30,V31 - Not Modified
   892	// V4-V15 - Volatile
   893	
   894	#define CPOOL   R4
   895	
   896	// Parameters
   897	#define X0    V0 // Not modified
   898	#define X1    V1 // Not modified
   899	#define Y0    V2 // Not modified
   900	#define Y1    V3 // Not modified
   901	#define T0    V4
   902	#define T1    V5
   903	#define P0    V30 // Not modified
   904	#define P1    V31 // Not modified
   905	
   906	// Temporaries
   907	#define YDIG  V6 // Overloaded with CAR2, ZER
   908	#define ADD1H V7 // Overloaded with ADD3H
   909	#define ADD2H V8 // Overloaded with ADD4H
   910	#define ADD3  V9 // Overloaded with SEL2,SEL5
   911	#define ADD4  V10 // Overloaded with SEL3,SEL6
   912	#define RED1  V11 // Overloaded with CAR2
   913	#define RED2  V12
   914	#define RED3  V13 // Overloaded with SEL1
   915	#define T2    V14
   916	// Overloaded temporaries
   917	#define ADD1  V4 // Overloaded with T0
   918	#define ADD2  V5 // Overloaded with T1
   919	#define ADD3H V7 // Overloaded with ADD1H
   920	#define ADD4H V8 // Overloaded with ADD2H
   921	#define ZER   V6 // Overloaded with YDIG, CAR2
   922	#define CAR1  V6 // Overloaded with YDIG, ZER
   923	#define CAR2  V11 // Overloaded with RED1
   924	// Constant Selects
   925	#define SEL1  V13 // Overloaded with RED3
   926	#define SEL2  V9 // Overloaded with ADD3,SEL5
   927	#define SEL3  V10 // Overloaded with ADD4,SEL6
   928	#define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
   929	#define SEL5  V9 // Overloaded with ADD3,SEL2
   930	#define SEL6  V10 // Overloaded with ADD4,SEL3
   931	
   932	/* *
   933	 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   934	 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   935	 * With you, SIMD be...
   936	 *
   937	 *                                           +--------+--------+
   938	 *                                  +--------|  RED2  |  RED1  |
   939	 *                                  |        +--------+--------+
   940	 *                                  |       ---+--------+--------+
   941	 *                                  |  +---- T2|   T1   |   T0   |--+
   942	 *                                  |  |    ---+--------+--------+  |
   943	 *                                  |  |                            |
   944	 *                                  |  |    ======================= |
   945	 *                                  |  |                            |
   946	 *                                  |  |       +--------+--------+<-+
   947	 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   948	 *                                  |  |       +--------+--------+  |     |
   949	 *                                  |  |     +--------+--------+<---+     |
   950	 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   951	 *                                  |  |     +--------+--------+  |       |
   952	 *                                  |  |     +--------+--------+<-+       |
   953	 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   954	 *                                  |  |     +--------+--------+  | |     |
   955	 *                                  |  |   +--------+--------+<---+ |     |
   956	 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   957	 *                                  |  |   +--------+--------+      | |   V
   958	 *                                  |  | ------------------------   | | +--------+
   959	 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   960	 *                                  |  |                            | | +--------+
   961	 *                                  |  +---->+--------+--------+    | |   |
   962	 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   963	 *                                  |        +--------+--------+    | |   |
   964	 *                                  +---->---+--------+--------+    | |   |
   965	 *                                         T2|   T1   |   T0   |----+ |   |
   966	 *                                        ---+--------+--------+    | |   |
   967	 *                                        ---+--------+--------+<---+ |   |
   968	 *                                    +--- T2|   T1   |   T0   |----------+
   969	 *                                    |   ---+--------+--------+      |   |
   970	 *                                    |  +--------+--------+<-------------+
   971	 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   972	 *                                    |  +--------+--------+     |    |   |
   973	 *                                    |  +--------+<----------------------+
   974	 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   975	 *                                    |  +--------+              |    |
   976	 *                                    +--->+--------+--------+   |    |
   977	 *                                         |   T1   |   T0   |--------+
   978	 *                                         +--------+--------+   |    |
   979	 *                                   --------------------------- |    |
   980	 *                                                               |    |
   981	 *                                       +--------+--------+<----+    |
   982	 *                                       |  RED2  |  RED1  |          |
   983	 *                                       +--------+--------+          |
   984	 *                                      ---+--------+--------+<-------+
   985	 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   986	 *                                      ---+--------+--------+
   987	 *
   988	 *                                                                *My 21st century work of art @vpaprots
   989	 *
   990	 *
   991	 * First group is special, doesn't get the two inputs:
   992	 *                                             +--------+--------+<-+
   993	 *                                     +-------|  ADD2  |  ADD1  |--|-----+
   994	 *                                     |       +--------+--------+  |     |
   995	 *                                     |     +--------+--------+<---+     |
   996	 *                                     |     | ADD2H  | ADD1H  |--+       |
   997	 *                                     |     +--------+--------+  |       |
   998	 *                                     |     +--------+--------+<-+       |
   999	 *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1000	 *                                     |     +--------+--------+  | |     |
  1001	 *                                     |   +--------+--------+<---+ |     |
  1002	 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1003	 *                                     |   +--------+--------+      | |   V
  1004	 *                                     | ------------------------   | | +--------+
  1005	 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1006	 *                                     |                            | | +--------+
  1007	 *                                     +---->+--------+--------+    | |   |
  1008	 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1009	 *                                           +--------+--------+    | |   |
  1010	 *                                        ---+--------+--------+<---+ |   |
  1011	 *                                    +--- T2|   T1   |   T0   |----------+
  1012	 *                                    |   ---+--------+--------+      |   |
  1013	 *                                    |  +--------+--------+<-------------+
  1014	 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1015	 *                                    |  +--------+--------+     |    |   |
  1016	 *                                    |  +--------+<----------------------+
  1017	 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1018	 *                                    |  +--------+              |    |
  1019	 *                                    +--->+--------+--------+   |    |
  1020	 *                                         |   T1   |   T0   |--------+
  1021	 *                                         +--------+--------+   |    |
  1022	 *                                   --------------------------- |    |
  1023	 *                                                               |    |
  1024	 *                                       +--------+--------+<----+    |
  1025	 *                                       |  RED2  |  RED1  |          |
  1026	 *                                       +--------+--------+          |
  1027	 *                                      ---+--------+--------+<-------+
  1028	 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1029	 *                                      ---+--------+--------+
  1030	 *
  1031	 * The last 'group' needs RED2||RED1 to be shifted less
  1032	 */
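// Rough summary of the routine below (commentary, not a line-by-line
// description): this is Montgomery multiplication with R = 2^256.  Each of
// the four groups multiplies X by two 32-bit digits of Y (VREPF) and folds
// 64 bits of the accumulator back down using multiples of P assembled purely
// with VPERM byte-selects, which is possible because the low 64 bits of P are
// all ones (-P^-1 ≡ 1 mod 2^64).  A final conditional subtraction of P leaves
// the reduced result in T1||T0.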
  1033	TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0
  1034		VL 32(CPOOL), SEL1
  1035		VL 48(CPOOL), SEL2
  1036		VL 64(CPOOL), SEL3
  1037		VL 80(CPOOL), SEL4
  1038	
  1039		// ---------------------------------------------------
  1040	
  1041		VREPF $3, Y0, YDIG
  1042		VMLHF X0, YDIG, ADD1H
  1043		VMLHF X1, YDIG, ADD2H
  1044		VMLF  X0, YDIG, ADD1
  1045		VMLF  X1, YDIG, ADD2
  1046	
  1047		VREPF  $2, Y0, YDIG
  1048		VMALF  X0, YDIG, ADD1H, ADD3
  1049		VMALF  X1, YDIG, ADD2H, ADD4
  1050		VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1051		VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1052	
  1053		VZERO ZER
  1054		VL    32(CPOOL), SEL1
  1055		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1056	
  1057		VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1058		VSLDB $12, ZER, ADD2, T1  // ADD2 Free
  1059	
  1060		VACCQ  T0, ADD3, CAR1
  1061		VAQ    T0, ADD3, T0       // ADD3 Free
  1062		VACCCQ T1, ADD4, CAR1, T2
  1063		VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
  1064	
  1065		VL    48(CPOOL), SEL2
  1066		VL    64(CPOOL), SEL3
  1067		VL    80(CPOOL), SEL4
  1068		VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1069		VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1070		VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1071		VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1072	
  1073		VSLDB $12, T1, T0, T0
  1074		VSLDB $12, T2, T1, T1
  1075	
  1076		VACCQ  T0, ADD3H, CAR1
  1077		VAQ    T0, ADD3H, T0
  1078		VACCCQ T1, ADD4H, CAR1, T2
  1079		VACQ   T1, ADD4H, CAR1, T1
  1080	
  1081		// ---------------------------------------------------
  1082	
  1083		VREPF  $1, Y0, YDIG
  1084		VMALHF X0, YDIG, T0, ADD1H
  1085		VMALHF X1, YDIG, T1, ADD2H
  1086		VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
  1087		VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
  1088	
  1089		VREPF  $0, Y0, YDIG
  1090		VMALF  X0, YDIG, ADD1H, ADD3
  1091		VMALF  X1, YDIG, ADD2H, ADD4
  1092		VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
  1093		VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
  1094	
  1095		VZERO ZER
  1096		VL    32(CPOOL), SEL1
  1097		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1098	
  1099		VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
  1100		VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1101	
  1102		VACCQ  T0, RED1, CAR1
  1103		VAQ    T0, RED1, T0
  1104		VACCCQ T1, RED2, CAR1, T2
  1105		VACQ   T1, RED2, CAR1, T1
  1106	
  1107		VACCQ  T0, ADD3, CAR1
  1108		VAQ    T0, ADD3, T0
  1109		VACCCQ T1, ADD4, CAR1, CAR2
  1110		VACQ   T1, ADD4, CAR1, T1
  1111		VAQ    T2, CAR2, T2
  1112	
  1113		VL    48(CPOOL), SEL2
  1114		VL    64(CPOOL), SEL3
  1115		VL    80(CPOOL), SEL4
  1116		VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1117		VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1118		VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1119		VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1120	
  1121		VSLDB $12, T1, T0, T0
  1122		VSLDB $12, T2, T1, T1
  1123	
  1124		VACCQ  T0, ADD3H, CAR1
  1125		VAQ    T0, ADD3H, T0
  1126		VACCCQ T1, ADD4H, CAR1, T2
  1127		VACQ   T1, ADD4H, CAR1, T1
  1128	
  1129		// ---------------------------------------------------
  1130	
  1131		VREPF  $3, Y1, YDIG
  1132		VMALHF X0, YDIG, T0, ADD1H
  1133		VMALHF X1, YDIG, T1, ADD2H
  1134		VMALF  X0, YDIG, T0, ADD1
  1135		VMALF  X1, YDIG, T1, ADD2
  1136	
  1137		VREPF  $2, Y1, YDIG
  1138		VMALF  X0, YDIG, ADD1H, ADD3
  1139		VMALF  X1, YDIG, ADD2H, ADD4
  1140		VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1141		VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1142	
  1143		VZERO ZER
  1144		VL    32(CPOOL), SEL1
  1145		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1146	
  1147		VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1148		VSLDB $12, T2, ADD2, T1   // ADD2 Free
  1149	
  1150		VACCQ  T0, RED1, CAR1
  1151		VAQ    T0, RED1, T0
  1152		VACCCQ T1, RED2, CAR1, T2
  1153		VACQ   T1, RED2, CAR1, T1
  1154	
  1155		VACCQ  T0, ADD3, CAR1
  1156		VAQ    T0, ADD3, T0
  1157		VACCCQ T1, ADD4, CAR1, CAR2
  1158		VACQ   T1, ADD4, CAR1, T1
  1159		VAQ    T2, CAR2, T2
  1160	
  1161		VL    48(CPOOL), SEL2
  1162		VL    64(CPOOL), SEL3
  1163		VL    80(CPOOL), SEL4
  1164		VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1165		VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1166		VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1167		VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1168	
  1169		VSLDB $12, T1, T0, T0
  1170		VSLDB $12, T2, T1, T1
  1171	
  1172		VACCQ  T0, ADD3H, CAR1
  1173		VAQ    T0, ADD3H, T0
  1174		VACCCQ T1, ADD4H, CAR1, T2
  1175		VACQ   T1, ADD4H, CAR1, T1
  1176	
  1177		// ---------------------------------------------------
  1178	
  1179		VREPF  $1, Y1, YDIG
  1180		VMALHF X0, YDIG, T0, ADD1H
  1181		VMALHF X1, YDIG, T1, ADD2H
  1182		VMALF  X0, YDIG, T0, ADD1
  1183		VMALF  X1, YDIG, T1, ADD2
  1184	
  1185		VREPF  $0, Y1, YDIG
  1186		VMALF  X0, YDIG, ADD1H, ADD3
  1187		VMALF  X1, YDIG, ADD2H, ADD4
  1188		VMALHF X0, YDIG, ADD1H, ADD3H
  1189		VMALHF X1, YDIG, ADD2H, ADD4H
  1190	
  1191		VZERO ZER
  1192		VL    32(CPOOL), SEL1
  1193		VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1194	
  1195		VSLDB $12, ADD2, ADD1, T0
  1196		VSLDB $12, T2, ADD2, T1
  1197	
  1198		VACCQ  T0, RED1, CAR1
  1199		VAQ    T0, RED1, T0
  1200		VACCCQ T1, RED2, CAR1, T2
  1201		VACQ   T1, RED2, CAR1, T1
  1202	
  1203		VACCQ  T0, ADD3, CAR1
  1204		VAQ    T0, ADD3, T0
  1205		VACCCQ T1, ADD4, CAR1, CAR2
  1206		VACQ   T1, ADD4, CAR1, T1
  1207		VAQ    T2, CAR2, T2
  1208	
  1209		VL    96(CPOOL), SEL5
  1210		VL    112(CPOOL), SEL6
  1211		VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
  1212		VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
  1213		VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
  1214	
  1215		VSLDB $12, T1, T0, T0
  1216		VSLDB $12, T2, T1, T1
  1217	
  1218		VACCQ  T0, ADD3H, CAR1
  1219		VAQ    T0, ADD3H, T0
  1220		VACCCQ T1, ADD4H, CAR1, T2
  1221		VACQ   T1, ADD4H, CAR1, T1
  1222	
  1223		VACCQ  T0, RED1, CAR1
  1224		VAQ    T0, RED1, T0
  1225		VACCCQ T1, RED2, CAR1, CAR2
  1226		VACQ   T1, RED2, CAR1, T1
  1227		VAQ    T2, CAR2, T2
  1228	
  1229		// ---------------------------------------------------
  1230	
  1231		VZERO   RED3
  1232		VSCBIQ  P0, T0, CAR1
  1233		VSQ     P0, T0, ADD1H
  1234		VSBCBIQ T1, P1, CAR1, CAR2
  1235		VSBIQ   T1, P1, CAR1, ADD2H
  1236		VSBIQ   T2, RED3, CAR2, T2
  1237	
  1238		// what output to use, ADD2H||ADD1H or T1||T0?
  1239		VSEL T0, ADD1H, T2, T0
  1240		VSEL T1, ADD2H, T2, T1
  1241		RET
  1242	
  1243	#undef CPOOL
  1244	
  1245	#undef X0
  1246	#undef X1
  1247	#undef Y0
  1248	#undef Y1
  1249	#undef T0
  1250	#undef T1
  1251	#undef P0
  1252	#undef P1
  1253	
  1254	#undef SEL1
  1255	#undef SEL2
  1256	#undef SEL3
  1257	#undef SEL4
  1258	#undef SEL5
  1259	#undef SEL6
  1260	
  1261	#undef YDIG
  1262	#undef ADD1H
  1263	#undef ADD2H
  1264	#undef ADD3
  1265	#undef ADD4
  1266	#undef RED1
  1267	#undef RED2
  1268	#undef RED3
  1269	#undef T2
  1270	#undef ADD1
  1271	#undef ADD2
  1272	#undef ADD3H
  1273	#undef ADD4H
  1274	#undef ZER
  1275	#undef CAR1
  1276	#undef CAR2
  1277	
  1278	// ---------------------------------------
  1279	// p256MulInternalVMSL
  1280	// V0-V3,V30,V31 - Not Modified
  1281	// V4-V14 - Volatile
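// Rough idea of the VMSL path (commentary, not a line-by-line description):
// VMSL multiplies pairs of limbs and accumulates 128-bit products, so both
// operands are split into five limbs (four 7-byte limbs plus one 4-byte limb,
// 4*56 + 32 = 256 bits) and the schoolbook product is collected into nine
// columns, which are then recombined and reduced modulo P in three steps
// (96 + 96 + 64 bits, see the reduction comments below).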
  1282	
  1283	#define CPOOL   R4
  1284	#define SCRATCH R9
  1285	
  1286	// Parameters
  1287	#define X0    V0 // Not modified
  1288	#define X1    V1 // Not modified
  1289	#define Y0    V2 // Not modified
  1290	#define Y1    V3 // Not modified
  1291	#define T0    V4
  1292	#define T1    V5
  1293	#define T2    V6
  1294	#define P0    V30 // Not modified
  1295	#define P1    V31 // Not modified
  1296	
  1297	// input: d0
  1298	// output: h0, h1
  1299	// temp: TEMP, ZERO, BORROW
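// Commentary: the reductions below lean on the sparse form of the prime,
// P = 2^256 - 2^224 + 2^192 + 2^96 - 1, which gives
//	2^256 ≡ 2^224 - 2^192 - 2^96 + 1 (mod P)
// so the multiples of P needed while folding the high limbs down can be
// assembled from byte-shifted copies of a single limb.  The VSLDB/VSQ/VAQ
// sequences in OBSERVATION3 and OBSERVATION3A appear to package exactly that
// shift-and-add pattern, avoiding any extra multiply instructions.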
  1300	#define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \
  1301		VZERO ZERO                   \
  1302		VSLDB $4, d0, ZERO, h0       \
  1303		VLR   h0, BORROW             \
  1304		VSLDB $12, ZERO, h0, TEMP    \
  1305		VSQ   TEMP, h0, h0           \
  1306		VSLDB $12, d0, BORROW, h1    \
  1307		VSLDB $8, ZERO, BORROW, TEMP \
  1308		VAQ   TEMP, h0, h0           \
  1309		VAQ   TEMP, h0, h0           \

  1310	#define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \
  1311		VZERO ZERO                \
  1312		VSLDB $8, d2, ZERO, TEMP  \
  1313		VSLDB $8, d2, TEMP, h0    \
  1314		VSLDB $12, ZERO, TEMP, h1 \
  1315		VSQ   h1, h0, h0          \
  1316	
  1317	TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
  1318		VSTM V16, V19, (SCRATCH)
  1319	
  1320		MOVD $p256vmsl<>+0x00(SB), CPOOL
  1321	
  1322		// Divide the inputs into limbs (four 7-byte limbs and one 4-byte limb each)
  1323		VGBM  $0x007f, V14
  1324		VZERO V12
  1325		VSLDB $2, X1, X0, V13
  1326		VSLDB $2, Y1, Y0, V8
  1327		VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
  1328		VSLDB $4, V12, Y1, V6  // V6: 4 bytes limb
  1329	
  1330		VN V14, X0, V5   // V5: first 7 bytes limb
  1331		VN V14, Y0, V10  // V10: first 7 bytes limb
  1332		VN V14, V13, V13 // v13: third 7 bytes limb
  1333		VN V14, V8, V8   // V8: third 7 bytes limb
  1334	
  1335		VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1)
  1336		VMSLG V8, V5, V12, V8   // v8: l8 x l5
  1337		VMSLG V6, V13, V12, V13 // v13: l6 x l3
  1338		VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9)
  1339		VMSLG V6, V5, V12, V6   // v6: l6 x l5
  1340	
  1341		MOVD $p256vmsl<>+0x00(SB), CPOOL
  1342		VGBM $0x7f7f, V14
  1343	
  1344		VL 0(CPOOL), V4
  1345		VL 16(CPOOL), V7
  1346		VL 32(CPOOL), V9
  1347		VL 48(CPOOL), V5
  1348		VLM 64(CPOOL), V16, V19
  1349	
  1350		VPERM V12, X0, V4, V4   // v4: limb4 | limb5
  1351		VPERM Y1, Y0, V7, V7
  1352		VPERM V12, Y0, V9, V9   // v9: limb10 | limb9
  1353		VPERM X1, X0, V5, V5
  1354		VPERM X1, X0, V16, V16
  1355		VPERM Y1, Y0, V17, V17
  1356		VPERM X1, V12, V18, V18 // v18: limb1 | limb2
  1357		VPERM Y1, V12, V19, V19 // v19: limb7 | limb6
  1358		VN    V14, V7, V7       // v7:  limb9 | limb8
  1359		VN    V14, V5, V5       // v5:  limb3 | limb4
  1360		VN    V14, V16, V16     // v16: limb2 | limb3
  1361		VN    V14, V17, V17     // v17: limb8 | limb7
  1362	
  1363		VMSLG V9, V4, V12, V14   // v14: l10 x l4 + l9 x l5 (column 2)
  1364		VMSLG V9, V5, V8, V8     // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3)
  1365		VMSLG V9, V16, V12, V16  // v16: l10 x l9 + l2 x l3
  1366		VMSLG V9, V18, V12, V9   // v9: l10 x l1 + l9 x l2
  1367		VMSLG V7, V18, V12, V7   // v7: l9 x l1 + l8 x l2
  1368		VMSLG V17, V4, V16, V16  // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4)
  1369		VMSLG V17, V5, V9, V9    // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4
  1370		VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2
  1371		VMSLG V19, V5, V7, V7    // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6)
  1372		VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8)
  1373		VAQ   V9, V6, V9         // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
  1374		VAQ   V17, V13, V13      // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
  1375	
  1376		VSLDB $9, V12, V10, V4
  1377		VSLDB $9, V12, V7, V5
  1378		VAQ   V4, V14, V14
  1379		VAQ   V5, V13, V13
  1380	
  1381		VSLDB $9, V12, V14, V4
  1382		VSLDB $9, V12, V13, V5
  1383		VAQ   V4, V8, V8
  1384		VAQ   V5, V19, V19
  1385	
  1386		VSLDB $9, V12, V8, V4
  1387		VSLDB $9, V12, V19, V5
  1388		VAQ   V4, V16, V16
  1389		VAQ   V5, V11, V11
  1390	
  1391		VSLDB $9, V12, V16, V4
  1392		VAQ   V4, V9, V17
  1393	
  1394		VGBM $0x007f, V4
  1395		VGBM $0x00ff, V5
  1396	
  1397		VN V10, V4, V10
  1398		VN V14, V4, V14
  1399		VN V8, V4, V8
  1400		VN V16, V4, V16
  1401		VN V17, V4, V9
  1402		VN V7, V4, V7
  1403		VN V13, V4, V13
  1404		VN V19, V4, V19
  1405		VN V11, V5, V11
  1406	
  1407		VSLDB $7, V14, V14, V14
  1408		VSLDB $14, V8, V12, V4
  1409		VSLDB $14, V12, V8, V8
  1410		VSLDB $5, V16, V16, V16
  1411		VSLDB $12, V9, V12, V5
  1412	
  1413		VO V14, V10, V10
  1414		VO V8, V16, V16
  1415		VO V4, V10, V10  // first rightmost 128bits of the multiplication result
  1416		VO V5, V16, V16  // second rightmost 128bits of the multiplication result
  1417	
  1418		// adjust v7, v13, v19, v11
  1419		VSLDB $7, V13, V13, V13
  1420		VSLDB $14, V19, V12, V4
  1421		VSLDB $14, V12, V19, V19
  1422		VSLDB $5, V11, V12, V5
  1423		VO    V13, V7, V7
  1424		VO    V4, V7, V7
  1425		VO    V19, V5, V11
  1426	
  1427		VSLDB $9, V12, V17, V14
  1428		VSLDB $12, V12, V9, V9
  1429		VACCQ V7, V14, V13
  1430		VAQ   V7, V14, V7
  1431		VAQ   V11, V13, V11
  1432	
  1433		// First reduction, 96 bits
  1434		VSLDB $4, V16, V10, T0
  1435		VSLDB $4, V12, V16, T1
  1436		VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
  1437		VSLDB $3, V7, V12, V7
  1438		OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2
  1439		VO    V7, V9, V7       // third rightmost 128bits of the multiplication result
  1440		VACCQ T0, T2, V9
  1441		VAQ   T0, T2, T2
  1442		VACQ  T1, V8, V9, V8
  1443	
  1444		// Second reduction 96 bits
  1445		VSLDB $4, V8, T2, T0
  1446		VSLDB $4, V12, V8, T1
  1447		OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8
  1448		VACCQ T0, V8, T2
  1449		VAQ   T0, V8, V8
  1450		VACQ  T1, V9, T2, V9
  1451	
  1452		// Third reduction 64 bits
  1453		VSLDB  $8, V9, V8, T0
  1454		VSLDB  $8, V12, V9, T1
  1455		OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
  1456		VACCQ  T0, V13, V12
  1457		VAQ    T0, V13, V13
  1458		VACQ   T1, V14, V12, V14
  1459		VACCQ  V13, V7, V12
  1460		VAQ    V13, V7, T0
  1461		VACCCQ V14, V11, V12, T2
  1462		VACQ   V14, V11, V12, T1 // results T2 | T1 | T0
  1463	
  1464		// ---------------------------------------------------
  1465		MOVD $p256mul<>+0x00(SB), CPOOL
  1466	
  1467		VZERO   V12
  1468		VSCBIQ  P0, T0, V8
  1469		VSQ     P0, T0, V7
  1470		VSBCBIQ T1, P1, V8, V10
  1471		VSBIQ   T1, P1, V8, V9
  1472		VSBIQ   T2, V12, V10, T2
  1473	
  1474		// what output to use, V9||V7 or T1||T0?
  1475		VSEL T0, V7, T2, T0
  1476		VSEL T1, V9, T2, T1
  1477	
  1478		VLM (SCRATCH), V16, V19
  1479	
  1480		RET
  1481	
  1482	// ---------------------------------------
  1483	// p256SqrInternalVMSL
  1484	// V0-V1,V30,V31 - Not Modified
  1485	// V4-V14 - Volatile
  1486	
  1487	TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
  1488		VSTM V16, V18, (SCRATCH)
  1489	
  1490		MOVD $p256vmsl<>+0x00(SB), CPOOL
  1491		// Divide input into limbs
  1492		VGBM  $0x007f, V14
  1493		VZERO V12
  1494		VSLDB $2, X1, X0, V13
  1495		VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
  1496	
  1497		VN V14, X0, V10  // V10: first 7 bytes limb
  1498		VN V14, V13, V13 // v13: third 7 bytes limb
  1499	
  1500		VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1)
  1501		VMSLG V13, V13, V12, V13 // v13: l8 x l3
  1502		VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9)
  1503	
  1504		MOVD $p256vmsl<>+0x00(SB), CPOOL
  1505		VGBM $0x7f7f, V14
  1506	
  1507		VL 0(CPOOL), V4
  1508		VL 16(CPOOL), V7
  1509		VL 32(CPOOL), V9
  1510		VL 48(CPOOL), V5
  1511		VLM 64(CPOOL), V16, V18
  1512		VL 112(CPOOL), V8
  1513	
  1514		VPERM V12, X0, V4, V4   // v4: limb4 | limb5
  1515		VPERM X1, X0, V7, V7
  1516		VPERM V12, X0, V9, V9   // v9: limb10 | limb9
  1517		VPERM X1, X0, V5, V5
  1518		VPERM X1, X0, V16, V16
  1519		VPERM X1, X0, V17, V17
  1520		VPERM X1, V12, V18, V18 // v18: limb1 | limb2
  1521		VPERM X1, V12, V8, V8   // v8:  limb7 | limb6
  1522		VN    V14, V7, V7       // v7:  limb9 | limb8
  1523		VN    V14, V5, V5       // v5:  limb3 | limb4
  1524		VN    V14, V16, V16     // v16: limb2 | limb3
  1525		VN    V14, V17, V17     // v17: limb8 | limb7
  1526	
  1527		VMSLEOG V9, V18, V13, V6   // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
  1528		VMSLG   V9, V4, V12, V14   // v14: l10 x l4 + l9 x l5 (column 2)
  1529		VMSLEOG V9, V16, V12, V16  // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4)
  1530		VMSLEOG V7, V18, V12, V7   // v7: l9 x l1 + l8 x l2 (column 6)
  1531		VMSLEG  V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
  1532		VMSLG   V8, V18, V12, V8   // v8: l7 x l1 + l6 x l2 (column 8)
  1533		VMSLEG  V9, V5, V12, V18   // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3)
  1534	
  1535		VSLDB $9, V12, V10, V4
  1536		VSLDB $9, V12, V7, V5
  1537		VAQ   V4, V14, V14
  1538		VAQ   V5, V13, V13
  1539	
  1540		VSLDB $9, V12, V14, V4
  1541		VSLDB $9, V12, V13, V5
  1542		VAQ   V4, V18, V18
  1543		VAQ   V5, V8, V8
  1544	
  1545		VSLDB $9, V12, V18, V4
  1546		VSLDB $9, V12, V8, V5
  1547		VAQ   V4, V16, V16
  1548		VAQ   V5, V11, V11
  1549	
  1550		VSLDB $9, V12, V16, V4
  1551		VAQ   V4, V6, V17
  1552	
  1553		VGBM $0x007f, V4
  1554		VGBM $0x00ff, V5
  1555	
  1556		VN V10, V4, V10
  1557		VN V14, V4, V14
  1558		VN V18, V4, V18
  1559		VN V16, V4, V16
  1560		VN V17, V4, V9
  1561		VN V7, V4, V7
  1562		VN V13, V4, V13
  1563		VN V8, V4, V8
  1564		VN V11, V5, V11
  1565	
  1566		VSLDB $7, V14, V14, V14
  1567		VSLDB $14, V18, V12, V4
  1568		VSLDB $14, V12, V18, V18
  1569		VSLDB $5, V16, V16, V16
  1570		VSLDB $12, V9, V12, V5
  1571	
  1572		VO V14, V10, V10
  1573		VO V18, V16, V16
  1574		VO V4, V10, V10  // first rightmost 128bits of the multiplication result
  1575		VO V5, V16, V16  // second rightmost 128bits of the multiplication result
  1576	
  1577		// adjust v7, v13, v8, v11
  1578		VSLDB $7, V13, V13, V13
  1579		VSLDB $14, V8, V12, V4
  1580		VSLDB $14, V12, V8, V8
  1581		VSLDB $5, V11, V12, V5
  1582		VO    V13, V7, V7
  1583		VO    V4, V7, V7
  1584		VO    V8, V5, V11
  1585	
  1586		VSLDB $9, V12, V17, V14
  1587		VSLDB $12, V12, V9, V9
  1588		VACCQ V7, V14, V13
  1589		VAQ   V7, V14, V7
  1590		VAQ   V11, V13, V11
  1591	
  1592		// First reduction, 96 bits
  1593		VSLDB $4, V16, V10, T0
  1594		VSLDB $4, V12, V16, T1
  1595		VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
  1596		VSLDB $3, V7, V12, V7
  1597		OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2
  1598		VO    V7, V9, V7       // third rightmost 128bits of the multiplication result
  1599		VACCQ T0, T2, V9
  1600		VAQ   T0, T2, T2
  1601		VACQ  T1, V8, V9, V8
  1602	
  1603		// Second reduction 96 bits
  1604		VSLDB $4, V8, T2, T0
  1605		VSLDB $4, V12, V8, T1
  1606		OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8
  1607		VACCQ T0, V8, T2
  1608		VAQ   T0, V8, V8
  1609		VACQ  T1, V9, T2, V9
  1610	
  1611		// Third reduction 64 bits
  1612		VSLDB  $8, V9, V8, T0
  1613		VSLDB  $8, V12, V9, T1
  1614		OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
  1615		VACCQ  T0, V13, V12
  1616		VAQ    T0, V13, V13
  1617		VACQ   T1, V14, V12, V14
  1618		VACCQ  V13, V7, V12
  1619		VAQ    V13, V7, T0
  1620		VACCCQ V14, V11, V12, T2
  1621		VACQ   V14, V11, V12, T1 // results T2 | T1 | T0
  1622	
  1623		// ---------------------------------------------------
  1624		MOVD $p256mul<>+0x00(SB), CPOOL
  1625	
  1626		VZERO   V12
  1627		VSCBIQ  P0, T0, V8
  1628		VSQ     P0, T0, V7
  1629		VSBCBIQ T1, P1, V8, V10
  1630		VSBIQ   T1, P1, V8, V9
  1631		VSBIQ   T2, V12, V10, T2
  1632	
  1633		// what output to use, V9||V7 or T1||T0?
  1634		VSEL T0, V7, T2, T0
  1635		VSEL T1, V9, T2, T1
  1636	
  1637		VLM (SCRATCH), V16, V18
  1638		RET
  1639	
  1640	
  1641	
  1642	#undef CPOOL
  1643	#undef SCRATCH
  1644	#undef X0
  1645	#undef X1
  1646	#undef Y0
  1647	#undef Y1
  1648	#undef T0
  1649	#undef T1
  1650	#undef T2
  1651	#undef P0
  1652	#undef P1
  1653	
  1654	#define SCRATCH R9
  1655	
  1656	TEXT p256MulInternal<>(SB), NOSPLIT, $64-0
  1657		MOVD    $scratch-64(SP), SCRATCH
  1658		MOVD    ·p256MulInternalFacility+0x00(SB), R7
  1659		CALL    (R7)
  1660		RET
  1661	
  1662	TEXT ·p256MulInternalTrampolineSetup(SB), NOSPLIT|NOFRAME, $0
  1663		MOVBZ  internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
  1664		MOVD    $·p256MulInternalFacility+0x00(SB), R7
  1665		MOVD    $·p256MulInternalVX(SB), R8
  1666		CMPBEQ  R0, $0, novmsl      // no VE1 facility, so no VMSL; keep the VX implementation
  1667		MOVD    $·p256MulInternalVMSL(SB), R8
  1668	novmsl:
  1669		MOVD    R8, 0(R7)
  1670		BR      (R8)
  1671	
  1672	GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8
  1673	DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB)
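// Commentary on the dispatch above: p256MulInternalFacility starts out
// pointing at the trampoline.  On the first call the trampoline reads the
// internal/cpu flag for the vector-enhancements facility (VE1), stores the
// address of either the VMSL or the VX implementation back into the facility
// word, and branches to it, so later calls through p256MulInternal<> jump
// straight to the selected routine.  Roughly, in Go terms (illustrative only):
//
//	var impl = trampolineSetup
//	func trampolineSetup() { if hasVE1 { impl = vmsl } else { impl = vx }; impl() }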
  1674	
  1675	// Parameters
  1676	#define X0    V0
  1677	#define X1    V1
  1678	#define Y0    V2
  1679	#define Y1    V3
  1680	
  1681	TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0
  1682		VLR X0, Y0
  1683		VLR X1, Y1
  1684		BR  ·p256MulInternalVX(SB)
  1685	
  1686	#undef X0
  1687	#undef X1
  1688	#undef Y0
  1689	#undef Y1
  1690	
  1691	
  1692	TEXT p256SqrInternal<>(SB), NOSPLIT, $48-0
  1693		MOVD    $scratch-48(SP), SCRATCH
  1694		MOVD    ·p256SqrInternalFacility+0x00(SB), R7
  1695		CALL    (R7)
  1696		RET
  1697	
  1698	TEXT ·p256SqrInternalTrampolineSetup(SB), NOSPLIT|NOFRAME, $0
  1699		MOVBZ  internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
  1700		MOVD    $·p256SqrInternalFacility+0x00(SB), R7
  1701		MOVD    $·p256SqrInternalVX(SB), R8
  1702		CMPBEQ  R0, $0, novmsl      // no VE1 facility, so no VMSL; keep the VX implementation
  1703		MOVD    $·p256SqrInternalVMSL(SB), R8
  1704	novmsl:
  1705		MOVD    R8, 0(R7)
  1706		BR      (R8)
  1707	
  1708	
  1709	GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8
  1710	DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB)
  1711	
  1712	#undef SCRATCH
  1713	
  1714	
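// The next three macros implement field arithmetic on 256-bit values held as
// register pairs (high||low); they expect the caller's temporary and constant
// aliases (ZER, CAR1/CAR2, SEL1, T2, TT0/TT1, PL/PH) to be defined.
// Conceptually (illustrative pseudocode, not the actual code):
//
//	sub:  t = x - y;  if borrow       { t += P }   // p256SubInternal
//	add:  t = x + y;  if t >= P       { t -= P }   // p256AddInternal
//	half: if x is odd { x += P };  t = x >> 1      // p256HalfInternal
//
// Each is branch-free: the condition is materialized as an all-ones/all-zeros
// mask and the result is chosen with VSEL.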
  1715	#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1716		VZERO   ZER                \
  1717		VSCBIQ  Y0, X0, CAR1       \
  1718		VSQ     Y0, X0, T0         \
  1719		VSBCBIQ X1, Y1, CAR1, SEL1 \
  1720		VSBIQ   X1, Y1, CAR1, T1   \
  1721		VSQ     SEL1, ZER, SEL1    \
  1722		                           \
  1723		VACCQ   T0, PL, CAR1       \
  1724		VAQ     T0, PL, TT0        \
  1725		VACQ    T1, PH, CAR1, TT1  \
  1726		                           \
  1727		VSEL    T0, TT0, SEL1, T0  \
  1728		VSEL    T1, TT1, SEL1, T1  \
  1729	
  1730	#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
  1731		VACCQ   X0, Y0, CAR1        \
  1732		VAQ     X0, Y0, T0          \
  1733		VACCCQ  X1, Y1, CAR1, T2    \
  1734		VACQ    X1, Y1, CAR1, T1    \
  1735		                            \
  1736		VZERO   ZER                 \
  1737		VSCBIQ  PL, T0, CAR1        \
  1738		VSQ     PL, T0, TT0         \
  1739		VSBCBIQ T1, PH, CAR1, CAR2  \
  1740		VSBIQ   T1, PH, CAR1, TT1   \
  1741		VSBIQ   T2, ZER, CAR2, SEL1 \
  1742		                            \
  1743		VSEL    T0, TT0, SEL1, T0   \
  1744		VSEL    T1, TT1, SEL1, T1
  1745	
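// p256HalfInternal computes T = X/2 mod P256. A mask derived from the low bit
// of X selects either X or X+P (P is odd, so X+P is even when X is odd), with
// the 257th bit of the sum kept in T2; the 257-bit value is then shifted right
// by one, VSLDB/VSL carrying the bit that crosses each 128-bit half. Scalar
// sketch (illustrative only):
//
//	if x&1 == 1 { x += P } // now even; the 257th bit is retained
//	x >>= 1                // exact halving mod P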
  1746	#define p256HalfInternal(T1, T0, X1, X0) \
  1747		VZERO  ZER                \
  1748		VSBIQ  ZER, ZER, X0, SEL1 \
  1749		                          \
  1750		VACCQ  X0, PL, CAR1       \
  1751		VAQ    X0, PL, T0         \
  1752		VACCCQ X1, PH, CAR1, T2   \
  1753		VACQ   X1, PH, CAR1, T1   \
  1754		                          \
  1755		VSEL   X0, T0, SEL1, T0   \
  1756		VSEL   X1, T1, SEL1, T1   \
  1757		VSEL   ZER, T2, SEL1, T2  \
  1758		                          \
  1759		VSLDB  $15, T2, ZER, TT1  \
  1760		VSLDB  $15, T1, ZER, TT0  \
  1761		VREPIB $1, SEL1           \
  1762		VSRL   SEL1, T0, T0       \
  1763		VSRL   SEL1, T1, T1       \
  1764		VREPIB $7, SEL1           \
  1765		VSL    SEL1, TT0, TT0     \
  1766		VSL    SEL1, TT1, TT1     \
  1767		VO     T0, TT0, T0        \
  1768		VO     T1, TT1, T1
  1769	
  1770	// ---------------------------------------
  1771	// func p256MulAsm(res, in1, in2 []byte)
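// p256MulAsm computes the Montgomery product with R = 2^256:
// res = in1 * in2 * R⁻¹ mod P256, where inputs and output are 32-byte
// big-endian values. It loads the operands and the modulus from the constant
// pool and delegates the work to p256MulInternal.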
  1772	#define res_ptr R1
  1773	#define x_ptr   R2
  1774	#define y_ptr   R3
  1775	#define CPOOL   R4
  1776	
  1777	// Parameters
  1778	#define X0    V0
  1779	#define X1    V1
  1780	#define Y0    V2
  1781	#define Y1    V3
  1782	#define T0    V4
  1783	#define T1    V5
  1784	
  1785	// Constants
  1786	#define P0    V30
  1787	#define P1    V31
  1788	TEXT ·p256MulAsm(SB), NOSPLIT, $0
  1789		MOVD res+0(FP), res_ptr
  1790		MOVD in1+24(FP), x_ptr
  1791		MOVD in2+48(FP), y_ptr
  1792	
  1793		VL (1*16)(x_ptr), X0
  1794		VL (0*16)(x_ptr), X1
  1795		VL (1*16)(y_ptr), Y0
  1796		VL (0*16)(y_ptr), Y1
  1797	
  1798		MOVD $p256mul<>+0x00(SB), CPOOL
  1799		VL   16(CPOOL), P0
  1800		VL   0(CPOOL), P1
  1801	
  1802		CALL p256MulInternal<>(SB)
  1803	
  1804		VST T0, (1*16)(res_ptr)
  1805		VST T1, (0*16)(res_ptr)
  1806		RET
  1807	
  1808	#undef res_ptr
  1809	#undef x_ptr
  1810	#undef y_ptr
  1811	#undef CPOOL
  1812	
  1813	#undef X0
  1814	#undef X1
  1815	#undef Y0
  1816	#undef Y1
  1817	#undef T0
  1818	#undef T1
  1819	#undef P0
  1820	#undef P1
  1821	
  1822	// ---------------------------------------
  1823	// func p256SqrAsm(res, in1 []byte)
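// p256SqrAsm is the squaring counterpart of p256MulAsm:
// res = in1 * in1 * R⁻¹ mod P256. It dispatches through p256SqrInternal; on
// the plain-vector path p256SqrInternalVX above simply copies X into Y and
// branches into the multiplication code, while the VMSL path dispatches to
// ·p256SqrInternalVMSL.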
  1824	#define res_ptr R1
  1825	#define x_ptr   R2
  1826	#define y_ptr   R3
  1827	#define CPOOL   R4
  1828	
  1829	// Parameters
  1830	#define X0    V0
  1831	#define X1    V1
  1832	#define T0    V4
  1833	#define T1    V5
  1834	
  1835	// Constants
  1836	#define P0    V30
  1837	#define P1    V31
  1838	TEXT ·p256SqrAsm(SB), NOSPLIT, $0
  1839		MOVD res+0(FP), res_ptr
  1840		MOVD in1+24(FP), x_ptr
  1841	
  1842		VL (1*16)(x_ptr), X0
  1843		VL (0*16)(x_ptr), X1
  1844	
  1845		MOVD $p256mul<>+0x00(SB), CPOOL
  1846		VL   16(CPOOL), P0
  1847		VL   0(CPOOL), P1
  1848	
  1849		CALL p256SqrInternal<>(SB)
  1850	
  1851		VST T0, (1*16)(res_ptr)
  1852		VST T1, (0*16)(res_ptr)
  1853		RET
  1854	
  1855	#undef res_ptr
  1856	#undef x_ptr
  1857	#undef y_ptr
  1858	#undef CPOOL
  1859	
  1860	#undef X0
  1861	#undef X1
  1862	#undef T0
  1863	#undef T1
  1864	#undef P0
  1865	#undef P1
  1866	
  1867	
  1868	// Point add with P2 being affine point
  1869	// If sign == 1 -> P2 = -P2
  1870	// If sel == 0 -> P3 = P1
  1871	// If zero == 0 -> P3 = P2
  1872	// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
  1873	#define P3ptr   R1
  1874	#define P1ptr   R2
  1875	#define P2ptr   R3
  1876	#define CPOOL   R4
  1877	
  1878	// Temporaries in REGs
  1879	#define Y2L    V15
  1880	#define Y2H    V16
  1881	#define T1L    V17
  1882	#define T1H    V18
  1883	#define T2L    V19
  1884	#define T2H    V20
  1885	#define T3L    V21
  1886	#define T3H    V22
  1887	#define T4L    V23
  1888	#define T4H    V24
  1889	
  1890	// Temps for Sub and Add
  1891	#define TT0  V11
  1892	#define TT1  V12
  1893	#define T2   V13
  1894	
  1895	// p256MulAsm Parameters
  1896	#define X0    V0
  1897	#define X1    V1
  1898	#define Y0    V2
  1899	#define Y1    V3
  1900	#define T0    V4
  1901	#define T1    V5
  1902	
  1903	#define PL    V30
  1904	#define PH    V31
  1905	
  1906	// Names for zero/sel selects
  1907	#define X1L    V0
  1908	#define X1H    V1
  1909	#define Y1L    V2 // p256MulAsmParmY
  1910	#define Y1H    V3 // p256MulAsmParmY
  1911	#define Z1L    V4
  1912	#define Z1H    V5
  1913	#define X2L    V0
  1914	#define X2H    V1
  1915	#define Z2L    V4
  1916	#define Z2H    V5
  1917	#define X3L    V17 // T1L
  1918	#define X3H    V18 // T1H
  1919	#define Y3L    V21 // T3L
  1920	#define Y3H    V22 // T3H
  1921	#define Z3L    V28
  1922	#define Z3H    V29
  1923	
  1924	#define ZER   V6
  1925	#define SEL1  V7
  1926	#define CAR1  V8
  1927	#define CAR2  V9
  1928	/* *
  1929	 * Three operand formula:
  1930	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1931	 * T1 = Z1²
  1932	 * T2 = T1*Z1
  1933	 * T1 = T1*X2
  1934	 * T2 = T2*Y2
  1935	 * T1 = T1-X1
  1936	 * T2 = T2-Y1
  1937	 * Z3 = Z1*T1
  1938	 * T3 = T1²
  1939	 * T4 = T3*T1
  1940	 * T3 = T3*X1
  1941	 * T1 = 2*T3
  1942	 * X3 = T2²
  1943	 * X3 = X3-T1
  1944	 * X3 = X3-T4
  1945	 * T3 = T3-X3
  1946	 * T3 = T3*T2
  1947	 * T4 = T4*Y1
  1948	 * Y3 = T3-T4
  1949	
  1950	 * Three operand formulas, but with MulInternal X,Y used to store temps
  1951	X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1952	X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1953	X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1954	X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1955	SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1956	SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1957	X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1958	X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1959	X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1960	X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1961	ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1962	X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1963	SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1964	SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1965	SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1966	X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1967	X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1968	SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1969	
  1970		*/
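// Notation used in the schedule above: X (X1:X0) and Y (Y1:Y0) are the
// p256MulInternal input register pairs and T (T1:T0) is its output. "X=Z1"
// means load Z1 into X, "X-"/"Y-" mean the pair already holds the right value
// from the previous step, "MUL" is a CALL to p256MulInternal (p256SqrInternal
// when X and Y would be equal), "T2=T"/"Z3:=T" copy the product out, and the
// trailing columns list which temporaries are still live.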
  1971	TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
  1972		MOVD P3+0(FP), P3ptr
  1973		MOVD P1+8(FP), P1ptr
  1974		MOVD P2+16(FP), P2ptr
  1975	
  1976		MOVD $p256mul<>+0x00(SB), CPOOL
  1977		VL   16(CPOOL), PL
  1978		VL   0(CPOOL), PH
  1979	
  1980		//	if (sign == 1) {
  1981		//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1982		//	}
  1983	
  1984		VL 32(P2ptr), Y2H
  1985		VL 48(P2ptr), Y2L
  1986	
  1987		VLREPG sign+24(FP), SEL1
  1988		VZERO  ZER
  1989		VCEQG  SEL1, ZER, SEL1
  1990	
  1991		VSCBIQ Y2L, PL, CAR1
  1992		VSQ    Y2L, PL, T1L
  1993		VSBIQ  PH, Y2H, CAR1, T1H
  1994	
  1995		VSEL Y2L, T1L, SEL1, Y2L
  1996		VSEL Y2H, T1H, SEL1, Y2H
  1997	
  1998	/* *
  1999	 * Three operand formula:
  2000	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  2001	 */
  2002		// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  2003		VL   64(P1ptr), X1       // Z1H
  2004		VL   80(P1ptr), X0       // Z1L
  2005		VLR  X0, Y0
  2006		VLR  X1, Y1
  2007		CALL p256SqrInternal<>(SB)
  2008	
  2009		// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  2010		VLR  T0, X0
  2011		VLR  T1, X1
  2012		CALL p256MulInternal<>(SB)
  2013		VLR  T0, T2L
  2014		VLR  T1, T2H
  2015	
  2016		// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  2017		VL   0(P2ptr), Y1        // X2H
  2018		VL   16(P2ptr), Y0       // X2L
  2019		CALL p256MulInternal<>(SB)
  2020		VLR  T0, T1L
  2021		VLR  T1, T1H
  2022	
  2023		// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  2024		VLR  T2L, X0
  2025		VLR  T2H, X1
  2026		VLR  Y2L, Y0
  2027		VLR  Y2H, Y1
  2028		CALL p256MulInternal<>(SB)
  2029	
  2030		// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  2031		VL 32(P1ptr), Y1H
  2032		VL 48(P1ptr), Y1L
  2033		p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  2034	
  2035		// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  2036		VL 0(P1ptr), X1H
  2037		VL 16(P1ptr), X1L
  2038		p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  2039	
  2040		// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  2041		VL   64(P1ptr), X1       // Z1H
  2042		VL   80(P1ptr), X0       // Z1L
  2043		CALL p256MulInternal<>(SB)
  2044	
  2045		// VST T1, 64(P3ptr)
  2046		// VST T0, 80(P3ptr)
  2047		VLR T0, Z3L
  2048		VLR T1, Z3H
  2049	
  2050		// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  2051		VLR  Y0, X0
  2052		VLR  Y1, X1
  2053		CALL p256SqrInternal<>(SB)
  2054		VLR  T0, X0
  2055		VLR  T1, X1
  2056	
  2057		// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  2058		CALL p256MulInternal<>(SB)
  2059		VLR  T0, T4L
  2060		VLR  T1, T4H
  2061	
  2062		// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  2063		VL   0(P1ptr), Y1        // X1H
  2064		VL   16(P1ptr), Y0       // X1L
  2065		CALL p256MulInternal<>(SB)
  2066		VLR  T0, T3L
  2067		VLR  T1, T3H
  2068	
  2069		// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  2070		p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  2071	
  2072		// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  2073		VLR  T2L, X0
  2074		VLR  T2H, X1
  2075		VLR  T2L, Y0
  2076		VLR  T2H, Y1
  2077		CALL p256SqrInternal<>(SB)
  2078	
  2079		// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  2080		p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  2081	
  2082		// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  2083		p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  2084		VLR T0, X3L
  2085		VLR T1, X3H
  2086	
  2087		// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  2088		p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  2089	
  2090		// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  2091		CALL p256MulInternal<>(SB)
  2092		VLR  T0, T3L
  2093		VLR  T1, T3H
  2094	
  2095		// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  2096		VLR  T4L, X0
  2097		VLR  T4H, X1
  2098		VL   32(P1ptr), Y1       // Y1H
  2099		VL   48(P1ptr), Y0       // Y1L
  2100		CALL p256MulInternal<>(SB)
  2101	
  2102		// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  2103		p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  2104	
  2105		//	if (sel == 0) {
  2106		//		copy(P3.x[:], X1)
  2107		//		copy(P3.y[:], Y1)
  2108		//		copy(P3.z[:], Z1)
  2109		//	}
  2110	
  2111		VL 0(P1ptr), X1H
  2112		VL 16(P1ptr), X1L
  2113	
  2114		// Y1 already loaded, left over from addition
  2115		VL 64(P1ptr), Z1H
  2116		VL 80(P1ptr), Z1L
  2117	
  2118		VLREPG sel+32(FP), SEL1
  2119		VZERO  ZER
  2120		VCEQG  SEL1, ZER, SEL1
  2121	
  2122		VSEL X1L, X3L, SEL1, X3L
  2123		VSEL X1H, X3H, SEL1, X3H
  2124		VSEL Y1L, Y3L, SEL1, Y3L
  2125		VSEL Y1H, Y3H, SEL1, Y3H
  2126		VSEL Z1L, Z3L, SEL1, Z3L
  2127		VSEL Z1H, Z3H, SEL1, Z3H
  2128	
  2129		//	if (zero == 0) {
  2130		//		copy(P3.x[:], X2)
  2131		//		copy(P3.y[:], Y2)
  2132		//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  2133		//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
  2134		//	}
  2135		VL 0(P2ptr), X2H
  2136		VL 16(P2ptr), X2L
  2137	
  2138		// Y2 already loaded
  2139		VL 128(CPOOL), Z2H
  2140		VL 144(CPOOL), Z2L
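	// 128(CPOOL) and 144(CPOOL) hold (1*2^256)%P256, i.e. 1 in Montgomery
	// form: the implicit Z coordinate of the affine point P2 (the byte string
	// shown in the comment above).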
  2141	
  2142		VLREPG zero+40(FP), SEL1
  2143		VZERO  ZER
  2144		VCEQG  SEL1, ZER, SEL1
  2145	
  2146		VSEL X2L, X3L, SEL1, X3L
  2147		VSEL X2H, X3H, SEL1, X3H
  2148		VSEL Y2L, Y3L, SEL1, Y3L
  2149		VSEL Y2H, Y3H, SEL1, Y3H
  2150		VSEL Z2L, Z3L, SEL1, Z3L
  2151		VSEL Z2H, Z3H, SEL1, Z3H
  2152	
  2153		// All done, store out the result!!!
  2154		VST X3H, 0(P3ptr)
  2155		VST X3L, 16(P3ptr)
  2156		VST Y3H, 32(P3ptr)
  2157		VST Y3L, 48(P3ptr)
  2158		VST Z3H, 64(P3ptr)
  2159		VST Z3L, 80(P3ptr)
  2160	
  2161		RET
  2162	
  2163	#undef P3ptr
  2164	#undef P1ptr
  2165	#undef P2ptr
  2166	#undef CPOOL
  2167	
  2168	#undef Y2L
  2169	#undef Y2H
  2170	#undef T1L
  2171	#undef T1H
  2172	#undef T2L
  2173	#undef T2H
  2174	#undef T3L
  2175	#undef T3H
  2176	#undef T4L
  2177	#undef T4H
  2178	
  2179	#undef TT0
  2180	#undef TT1
  2181	#undef T2
  2182	
  2183	#undef X0
  2184	#undef X1
  2185	#undef Y0
  2186	#undef Y1
  2187	#undef T0
  2188	#undef T1
  2189	
  2190	#undef PL
  2191	#undef PH
  2192	
  2193	#undef X1L
  2194	#undef X1H
  2195	#undef Y1L
  2196	#undef Y1H
  2197	#undef Z1L
  2198	#undef Z1H
  2199	#undef X2L
  2200	#undef X2H
  2201	#undef Z2L
  2202	#undef Z2H
  2203	#undef X3L
  2204	#undef X3H
  2205	#undef Y3L
  2206	#undef Y3H
  2207	#undef Z3L
  2208	#undef Z3H
  2209	
  2210	#undef ZER
  2211	#undef SEL1
  2212	#undef CAR1
  2213	#undef CAR2
  2214	
  2215	// p256PointDoubleAsm(P3, P1 *p256Point)
  2216	// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  2217	// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  2218	// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  2219	#define P3ptr   R1
  2220	#define P1ptr   R2
  2221	#define CPOOL   R4
  2222	
  2223	// Temporaries in REGs
  2224	#define X3L    V15
  2225	#define X3H    V16
  2226	#define Y3L    V17
  2227	#define Y3H    V18
  2228	#define T1L    V19
  2229	#define T1H    V20
  2230	#define T2L    V21
  2231	#define T2H    V22
  2232	#define T3L    V23
  2233	#define T3H    V24
  2234	
  2235	#define X1L    V6
  2236	#define X1H    V7
  2237	#define Y1L    V8
  2238	#define Y1H    V9
  2239	#define Z1L    V10
  2240	#define Z1H    V11
  2241	
  2242	// Temps for Sub and Add
  2243	#define TT0  V11
  2244	#define TT1  V12
  2245	#define T2   V13
  2246	
  2247	// p256MulAsm Parameters
  2248	#define X0    V0
  2249	#define X1    V1
  2250	#define Y0    V2
  2251	#define Y1    V3
  2252	#define T0    V4
  2253	#define T1    V5
  2254	
  2255	#define PL    V30
  2256	#define PH    V31
  2257	
  2258	#define Z3L    V23
  2259	#define Z3H    V24
  2260	
  2261	#define ZER   V26
  2262	#define SEL1  V27
  2263	#define CAR1  V28
  2264	#define CAR2  V29
  2265	/*
  2266	 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  2267	 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  2268	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  2269	 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  2270	 * 	B  = 2Y₁
  2271	 * 	Z₃ = B×Z₁
  2272	 * 	C  = B²
  2273	 * 	D  = C×X₁
  2274	 * 	X₃ = A²-2D
  2275	 * 	Y₃ = (D-X₃)×A-C²/2
  2276	 *
  2277	 * Three-operand formula:
  2278	 *       T1 = Z1²
  2279	 *       T2 = X1-T1
  2280	 *       T1 = X1+T1
  2281	 *       T2 = T2*T1
  2282	 *       T2 = 3*T2
  2283	 *       Y3 = 2*Y1
  2284	 *       Z3 = Y3*Z1
  2285	 *       Y3 = Y3²
  2286	 *       T3 = Y3*X1
  2287	 *       Y3 = Y3²
  2288	 *       Y3 = half*Y3
  2289	 *       X3 = T2²
  2290	 *       T1 = 2*T3
  2291	 *       X3 = X3-T1
  2292	 *       T1 = T3-X3
  2293	 *       T1 = T1*T2
  2294	 *       Y3 = T1-Y3
  2295	 */
  2296	
  2297	TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
  2298		MOVD P3+0(FP), P3ptr
  2299		MOVD P1+8(FP), P1ptr
  2300	
  2301		MOVD $p256mul<>+0x00(SB), CPOOL
  2302		VL   16(CPOOL), PL
  2303		VL   0(CPOOL), PH
  2304	
  2305		// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  2306		VL   64(P1ptr), X1       // Z1H
  2307		VL   80(P1ptr), X0       // Z1L
  2308		VLR  X0, Y0
  2309		VLR  X1, Y1
  2310		CALL p256SqrInternal<>(SB)
  2311	
  2312		// SUB(X<X1-T)            // T2 = X1-T1
  2313		VL 0(P1ptr), X1H
  2314		VL 16(P1ptr), X1L
  2315		p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  2316	
  2317		// ADD(Y<X1+T)            // T1 = X1+T1
  2318		p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  2319	
  2320		// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  2321		CALL p256MulInternal<>(SB)
  2322	
  2323		// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  2324		p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  2325		p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  2326	
  2327		// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  2328		VL 32(P1ptr), Y1H
  2329		VL 48(P1ptr), Y1L
  2330		p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  2331	
  2332		// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  2333		VL   64(P1ptr), Y1       // Z1H
  2334		VL   80(P1ptr), Y0       // Z1L
  2335		CALL p256MulInternal<>(SB)
  2336		VST  T1, 64(P3ptr)
  2337		VST  T0, 80(P3ptr)
  2338	
  2339		// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  2340		VLR  X0, Y0
  2341		VLR  X1, Y1
  2342		CALL p256SqrInternal<>(SB)
  2343	
  2344		// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  2345		VLR  T0, X0
  2346		VLR  T1, X1
  2347		VL   0(P1ptr), Y1
  2348		VL   16(P1ptr), Y0
  2349		CALL p256MulInternal<>(SB)
  2350		VLR  T0, T3L
  2351		VLR  T1, T3H
  2352	
  2353		// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  2354		VLR  X0, Y0
  2355		VLR  X1, Y1
  2356		CALL p256SqrInternal<>(SB)
  2357	
  2358		// HAL(Y3<T)              // Y3 = half*Y3
  2359		p256HalfInternal(Y3H,Y3L, T1,T0)
  2360	
  2361		// X=T2; Y=T2; MUL; T-    // X3 = T2²
  2362		VLR  T2L, X0
  2363		VLR  T2H, X1
  2364		VLR  T2L, Y0
  2365		VLR  T2H, Y1
  2366		CALL p256SqrInternal<>(SB)
  2367	
  2368		// ADD(T1<T3+T3)          // T1 = 2*T3
  2369		p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  2370	
  2371		// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  2372		p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  2373		VST X3H, 0(P3ptr)
  2374		VST X3L, 16(P3ptr)
  2375	
  2376		// SUB(X<T3-X3)           // T1 = T3-X3
  2377		p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  2378	
  2379		// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  2380		CALL p256MulInternal<>(SB)
  2381	
  2382		// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  2383		p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  2384	
  2385		VST Y3H, 32(P3ptr)
  2386		VST Y3L, 48(P3ptr)
  2387		RET
  2388	
  2389	#undef P3ptr
  2390	#undef P1ptr
  2391	#undef CPOOL
  2392	#undef X3L
  2393	#undef X3H
  2394	#undef Y3L
  2395	#undef Y3H
  2396	#undef T1L
  2397	#undef T1H
  2398	#undef T2L
  2399	#undef T2H
  2400	#undef T3L
  2401	#undef T3H
  2402	#undef X1L
  2403	#undef X1H
  2404	#undef Y1L
  2405	#undef Y1H
  2406	#undef Z1L
  2407	#undef Z1H
  2408	#undef TT0
  2409	#undef TT1
  2410	#undef T2
  2411	#undef X0
  2412	#undef X1
  2413	#undef Y0
  2414	#undef Y1
  2415	#undef T0
  2416	#undef T1
  2417	#undef PL
  2418	#undef PH
  2419	#undef Z3L
  2420	#undef Z3H
  2421	#undef ZER
  2422	#undef SEL1
  2423	#undef CAR1
  2424	#undef CAR2
  2425	
  2426	// p256PointAddAsm(P3, P1, P2 *p256Point) int
  2427	#define P3ptr  R1
  2428	#define P1ptr  R2
  2429	#define P2ptr  R3
  2430	#define CPOOL  R4
  2431	#define ISZERO R5
  2432	#define TRUE   R6
  2433	
  2434	// Temporaries in REGs
  2435	#define T1L   V16
  2436	#define T1H   V17
  2437	#define T2L   V18
  2438	#define T2H   V19
  2439	#define U1L   V20
  2440	#define U1H   V21
  2441	#define S1L   V22
  2442	#define S1H   V23
  2443	#define HL    V24
  2444	#define HH    V25
  2445	#define RL    V26
  2446	#define RH    V27
  2447	
  2448	// Temps for Sub and Add
  2449	#define ZER   V6
  2450	#define SEL1  V7
  2451	#define CAR1  V8
  2452	#define CAR2  V9
  2453	#define TT0  V11
  2454	#define TT1  V12
  2455	#define T2   V13
  2456	
  2457	// p256MulAsm Parameters
  2458	#define X0    V0
  2459	#define X1    V1
  2460	#define Y0    V2
  2461	#define Y1    V3
  2462	#define T0    V4
  2463	#define T1    V5
  2464	
  2465	#define PL    V30
  2466	#define PH    V31
  2467	/*
  2468	 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2469	 *
  2470	 * A = X₁×Z₂²
  2471	 * B = Y₁×Z₂³
  2472	 * C = X₂×Z₁²-A
  2473	 * D = Y₂×Z₁³-B
  2474	 * X₃ = D² - 2A×C² - C³
  2475	 * Y₃ = D×(A×C² - X₃) - B×C³
  2476	 * Z₃ = Z₁×Z₂×C
  2477	 *
  2478	 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2479	 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2480	 *
  2481	 * T1 = Z1*Z1
  2482	 * T2 = Z2*Z2
  2483	 * U1 = X1*T2
  2484	 * H  = X2*T1
  2485	 * H  = H-U1
  2486	 * Z3 = Z1*Z2
  2487	 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2488	 *
  2489	 * S1 = Z2*T2
  2490	 * S1 = Y1*S1
  2491	 * R  = Z1*T1
  2492	 * R  = Y2*R
  2493	 * R  = R-S1
  2494	 *
  2495	 * T1 = H*H
  2496	 * T2 = H*T1
  2497	 * U1 = U1*T1
  2498	 *
  2499	 * X3 = R*R
  2500	 * X3 = X3-T2
  2501	 * T1 = 2*U1
  2502	 * X3 = X3-T1 << store-out X3 result reg
  2503	 *
  2504	 * T2 = S1*T2
  2505	 * Y3 = U1-X3
  2506	 * Y3 = R*Y3
  2507	 * Y3 = Y3-T2 << store-out Y3 result reg
  2508	
  2509	 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2510		// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2511		// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2512		// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2513		// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2514		// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2515		// SUB(H<H-T)            // H  = H-U1
  2516		// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2517		// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2518		// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2519		// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2520		// SUB(R<T-S1)           // R  = R-S1
  2521		// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2522		// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2523		// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2524		// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2525		// SUB(T<T-T2)           // X3 = X3-T2
  2526		// ADD(X<U1+U1)          // T1 = 2*U1
  2527		// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2528		// SUB(Y<U1-T)           // Y3 = U1-X3
  2529		// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2530		// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2531		// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2532		*/
  2533	TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
  2534		MOVD P3+0(FP), P3ptr
  2535		MOVD P1+8(FP), P1ptr
  2536		MOVD P2+16(FP), P2ptr
  2537	
  2538		MOVD $p256mul<>+0x00(SB), CPOOL
  2539		VL   16(CPOOL), PL
  2540		VL   0(CPOOL), PH
  2541	
  2542		// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2543		VL   64(P1ptr), X1       // Z1H
  2544		VL   80(P1ptr), X0       // Z1L
  2545		VLR  X0, Y0
  2546		VLR  X1, Y1
  2547		CALL p256SqrInternal<>(SB)
  2548	
  2549		// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2550		VLR  T0, Y0
  2551		VLR  T1, Y1
  2552		CALL p256MulInternal<>(SB)
  2553		VLR  T0, RL
  2554		VLR  T1, RH
  2555	
  2556		// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2557		VL   0(P2ptr), X1        // X2H
  2558		VL   16(P2ptr), X0       // X2L
  2559		CALL p256MulInternal<>(SB)
  2560		VLR  T0, HL
  2561		VLR  T1, HH
  2562	
  2563		// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2564		VL   64(P2ptr), X1       // Z2H
  2565		VL   80(P2ptr), X0       // Z2L
  2566		VLR  X0, Y0
  2567		VLR  X1, Y1
  2568		CALL p256SqrInternal<>(SB)
  2569	
  2570		// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2571		VLR  T0, Y0
  2572		VLR  T1, Y1
  2573		CALL p256MulInternal<>(SB)
  2574		VLR  T0, S1L
  2575		VLR  T1, S1H
  2576	
  2577		// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2578		VL   0(P1ptr), X1        // X1H
  2579		VL   16(P1ptr), X0       // X1L
  2580		CALL p256MulInternal<>(SB)
  2581		VLR  T0, U1L
  2582		VLR  T1, U1H
  2583	
  2584		// SUB(H<H-T)            // H  = H-U1
  2585		p256SubInternal(HH,HL,HH,HL,T1,T0)
  2586	
  2587		// if H == 0 or H^P == 0 then ret=1 else ret=0
  2588		// clobbers T1H and T1L
  2589		MOVD   $0, ISZERO
  2590		MOVD   $1, TRUE
  2591		VZERO  ZER
  2592		VO     HL, HH, T1H
  2593		VCEQGS ZER, T1H, T1H
  2594		MOVDEQ TRUE, ISZERO
  2595		VX     HL, PL, T1L
  2596		VX     HH, PH, T1H
  2597		VO     T1L, T1H, T1H
  2598		VCEQGS ZER, T1H, T1H
  2599		MOVDEQ TRUE, ISZERO
  2600		MOVD   ISZERO, ret+24(FP)
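	// The sequence above sets ret without branching on secret data: VO/VCEQGS
	// tests H == 0, and the VX against PL/PH followed by VO/VCEQGS tests
	// H == P256, so ret is 1 exactly when H reduces to zero mod P256 (the two
	// inputs share the same affine x coordinate).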
  2601	
  2602		// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2603		VL   64(P1ptr), X1       // Z1H
  2604		VL   80(P1ptr), X0       // Z1L
  2605		VL   64(P2ptr), Y1       // Z2H
  2606		VL   80(P2ptr), Y0       // Z2L
  2607		CALL p256MulInternal<>(SB)
  2608	
  2609		// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2610		VLR  T0, X0
  2611		VLR  T1, X1
  2612		VLR  HL, Y0
  2613		VLR  HH, Y1
  2614		CALL p256MulInternal<>(SB)
  2615		VST  T1, 64(P3ptr)
  2616		VST  T0, 80(P3ptr)
  2617	
  2618		// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2619		VL   32(P1ptr), X1
  2620		VL   48(P1ptr), X0
  2621		VLR  S1L, Y0
  2622		VLR  S1H, Y1
  2623		CALL p256MulInternal<>(SB)
  2624		VLR  T0, S1L
  2625		VLR  T1, S1H
  2626	
  2627		// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2628		VL   32(P2ptr), X1
  2629		VL   48(P2ptr), X0
  2630		VLR  RL, Y0
  2631		VLR  RH, Y1
  2632		CALL p256MulInternal<>(SB)
  2633	
  2634		// SUB(R<T-S1)           // R  = T-S1
  2635		p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2636	
  2637		// if R == 0 or R^P == 0 then ret=ret else ret=0
  2638		// clobbers T1H and T1L
  2639		MOVD   $0, ISZERO
  2640		MOVD   $1, TRUE
  2641		VZERO  ZER
  2642		VO     RL, RH, T1H
  2643		VCEQGS ZER, T1H, T1H
  2644		MOVDEQ TRUE, ISZERO
  2645		VX     RL, PL, T1L
  2646		VX     RH, PH, T1H
  2647		VO     T1L, T1H, T1H
  2648		VCEQGS ZER, T1H, T1H
  2649		MOVDEQ TRUE, ISZERO
  2650		AND    ret+24(FP), ISZERO
  2651		MOVD   ISZERO, ret+24(FP)
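	// After the AND, ret is 1 only if both H and R reduce to zero mod P256,
	// i.e. P1 and P2 have the same affine x and y. The addition formula is
	// degenerate in that case, and the caller is expected to fall back to
	// point doubling instead of using this result.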
  2652	
  2653		// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2654		VLR  HL, X0
  2655		VLR  HH, X1
  2656		VLR  HL, Y0
  2657		VLR  HH, Y1
  2658		CALL p256SqrInternal<>(SB)
  2659	
  2660		// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2661		VLR  T0, Y0
  2662		VLR  T1, Y1
  2663		CALL p256MulInternal<>(SB)
  2664		VLR  T0, T2L
  2665		VLR  T1, T2H
  2666	
  2667		// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2668		VLR  U1L, X0
  2669		VLR  U1H, X1
  2670		CALL p256MulInternal<>(SB)
  2671		VLR  T0, U1L
  2672		VLR  T1, U1H
  2673	
  2674		// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2675		VLR  RL, X0
  2676		VLR  RH, X1
  2677		VLR  RL, Y0
  2678		VLR  RH, Y1
  2679		CALL p256SqrInternal<>(SB)
  2680	
  2681		// SUB(T<T-T2)           // X3 = X3-T2
  2682		p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2683	
  2684		// ADD(X<U1+U1)          // T1 = 2*U1
  2685		p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2686	
  2687		// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2688		p256SubInternal(T1,T0,T1,T0,X1,X0)
  2689		VST T1, 0(P3ptr)
  2690		VST T0, 16(P3ptr)
  2691	
  2692		// SUB(Y<U1-T)           // Y3 = U1-X3
  2693		p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2694	
  2695		// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2696		VLR  RL, X0
  2697		VLR  RH, X1
  2698		CALL p256MulInternal<>(SB)
  2699		VLR  T0, U1L
  2700		VLR  T1, U1H
  2701	
  2702		// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2703		VLR  S1L, X0
  2704		VLR  S1H, X1
  2705		VLR  T2L, Y0
  2706		VLR  T2H, Y1
  2707		CALL p256MulInternal<>(SB)
  2708	
  2709		// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2710		p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  2711		VST T1, 32(P3ptr)
  2712		VST T0, 48(P3ptr)
  2713	
  2714		RET
