...

Text file src/crypto/sha256/sha256block_amd64.s

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	// SHA256 block routine. See sha256block.go for Go equivalent.
     8	//
     9	// The algorithm is detailed in FIPS 180-4:
    10	//
    11	//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12	
    13	// The AVX2 version is described in an Intel white paper:
    14	// "Fast SHA-256 Implementations on Intel Architecture Processors"
    15	// To find it, surf to http://www.intel.com/p/en_US/embedded
    16	// and search for that title.
    17	// AVX2 version by Intel, same algorithm as code in Linux kernel:
    18	// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
    19	// by
    20	//     James Guilford <james.guilford@intel.com>
    21	//     Kirk Yap <kirk.s.yap@intel.com>
    22	//     Tim Chen <tim.c.chen@linux.intel.com>
    23	
    24	// Wt = Mt; for 0 <= t <= 15
    25	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    26	//
    27	// a = H0
    28	// b = H1
    29	// c = H2
    30	// d = H3
    31	// e = H4
    32	// f = H5
    33	// g = H6
    34	// h = H7
    35	//
    36	// for t = 0 to 63 {
    37	//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    38	//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    39	//    h = g
    40	//    g = f
    41	//    f = e
    42	//    e = d + T1
    43	//    d = c
    44	//    c = b
    45	//    b = a
    46	//    a = T1 + T2
    47	// }
    48	//
    49	// H0 = a + H0
    50	// H1 = b + H1
    51	// H2 = c + H2
    52	// H3 = d + H3
    53	// H4 = e + H4
    54	// H5 = f + H5
    55	// H6 = g + H6
    56	// H7 = h + H7
    57	
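// For reference, a rough Go sketch of one round of the loop above (illustrative
// only; the portable implementation lives in sha256block.go, and the names _K,
// w and t below exist only for this comment):
//
//	t1 := h + (bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)) +
//		((e & f) ^ (^e & g)) + _K[t] + w[t]
//	t2 := (bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)) +
//		((a & b) ^ (a & c) ^ (b & c))
//	h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
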
    58	// Wt = Mt; for 0 <= t <= 15
    59	#define MSGSCHEDULE0(index) \
    60		MOVL	(index*4)(SI), AX; \
    61		BSWAPL	AX; \
    62		MOVL	AX, (index*4)(BP)
    63	
    64	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    65	//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
    66	//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
    67	#define MSGSCHEDULE1(index) \
    68		MOVL	((index-2)*4)(BP), AX; \
    69		MOVL	AX, CX; \
    70		RORL	$17, AX; \
    71		MOVL	CX, DX; \
    72		RORL	$19, CX; \
    73		SHRL	$10, DX; \
    74		MOVL	((index-15)*4)(BP), BX; \
    75		XORL	CX, AX; \
    76		MOVL	BX, CX; \
    77		XORL	DX, AX; \
    78		RORL	$7, BX; \
    79		MOVL	CX, DX; \
    80		SHRL	$3, DX; \
    81		RORL	$18, CX; \
    82		ADDL	((index-7)*4)(BP), AX; \
    83		XORL	CX, BX; \
    84		XORL	DX, BX; \
    85		ADDL	((index-16)*4)(BP), BX; \
    86		ADDL	BX, AX; \
    87		MOVL	AX, ((index)*4)(BP)
    88	
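// In Go terms, the two small sigma functions computed above are (illustrative
// sketch using math/bits; x stands for the schedule word being transformed):
//
//	sigma0 := bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
//	sigma1 := bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
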
    89	// Calculate T1 in AX - uses AX, CX and DX registers.
    90	// h is also used as an accumulator. Wt is passed in AX.
    91	//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    92	//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
    93	//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    94	#define SHA256T1(const, e, f, g, h) \
    95		ADDL	AX, h; \
    96		MOVL	e, AX; \
    97		ADDL	$const, h; \
    98		MOVL	e, CX; \
    99		RORL	$6, AX; \
   100		MOVL	e, DX; \
   101		RORL	$11, CX; \
   102		XORL	CX, AX; \
   103		MOVL	e, CX; \
   104		RORL	$25, DX; \
   105		ANDL	f, CX; \
   106		XORL	AX, DX; \
   107		MOVL	e, AX; \
   108		NOTL	AX; \
   109		ADDL	DX, h; \
   110		ANDL	g, AX; \
   111		XORL	CX, AX; \
   112		ADDL	h, AX
   113	
   114	// Calculate T2 in BX - uses BX, CX, DX and DI registers.
   115	//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   116	//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
   117	//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   118	#define SHA256T2(a, b, c) \
   119		MOVL	a, DI; \
   120		MOVL	c, BX; \
   121		RORL	$2, DI; \
   122		MOVL	a, DX; \
   123		ANDL	b, BX; \
   124		RORL	$13, DX; \
   125		MOVL	a, CX; \
   126		ANDL	c, CX; \
   127		XORL	DX, DI; \
   128		XORL	CX, BX; \
   129		MOVL	a, DX; \
   130		MOVL	b, CX; \
   131		RORL	$22, DX; \
   132		ANDL	a, CX; \
   133		XORL	CX, BX; \
   134		XORL	DX, DI; \
   135		ADDL	DI, BX
   136	
   137	// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   138	// The values for e and a are stored in d and h, ready for rotation.
   139	#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
   140		SHA256T1(const, e, f, g, h); \
   141		SHA256T2(a, b, c); \
   142		MOVL	BX, h; \
   143		ADDL	AX, d; \
   144		ADDL	AX, h
   145	
   146	#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
   147		MSGSCHEDULE0(index); \
   148		SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   149	
   150	#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
   151		MSGSCHEDULE1(index); \
   152		SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
   153	
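// SHA256ROUND0 is used for the first 16 rounds, where Wt comes straight from the
// byte-swapped message block; SHA256ROUND1 is used for rounds 16-63, where Wt is
// derived from earlier schedule entries.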
   154	
   155	// Definitions for AVX2 version
   156	
   157	// addm (mem), reg
   158	// Add reg to mem, then copy the sum back into reg
   159	#define addm(P1, P2) \
   160		ADDL P2, P1; \
   161		MOVL P1, P2
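
// For example, addm(0(CTX), a) adds register a into the 32-bit word at 0(CTX)
// and then reloads the sum into a, i.e. H0 += a; a = H0.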
   162	
   163	#define XDWORD0 Y4
   164	#define XDWORD1 Y5
   165	#define XDWORD2 Y6
   166	#define XDWORD3 Y7
   167	
   168	#define XWORD0 X4
   169	#define XWORD1 X5
   170	#define XWORD2 X6
   171	#define XWORD3 X7
   172	
   173	#define XTMP0 Y0
   174	#define XTMP1 Y1
   175	#define XTMP2 Y2
   176	#define XTMP3 Y3
   177	#define XTMP4 Y8
   178	#define XTMP5 Y11
   179	
   180	#define XFER  Y9
   181	
   182	#define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
   183	#define X_BYTE_FLIP_MASK X13
   184	
   185	#define NUM_BYTES DX
   186	#define INP	DI
   187	
   188	#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
   189	
   190	#define a AX
   191	#define b BX
   192	#define c CX
   193	#define d R8
   194	#define e DX
   195	#define f R9
   196	#define g R10
   197	#define h R11
   198	
   199	#define old_h R11
   200	
   201	#define TBL BP
   202	
   203	#define SRND SI // SRND is the same register as CTX
   204	
   205	#define T1 R12
   206	
   207	#define y0 R13
   208	#define y1 R14
   209	#define y2 R15
   210	#define y3 DI
   211	
   212	// Offsets
   213	#define XFER_SIZE 2*64*4
   214	#define INP_END_SIZE 8
   215	#define INP_SIZE 8
   216	
   217	#define _XFER 0
   218	#define _INP_END _XFER + XFER_SIZE
   219	#define _INP _INP_END + INP_END_SIZE
   220	#define STACK_SIZE _INP + INP_SIZE
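
// The stack frame thus starts with the XFER area (2*64*4 = 512 bytes, holding
// the K+W values for two interleaved blocks), followed by an 8-byte slot for
// the pointer to the last input block and an 8-byte slot for the current
// input pointer.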
   221	
   222	#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   223		;                                     \ // #############################  RND N + 0 ############################//
   224		MOVL     a, y3;                       \ // y3 = a					// MAJA
   225		RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
   226		RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
   227		;                                     \
   228		ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
   229		ORL      c, y3;                       \ // y3 = a|c				// MAJA
   230		VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
   231		MOVL     f, y2;                       \ // y2 = f				// CH
   232		RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
   233		;                                     \
   234		XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
   235		XORL     g, y2;                       \ // y2 = f^g                              	// CH
   236		VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]
   237		RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   238		;                                     \
   239		ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
   240		XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   241		RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
   242		ADDL     h, d;                        \ // d = k + w + h + d                     	// --
   243		;                                     \
   244		ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
   245		VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
   246		XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   247		RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   248		;                                     \
   249		XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   250		VPSRLD   $7, XTMP1, XTMP2;            \
   251		XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   252		MOVL     a, T1;                       \ // T1 = a								// MAJB
   253		ANDL     c, T1;                       \ // T1 = a&c								// MAJB
   254		;                                     \
   255		ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
   256		VPSLLD   $(32-7), XTMP1, XTMP3;       \
   257		ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   258		ADDL     y1, h;                       \ // h = k + w + h + S0					// --
   259		;                                     \
   260		ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   261		VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
   262		;                                     \
   263		VPSRLD   $18, XTMP1, XTMP2;           \
   264		ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   265		ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --
   266	
   267	#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   268		;                                    \ // ################################### RND N + 1 ############################
   269		;                                    \
   270		MOVL    a, y3;                       \ // y3 = a                       // MAJA
   271		RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   272		RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   273		ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
   274		ORL     c, y3;                       \ // y3 = a|c						// MAJA
   275		;                                    \
   276		VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
   277		MOVL    f, y2;                       \ // y2 = f						// CH
   278		RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   279		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   280		XORL    g, y2;                       \ // y2 = f^g						// CH
   281		;                                    \
   282		RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   283		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   284		RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   285		ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   286		ADDL    h, d;                        \ // d = k + w + h + d				// --
   287		;                                    \
   288		VPSLLD  $(32-18), XTMP1, XTMP1;      \
   289		ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   290		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   291		;                                    \
   292		VPXOR   XTMP1, XTMP3, XTMP3;         \
   293		RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   294		XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
   295		;                                    \
   296		VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
   297		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   298		MOVL    a, T1;                       \ // T1 = a						// MAJB
   299		ANDL    c, T1;                       \ // T1 = a&c						// MAJB
   300		ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   301		;                                    \
   302		VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
   303		VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
   304		ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   305		ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   306		;                                    \
   307		VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
   308		ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   309		ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   310		ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
   311		;                                    \
   312		VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}
   313	
   314	#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   315		;                                    \ // ################################### RND N + 2 ############################
   316		;                                    \
   317		MOVL    a, y3;                       \ // y3 = a							// MAJA
   318		RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
   319		ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
   320		;                                    \
   321		VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
   322		RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
   323		ORL     c, y3;                       \ // y3 = a|c                         // MAJA
   324		MOVL    f, y2;                       \ // y2 = f                           // CH
   325		XORL    g, y2;                       \ // y2 = f^g                         // CH
   326		;                                    \
   327		RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
   328		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
   329		VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
   330		ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
   331		;                                    \
   332		RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
   333		VPXOR   XTMP3, XTMP2, XTMP2;         \
   334		ADDL    h, d;                        \ // d = k + w + h + d				// --
   335		ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
   336		;                                    \
   337		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   338		RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
   339		VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
   340		XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   341		;                                    \
   342		VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
   343		;                                    \
   344		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   345		RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   346		VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
   347		;                                    \
   348		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   349		MOVL    a, T1;                       \ // T1 = a                                // MAJB
   350		ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
   351		ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
   352		VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
   353		;                                    \
   354		ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   355		ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
   356		ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   357		ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   358		;                                    \
   359		ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --
   360	
   361	#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
   362		;                                    \ // ################################### RND N + 3 ############################
   363		;                                    \
   364		MOVL    a, y3;                       \ // y3 = a						// MAJA
   365		RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
   366		RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
   367		ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
   368		ORL     c, y3;                       \ // y3 = a|c                     // MAJA
   369		;                                    \
   370		VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
   371		MOVL    f, y2;                       \ // y2 = f						// CH
   372		RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
   373		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
   374		XORL    g, y2;                       \ // y2 = f^g						// CH
   375		;                                    \
   376		VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
   377		RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
   378		ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
   379		ADDL    h, d;                        \ // d = k + w + h + d			// --
   380		ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
   381		;                                    \
   382		VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
   383		XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   384		XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   385		;                                    \
   386		VPXOR   XTMP3, XTMP2, XTMP2;         \
   387		RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
   388		ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
   389		;                                    \
   390		VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
   391		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
   392		ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
   393		;                                    \
   394		RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
   395		;                                    \
   396		VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
   397		;                                    \
   398		VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
   399		XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   400		MOVL    a, T1;                       \ // T1 = a							// MAJB
   401		ANDL    c, T1;                       \ // T1 = a&c							// MAJB
   402		ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
   403		;                                    \
   404		ADDL    y1, h;                       \ // h = k + w + h + S0				// --
   405		ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   406		ADDL    y3, h                        // h = t1 + S0 + MAJ				// --
   407	
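// Each ROUND_AND_SCHED_N_* macro above performs one round of the compression
// function on the scalar state while also computing, in the vector registers,
// part of the message schedule for four future words (in both 128-bit lanes,
// i.e. for both interleaved blocks). The DO_ROUND_N_* macros below perform a
// round without any scheduling work.
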
   408	#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
   409		;                                  \ // ################################### RND N + 0 ###########################
   410		MOVL  f, y2;                       \ // y2 = f					// CH
   411		RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   412		RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   413		XORL  g, y2;                       \ // y2 = f^g					// CH
   414		;                                  \
   415		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
   416		RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
   417		ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
   418		;                                  \
   419		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
   420		RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
   421		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
   422		RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
   423		MOVL  a, y3;                       \ // y3 = a							// MAJA
   424		;                                  \
   425		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
   426		RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
   427		ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   428		ORL   c, y3;                       \ // y3 = a|c							// MAJA
   429		;                                  \
   430		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
   431		MOVL  a, T1;                       \ // T1 = a							// MAJB
   432		ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
   433		ANDL  c, T1;                       \ // T1 = a&c							// MAJB
   434		ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
   435		;                                  \
   436		ADDL  h, d;                        \ // d = k + w + h + d					// --
   437		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   438		ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   439		ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --
   440	
   441	#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
   442		;                                  \ // ################################### RND N + 1 ###########################
   443		ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
   444		MOVL  f, y2;                       \ // y2 = f                                // CH
   445		RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
   446		RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
   447		XORL  g, y2;                       \ // y2 = f^g                             // CH
   448		;                                  \
   449		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   450		RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   451		ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
   452		ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
   453		;                                  \
   454		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   455		RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   456		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   457		RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   458		MOVL  a, y3;                       \ // y3 = a                               // MAJA
   459		;                                  \
   460		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   461		RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   462		ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
   463		ORL   c, y3;                       \ // y3 = a|c                             // MAJA
   464		;                                  \
   465		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   466		MOVL  a, T1;                       \ // T1 = a                               // MAJB
   467		ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
   468		ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
   469		ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
   470		;                                  \
   471		ADDL  h, d;                        \ // d = k + w + h + d                    // --
   472		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
   473		ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
   474		;                                  \
   475		ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   476	
   477	#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
   478		;                                  \ // ################################### RND N + 2 ##############################
   479		ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   480		MOVL  f, y2;                       \ // y2 = f								// CH
   481		RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   482		RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   483		XORL  g, y2;                       \ // y2 = f^g								// CH
   484		;                                  \
   485		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   486		RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   487		ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   488		ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   489		;                                  \
   490		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   491		RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   492		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
   493		RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   494		MOVL  a, y3;                       \ // y3 = a								// MAJA
   495		;                                  \
   496		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   497		RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   498		ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   499		ORL   c, y3;                       \ // y3 = a|c								// MAJA
   500		;                                  \
   501		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   502		MOVL  a, T1;                       \ // T1 = a								// MAJB
   503		ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   504		ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   505		ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   506		;                                  \
   507		ADDL  h, d;                        \ // d = k + w + h + d					// --
   508		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   509		ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   510		;                                  \
   511		ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --
   512	
   513	#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
   514		;                                  \ // ################################### RND N + 3 ###########################
   515		ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   516		MOVL  f, y2;                       \ // y2 = f								// CH
   517		RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
   518		RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
   519		XORL  g, y2;                       \ // y2 = f^g								// CH
   520		;                                  \
   521		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
   522		RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
   523		ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
   524		ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
   525		;                                  \
   526		XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
   527		RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
   528		XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
   529		RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
   530		MOVL  a, y3;                       \ // y3 = a								// MAJA
   531		;                                  \
   532		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
   533		RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
   534		ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
   535		ORL   c, y3;                       \ // y3 = a|c								// MAJA
   536		;                                  \
   537		XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
   538		MOVL  a, T1;                       \ // T1 = a								// MAJB
   539		ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
   540		ANDL  c, T1;                       \ // T1 = a&c								// MAJB
   541		ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
   542		;                                  \
   543		ADDL  h, d;                        \ // d = k + w + h + d					// --
   544		ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
   545		ADDL  y1, h;                       \ // h = k + w + h + S0					// --
   546		;                                  \
   547		ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
   548		;                                  \
   549		ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
   550		;                                  \
   551		ADDL  y3, h                        // h = t1 + S0 + MAJ					// --
   552	
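// Note the deferred update in the DO_ROUND_N_* macros: DO_ROUND_N_0, _1 and _2
// leave the S1+CH term in y2 and the MAJ term in y3, and the next macro in the
// sequence folds them into the previous round's h (passed in as old_h);
// DO_ROUND_N_3 completes its own h before the group ends.
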
   553	TEXT ·block(SB), 0, $536-32
   554		CMPB ·useAVX2(SB), $1
   555		JE   avx2
   556	
   557	MOVQ p_base+8(FP), SI // SI = &p[0]
   558	MOVQ p_len+16(FP), DX // DX = len(p)
   559	SHRQ $6, DX           // round the length down to
   560	SHLQ $6, DX           // a multiple of the 64-byte block size
   561	
   562	LEAQ (SI)(DX*1), DI   // DI = end of the last full block
   563	MOVQ DI, 256(SP)      // save the end pointer for the loop test
   564	CMPQ SI, DI
   565	JEQ  end              // no full block to process
   566	
   567		MOVQ dig+0(FP), BP
   568		MOVL (0*4)(BP), R8  // a = H0
   569		MOVL (1*4)(BP), R9  // b = H1
   570		MOVL (2*4)(BP), R10 // c = H2
   571		MOVL (3*4)(BP), R11 // d = H3
   572		MOVL (4*4)(BP), R12 // e = H4
   573		MOVL (5*4)(BP), R13 // f = H5
   574		MOVL (6*4)(BP), R14 // g = H6
   575		MOVL (7*4)(BP), R15 // h = H7
   576	
   577	loop:
   578	MOVQ SP, BP // BP = W, the 64-word message schedule at the bottom of the frame
   579	
   580		SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
   581		SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
   582		SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
   583		SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
   584		SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
   585		SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
   586		SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
   587		SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
   588		SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
   589		SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
   590		SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
   591		SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
   592		SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
   593		SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
   594		SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
   595		SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)
   596	
   597		SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
   598		SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
   599		SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
   600		SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
   601		SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
   602		SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
   603		SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
   604		SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
   605		SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
   606		SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
   607		SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
   608		SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
   609		SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
   610		SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
   611		SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
   612		SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
   613		SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
   614		SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
   615		SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
   616		SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
   617		SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
   618		SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
   619		SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
   620		SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
   621		SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
   622		SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
   623		SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
   624		SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
   625		SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
   626		SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
   627		SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
   628		SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
   629		SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
   630		SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
   631		SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
   632		SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
   633		SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
   634		SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
   635		SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
   636		SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
   637		SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
   638		SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
   639		SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
   640		SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
   641		SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
   642		SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
   643		SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
   644		SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)
   645	
   646		MOVQ dig+0(FP), BP
   647		ADDL (0*4)(BP), R8  // H0 = a + H0
   648		MOVL R8, (0*4)(BP)
   649		ADDL (1*4)(BP), R9  // H1 = b + H1
   650		MOVL R9, (1*4)(BP)
   651		ADDL (2*4)(BP), R10 // H2 = c + H2
   652		MOVL R10, (2*4)(BP)
   653		ADDL (3*4)(BP), R11 // H3 = d + H3
   654		MOVL R11, (3*4)(BP)
   655		ADDL (4*4)(BP), R12 // H4 = e + H4
   656		MOVL R12, (4*4)(BP)
   657		ADDL (5*4)(BP), R13 // H5 = f + H5
   658		MOVL R13, (5*4)(BP)
   659		ADDL (6*4)(BP), R14 // H6 = g + H6
   660		MOVL R14, (6*4)(BP)
   661		ADDL (7*4)(BP), R15 // H7 = h + H7
   662		MOVL R15, (7*4)(BP)
   663	
   664		ADDQ $64, SI
   665		CMPQ SI, 256(SP)
   666		JB   loop
   667	
   668	end:
   669		RET
   670	
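// AVX2 path. Input is consumed two 64-byte blocks at a time: the message
// schedule and the K+W sums for both blocks are computed together and saved
// in the XFER area on the stack, and the second block is then hashed in
// avx2_loop3 from the saved values without recomputing the schedule.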
   671	avx2:
   672		MOVQ dig+0(FP), CTX          // d.h[8]
   673		MOVQ p_base+8(FP), INP
   674		MOVQ p_len+16(FP), NUM_BYTES
   675	
   676		LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
   677		MOVQ NUM_BYTES, _INP_END(SP)
   678	
   679		CMPQ NUM_BYTES, INP
   680		JE   avx2_only_one_block
   681	
   682		// Load initial digest
   683		MOVL 0(CTX), a  // a = H0
   684		MOVL 4(CTX), b  // b = H1
   685		MOVL 8(CTX), c  // c = H2
   686		MOVL 12(CTX), d // d = H3
   687		MOVL 16(CTX), e // e = H4
   688		MOVL 20(CTX), f // f = H5
   689		MOVL 24(CTX), g // g = H6
   690		MOVL 28(CTX), h // h = H7
   691	
   692	avx2_loop0: // each iteration loads two 64-byte blocks and hashes both (the second one via avx2_loop3)
   693	
   694		VMOVDQU (0*32)(INP), XTMP0
   695		VMOVDQU (1*32)(INP), XTMP1
   696		VMOVDQU (2*32)(INP), XTMP2
   697		VMOVDQU (3*32)(INP), XTMP3
   698	
   699		VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   700	
   701		// Apply Byte Flip Mask: LE -> BE
   702		VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
   703		VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
   704		VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
   705		VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
   706	
   707		// Transpose data into high/low parts
   708		VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
   709		VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
   710		VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
   711		VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
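
	// After the transposition, each 256-bit register holds the same four message
	// words for both blocks: the low 128-bit lane comes from the first 64-byte
	// block and the high lane from the second, so one pass over the schedule
	// serves both blocks.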
   712	
   713	MOVQ $K256<>(SB), TBL // load the address of the round-constant table
   714	
   715	avx2_last_block_enter:
   716		ADDQ $64, INP
   717		MOVQ INP, _INP(SP)
   718		XORQ SRND, SRND
   719	
   720	avx2_loop1: // for w0 - w47
   721		// Do 4 rounds and scheduling
   722		VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   723		VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   724		ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   725		ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   726		ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   727		ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
   728	
   729		// Do 4 rounds and scheduling
   730		VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   731		VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   732		ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   733		ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   734		ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   735		ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
   736	
   737		// Do 4 rounds and scheduling
   738		VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
   739		VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
   740		ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   741		ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   742		ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   743		ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
   744	
   745		// Do 4 rounds and scheduling
   746		VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
   747		VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
   748		ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   749		ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   750		ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   751		ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
   752	
   753		ADDQ $4*32, SRND
   754		CMPQ SRND, $3*4*32
   755		JB   avx2_loop1
   756	
   757	avx2_loop2:
   758		// w48 - w63 processed with no scheduling (last 16 rounds)
   759		VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
   760		VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
   761		DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
   762		DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
   763		DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
   764		DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
   765	
   766		VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
   767		VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
   768		DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
   769		DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
   770		DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
   771		DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
   772	
   773		ADDQ $2*32, SRND
   774	
   775		VMOVDQU XDWORD2, XDWORD0
   776		VMOVDQU XDWORD3, XDWORD1
   777	
   778		CMPQ SRND, $4*4*32
   779		JB   avx2_loop2
   780	
   781		MOVQ dig+0(FP), CTX // d.h[8]
   782		MOVQ _INP(SP), INP
   783	
   784		addm(  0(CTX), a)
   785		addm(  4(CTX), b)
   786		addm(  8(CTX), c)
   787		addm( 12(CTX), d)
   788		addm( 16(CTX), e)
   789		addm( 20(CTX), f)
   790		addm( 24(CTX), g)
   791		addm( 28(CTX), h)
   792	
   793		CMPQ _INP_END(SP), INP
   794		JB   done_hash
   795	
   796		XORQ SRND, SRND
   797	
   798	avx2_loop3: // Do second block using previously scheduled results
   799		DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
   800		DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
   801		DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
   802		DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
   803	
   804		DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
   805		DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
   806		DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
   807		DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
   808	
   809		ADDQ $2*32, SRND
   810		CMPQ SRND, $4*4*32
   811		JB   avx2_loop3
   812	
   813		MOVQ dig+0(FP), CTX // d.h[8]
   814		MOVQ _INP(SP), INP
   815		ADDQ $64, INP
   816	
   817		addm(  0(CTX), a)
   818		addm(  4(CTX), b)
   819		addm(  8(CTX), c)
   820		addm( 12(CTX), d)
   821		addm( 16(CTX), e)
   822		addm( 20(CTX), f)
   823		addm( 24(CTX), g)
   824		addm( 28(CTX), h)
   825	
   826		CMPQ _INP_END(SP), INP
   827		JA   avx2_loop0
   828		JB   done_hash
   829	
   830	avx2_do_last_block:
   831	
   832		VMOVDQU 0(INP), XWORD0
   833		VMOVDQU 16(INP), XWORD1
   834		VMOVDQU 32(INP), XWORD2
   835		VMOVDQU 48(INP), XWORD3
   836	
   837		VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
   838	
   839		VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
   840		VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
   841		VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
   842		VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
   843	
   844		MOVQ $K256<>(SB), TBL
   845	
   846		JMP avx2_last_block_enter
   847	
   848	avx2_only_one_block:
   849		// Load initial digest
   850		MOVL 0(CTX), a  // a = H0
   851		MOVL 4(CTX), b  // b = H1
   852		MOVL 8(CTX), c  // c = H2
   853		MOVL 12(CTX), d // d = H3
   854		MOVL 16(CTX), e // e = H4
   855		MOVL 20(CTX), f // f = H5
   856		MOVL 24(CTX), g // g = H6
   857		MOVL 28(CTX), h // h = H7
   858	
   859		JMP avx2_do_last_block
   860	
   861	done_hash:
   862		VZEROUPPER
   863		RET
   864	
   865	// shuffle byte order from LE to BE
   866	DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
   867	DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
   868	DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
   869	DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
   870	GLOBL flip_mask<>(SB), 8, $32
   871	
   872	// shuffle xBxA -> 00BA
   873	DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
   874	DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
   875	DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
   876	DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   877	GLOBL shuff_00BA<>(SB), 8, $32
   878	
   879	// shuffle xDxC -> DC00
   880	DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
   881	DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
   882	DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   883	DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
   884	GLOBL shuff_DC00<>(SB), 8, $32
   885	
   886	// Round specific constants
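// Each group of four constants appears twice, once per 128-bit lane, so that a
// single 256-bit VPADDD adds the same constants to the corresponding message
// words of both interleaved blocks.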
   887	DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
   888	DATA K256<>+0x04(SB)/4, $0x71374491 // k2
   889	DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
   890	DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
   891	DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
   892	DATA K256<>+0x14(SB)/4, $0x71374491 // k2
   893	DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
   894	DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
   895	
   896	DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
   897	DATA K256<>+0x24(SB)/4, $0x59f111f1
   898	DATA K256<>+0x28(SB)/4, $0x923f82a4
   899	DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
   900	DATA K256<>+0x30(SB)/4, $0x3956c25b
   901	DATA K256<>+0x34(SB)/4, $0x59f111f1
   902	DATA K256<>+0x38(SB)/4, $0x923f82a4
   903	DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
   904	
   905	DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
   906	DATA K256<>+0x44(SB)/4, $0x12835b01
   907	DATA K256<>+0x48(SB)/4, $0x243185be
   908	DATA K256<>+0x4c(SB)/4, $0x550c7dc3
   909	DATA K256<>+0x50(SB)/4, $0xd807aa98
   910	DATA K256<>+0x54(SB)/4, $0x12835b01
   911	DATA K256<>+0x58(SB)/4, $0x243185be
   912	DATA K256<>+0x5c(SB)/4, $0x550c7dc3
   913	
   914	DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
   915	DATA K256<>+0x64(SB)/4, $0x80deb1fe
   916	DATA K256<>+0x68(SB)/4, $0x9bdc06a7
   917	DATA K256<>+0x6c(SB)/4, $0xc19bf174
   918	DATA K256<>+0x70(SB)/4, $0x72be5d74
   919	DATA K256<>+0x74(SB)/4, $0x80deb1fe
   920	DATA K256<>+0x78(SB)/4, $0x9bdc06a7
   921	DATA K256<>+0x7c(SB)/4, $0xc19bf174
   922	
   923	DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
   924	DATA K256<>+0x84(SB)/4, $0xefbe4786
   925	DATA K256<>+0x88(SB)/4, $0x0fc19dc6
   926	DATA K256<>+0x8c(SB)/4, $0x240ca1cc
   927	DATA K256<>+0x90(SB)/4, $0xe49b69c1
   928	DATA K256<>+0x94(SB)/4, $0xefbe4786
   929	DATA K256<>+0x98(SB)/4, $0x0fc19dc6
   930	DATA K256<>+0x9c(SB)/4, $0x240ca1cc
   931	
   932	DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
   933	DATA K256<>+0xa4(SB)/4, $0x4a7484aa
   934	DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
   935	DATA K256<>+0xac(SB)/4, $0x76f988da
   936	DATA K256<>+0xb0(SB)/4, $0x2de92c6f
   937	DATA K256<>+0xb4(SB)/4, $0x4a7484aa
   938	DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
   939	DATA K256<>+0xbc(SB)/4, $0x76f988da
   940	
   941	DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
   942	DATA K256<>+0xc4(SB)/4, $0xa831c66d
   943	DATA K256<>+0xc8(SB)/4, $0xb00327c8
   944	DATA K256<>+0xcc(SB)/4, $0xbf597fc7
   945	DATA K256<>+0xd0(SB)/4, $0x983e5152
   946	DATA K256<>+0xd4(SB)/4, $0xa831c66d
   947	DATA K256<>+0xd8(SB)/4, $0xb00327c8
   948	DATA K256<>+0xdc(SB)/4, $0xbf597fc7
   949	
   950	DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
   951	DATA K256<>+0xe4(SB)/4, $0xd5a79147
   952	DATA K256<>+0xe8(SB)/4, $0x06ca6351
   953	DATA K256<>+0xec(SB)/4, $0x14292967
   954	DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
   955	DATA K256<>+0xf4(SB)/4, $0xd5a79147
   956	DATA K256<>+0xf8(SB)/4, $0x06ca6351
   957	DATA K256<>+0xfc(SB)/4, $0x14292967
   958	
   959	DATA K256<>+0x100(SB)/4, $0x27b70a85
   960	DATA K256<>+0x104(SB)/4, $0x2e1b2138
   961	DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
   962	DATA K256<>+0x10c(SB)/4, $0x53380d13
   963	DATA K256<>+0x110(SB)/4, $0x27b70a85
   964	DATA K256<>+0x114(SB)/4, $0x2e1b2138
   965	DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
   966	DATA K256<>+0x11c(SB)/4, $0x53380d13
   967	
   968	DATA K256<>+0x120(SB)/4, $0x650a7354
   969	DATA K256<>+0x124(SB)/4, $0x766a0abb
   970	DATA K256<>+0x128(SB)/4, $0x81c2c92e
   971	DATA K256<>+0x12c(SB)/4, $0x92722c85
   972	DATA K256<>+0x130(SB)/4, $0x650a7354
   973	DATA K256<>+0x134(SB)/4, $0x766a0abb
   974	DATA K256<>+0x138(SB)/4, $0x81c2c92e
   975	DATA K256<>+0x13c(SB)/4, $0x92722c85
   976	
   977	DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
   978	DATA K256<>+0x144(SB)/4, $0xa81a664b
   979	DATA K256<>+0x148(SB)/4, $0xc24b8b70
   980	DATA K256<>+0x14c(SB)/4, $0xc76c51a3
   981	DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
   982	DATA K256<>+0x154(SB)/4, $0xa81a664b
   983	DATA K256<>+0x158(SB)/4, $0xc24b8b70
   984	DATA K256<>+0x15c(SB)/4, $0xc76c51a3
   985	
   986	DATA K256<>+0x160(SB)/4, $0xd192e819
   987	DATA K256<>+0x164(SB)/4, $0xd6990624
   988	DATA K256<>+0x168(SB)/4, $0xf40e3585
   989	DATA K256<>+0x16c(SB)/4, $0x106aa070
   990	DATA K256<>+0x170(SB)/4, $0xd192e819
   991	DATA K256<>+0x174(SB)/4, $0xd6990624
   992	DATA K256<>+0x178(SB)/4, $0xf40e3585
   993	DATA K256<>+0x17c(SB)/4, $0x106aa070
   994	
   995	DATA K256<>+0x180(SB)/4, $0x19a4c116
   996	DATA K256<>+0x184(SB)/4, $0x1e376c08
   997	DATA K256<>+0x188(SB)/4, $0x2748774c
   998	DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
   999	DATA K256<>+0x190(SB)/4, $0x19a4c116
  1000	DATA K256<>+0x194(SB)/4, $0x1e376c08
  1001	DATA K256<>+0x198(SB)/4, $0x2748774c
  1002	DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
  1003	
  1004	DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
  1005	DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
  1006	DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
  1007	DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
  1008	DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
  1009	DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
  1010	DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
  1011	DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
  1012	
  1013	DATA K256<>+0x1c0(SB)/4, $0x748f82ee
  1014	DATA K256<>+0x1c4(SB)/4, $0x78a5636f
  1015	DATA K256<>+0x1c8(SB)/4, $0x84c87814
  1016	DATA K256<>+0x1cc(SB)/4, $0x8cc70208
  1017	DATA K256<>+0x1d0(SB)/4, $0x748f82ee
  1018	DATA K256<>+0x1d4(SB)/4, $0x78a5636f
  1019	DATA K256<>+0x1d8(SB)/4, $0x84c87814
  1020	DATA K256<>+0x1dc(SB)/4, $0x8cc70208
  1021	
  1022	DATA K256<>+0x1e0(SB)/4, $0x90befffa
  1023	DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
  1024	DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
  1025	DATA K256<>+0x1ec(SB)/4, $0xc67178f2
  1026	DATA K256<>+0x1f0(SB)/4, $0x90befffa
  1027	DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
  1028	DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
  1029	DATA K256<>+0x1fc(SB)/4, $0xc67178f2
  1030	
  1031	GLOBL K256<>(SB), (NOPTR + RODATA), $512
