Text file src/crypto/sha512/sha512block_amd64.s

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	// SHA512 block routine. See sha512block.go for Go equivalent.
     8	//
     9	// The algorithm is detailed in FIPS 180-4:
    10	//
    11	//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    12	//
    13	// Wt = Mt; for 0 <= t <= 15
    14	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    15	//
    16	// a = H0
    17	// b = H1
    18	// c = H2
    19	// d = H3
    20	// e = H4
    21	// f = H5
    22	// g = H6
    23	// h = H7
    24	//
    25	// for t = 0 to 79 {
    26	//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    27	//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    28	//    h = g
    29	//    g = f
    30	//    f = e
    31	//    e = d + T1
    32	//    d = c
    33	//    c = b
    34	//    b = a
    35	//    a = T1 + T2
    36	// }
    37	//
    38	// H0 = a + H0
    39	// H1 = b + H1
    40	// H2 = c + H2
    41	// H3 = d + H3
    42	// H4 = e + H4
    43	// H5 = f + H5
    44	// H6 = g + H6
    45	// H7 = h + H7
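//
// As a rough Go-style sketch (hand-written here for illustration only, not the
// actual sha512block.go; p is one 128-byte block, H the eight digest words,
// K the round-constant table, and the sigma helpers are assumed):
//
//	var w [80]uint64
//	for t := 0; t < 16; t++ {
//		w[t] = binary.BigEndian.Uint64(p[t*8:])
//	}
//	for t := 16; t < 80; t++ {
//		w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
//	}
//	a, b, c, d, e, f, g, h := H[0], H[1], H[2], H[3], H[4], H[5], H[6], H[7]
//	for t := 0; t < 80; t++ {
//		t1 := h + bigSigma1(e) + ((e & f) ^ (^e & g)) + K[t] + w[t]
//		t2 := bigSigma0(a) + ((a & b) ^ (a & c) ^ (b & c))
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}
//	H[0] += a; H[1] += b; H[2] += c; H[3] += d
//	H[4] += e; H[5] += f; H[6] += g; H[7] += h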
    46	
    47	// Wt = Mt; for 0 <= t <= 15
    48	#define MSGSCHEDULE0(index) \
    49		MOVQ	(index*8)(SI), AX; \
    50		BSWAPQ	AX; \
    51		MOVQ	AX, (index*8)(BP)
    52	
    53	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    54	//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
    55	//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
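//
// In Go-style terms these are (illustrative sketch using math/bits; the
// function names are ours, not from sha512block.go):
//
//	func sigma0(x uint64) uint64 {
//		return bits.RotateLeft64(x, -1) ^ bits.RotateLeft64(x, -8) ^ (x >> 7)
//	}
//	func sigma1(x uint64) uint64 {
//		return bits.RotateLeft64(x, -19) ^ bits.RotateLeft64(x, -61) ^ (x >> 6)
//	}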
    56	#define MSGSCHEDULE1(index) \
    57		MOVQ	((index-2)*8)(BP), AX; \
    58		MOVQ	AX, CX; \
    59		RORQ	$19, AX; \
    60		MOVQ	CX, DX; \
    61		RORQ	$61, CX; \
    62		SHRQ	$6, DX; \
    63		MOVQ	((index-15)*8)(BP), BX; \
    64		XORQ	CX, AX; \
    65		MOVQ	BX, CX; \
    66		XORQ	DX, AX; \
    67		RORQ	$1, BX; \
    68		MOVQ	CX, DX; \
    69		SHRQ	$7, DX; \
    70		RORQ	$8, CX; \
    71		ADDQ	((index-7)*8)(BP), AX; \
    72		XORQ	CX, BX; \
    73		XORQ	DX, BX; \
    74		ADDQ	((index-16)*8)(BP), BX; \
    75		ADDQ	BX, AX; \
    76		MOVQ	AX, ((index)*8)(BP)
    77	
    78	// Calculate T1 in AX - uses AX, CX and DX registers.
    79	// h is also used as an accumulator. Wt is passed in AX.
    80	//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    81	//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
    82	//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
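//
// In Go-style terms (illustrative sketch; names are ours):
//
//	bigSigma1 := func(x uint64) uint64 {
//		return bits.RotateLeft64(x, -14) ^ bits.RotateLeft64(x, -18) ^ bits.RotateLeft64(x, -41)
//	}
//	t1 := h + bigSigma1(e) + ((e & f) ^ (^e & g)) + K[t] + w[t]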
    83	#define SHA512T1(const, e, f, g, h) \
    84		MOVQ	$const, DX; \
    85		ADDQ	AX, h; \
    86		MOVQ	e, AX; \
    87		ADDQ	DX, h; \
    88		MOVQ	e, CX; \
    89		RORQ	$14, AX; \
    90		MOVQ	e, DX; \
    91		RORQ	$18, CX; \
    92		XORQ	CX, AX; \
    93		MOVQ	e, CX; \
    94		RORQ	$41, DX; \
    95		ANDQ	f, CX; \
    96		XORQ	AX, DX; \
    97		MOVQ	e, AX; \
    98		NOTQ	AX; \
    99		ADDQ	DX, h; \
   100		ANDQ	g, AX; \
   101		XORQ	CX, AX; \
   102		ADDQ	h, AX
   103	
   104	// Calculate T2 in BX - uses BX, CX, DX and DI registers.
   105	//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   106	//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
   107	//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
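//
// In Go-style terms (illustrative sketch; names are ours):
//
//	bigSigma0 := func(x uint64) uint64 {
//		return bits.RotateLeft64(x, -28) ^ bits.RotateLeft64(x, -34) ^ bits.RotateLeft64(x, -39)
//	}
//	t2 := bigSigma0(a) + ((a & b) ^ (a & c) ^ (b & c))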
   108	#define SHA512T2(a, b, c) \
   109		MOVQ	a, DI; \
   110		MOVQ	c, BX; \
   111		RORQ	$28, DI; \
   112		MOVQ	a, DX; \
   113		ANDQ	b, BX; \
   114		RORQ	$34, DX; \
   115		MOVQ	a, CX; \
   116		ANDQ	c, CX; \
   117		XORQ	DX, DI; \
   118		XORQ	CX, BX; \
   119		MOVQ	a, DX; \
   120		MOVQ	b, CX; \
   121		RORQ	$39, DX; \
   122		ANDQ	a, CX; \
   123		XORQ	CX, BX; \
   124		XORQ	DX, DI; \
   125		ADDQ	DI, BX
   126	
   127	// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   128	// The values for e and a are stored in d and h, ready for rotation.
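// No data is moved to rotate a..h: successive SHA512ROUND invocations below
// simply permute the register arguments.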
   129	#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
   130		SHA512T1(const, e, f, g, h); \
   131		SHA512T2(a, b, c); \
   132		MOVQ	BX, h; \
   133		ADDQ	AX, d; \
   134		ADDQ	AX, h
   135	
   136	#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
   137		MSGSCHEDULE0(index); \
   138		SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   139	
   140	#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
   141		MSGSCHEDULE1(index); \
   142		SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   143	
   144	TEXT ·blockAMD64(SB),0,$648-32
   145		MOVQ	p_base+8(FP), SI
   146		MOVQ	p_len+16(FP), DX
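	// Round the length down to a multiple of the 128-byte block size.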
   147		SHRQ	$7, DX
   148		SHLQ	$7, DX
   149	
   150		LEAQ	(SI)(DX*1), DI
   151		MOVQ	DI, 640(SP)
   152		CMPQ	SI, DI
   153		JEQ	end
   154	
   155		MOVQ	dig+0(FP), BP
   156		MOVQ	(0*8)(BP), R8		// a = H0
   157		MOVQ	(1*8)(BP), R9		// b = H1
   158		MOVQ	(2*8)(BP), R10		// c = H2
   159		MOVQ	(3*8)(BP), R11		// d = H3
   160		MOVQ	(4*8)(BP), R12		// e = H4
   161		MOVQ	(5*8)(BP), R13		// f = H5
   162		MOVQ	(6*8)(BP), R14		// g = H6
   163		MOVQ	(7*8)(BP), R15		// h = H7
   164	
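	// Process one 128-byte block per iteration; all 80 rounds are unrolled below.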
   165	loop:
   166		MOVQ	SP, BP			// message schedule
   167	
   168		SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
   169		SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
   170		SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
   171		SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
   172		SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
   173		SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
   174		SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
   175		SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
   176		SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
   177		SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
   178		SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
   179		SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
   180		SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
   181		SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
   182		SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
   183		SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
   184	
   185		SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
   186		SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
   187		SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
   188		SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
   189		SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
   190		SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
   191		SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
   192		SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
   193		SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
   194		SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
   195		SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
   196		SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
   197		SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
   198		SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
   199		SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
   200		SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
   201		SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
   202		SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
   203		SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
   204		SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
   205		SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
   206		SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
   207		SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
   208		SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
   209		SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
   210		SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
   211		SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
   212		SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
   213		SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
   214		SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
   215		SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
   216		SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
   217		SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
   218		SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
   219		SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
   220		SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
   221		SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
   222		SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
   223		SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
   224		SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
   225		SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
   226		SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
   227		SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
   228		SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
   229		SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
   230		SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
   231		SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
   232		SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
   233		SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
   234		SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
   235		SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
   236		SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
   237		SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
   238		SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
   239		SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
   240		SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
   241		SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
   242		SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
   243		SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
   244		SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
   245		SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
   246		SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
   247		SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
   248		SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
   249	
   250		MOVQ	dig+0(FP), BP
   251		ADDQ	(0*8)(BP), R8	// H0 = a + H0
   252		MOVQ	R8, (0*8)(BP)
   253		ADDQ	(1*8)(BP), R9	// H1 = b + H1
   254		MOVQ	R9, (1*8)(BP)
   255		ADDQ	(2*8)(BP), R10	// H2 = c + H2
   256		MOVQ	R10, (2*8)(BP)
   257		ADDQ	(3*8)(BP), R11	// H3 = d + H3
   258		MOVQ	R11, (3*8)(BP)
   259		ADDQ	(4*8)(BP), R12	// H4 = e + H4
   260		MOVQ	R12, (4*8)(BP)
   261		ADDQ	(5*8)(BP), R13	// H5 = f + H5
   262		MOVQ	R13, (5*8)(BP)
   263		ADDQ	(6*8)(BP), R14	// H6 = g + H6
   264		MOVQ	R14, (6*8)(BP)
   265		ADDQ	(7*8)(BP), R15	// H7 = h + H7
   266		MOVQ	R15, (7*8)(BP)
   267	
   268		ADDQ	$128, SI
   269		CMPQ	SI, 640(SP)
   270		JB	loop
   271	
   272	end:
   273		RET
   274	
   275	// Version below is based on "Fast SHA512 Implementations on Intel
   276	// Architecture Processors" white paper
   277	// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
   278	// AVX2 version by Intel, same algorithm in Linux kernel:
   279	// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
   280	
   281	// James Guilford <james.guilford@intel.com>
   282	// Kirk Yap <kirk.s.yap@intel.com>
   283	// Tim Chen <tim.c.chen@linux.intel.com>
   284	// David Cote <david.m.cote@intel.com>
   285	// Aleksey Sidorov <aleksey.sidorov@intel.com>
   286	
   287	#define YFER_SIZE (4*8)
   288	#define SRND_SIZE (1*8)
   289	#define INP_SIZE (1*8)
   290	
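// Offsets of the spill slots in the local stack frame: the K+W values for the
// current group of rounds, the round-group counter, the saved input pointer,
// and the end-of-input pointer.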
   291	#define frame_YFER (0)
   292	#define frame_SRND (frame_YFER + YFER_SIZE)
   293	#define frame_INP (frame_SRND + SRND_SIZE)
   294	#define frame_INPEND (frame_INP + INP_SIZE)
   295	
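// addm adds the 64-bit value at memory operand p1 into register p2 and writes
// the sum back to p1.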
   296	#define addm(p1, p2) \
   297		ADDQ p1, p2; \
   298		MOVQ p2, p1
   299	
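// COPY_YMM_AND_BSWAP loads 32 bytes from p2 into YMM register p1 and byte-swaps
// each 64-bit lane using the shuffle mask p3.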
   300	#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
   301		VMOVDQU p2, p1;    \
   302		VPSHUFB p3, p1, p1
   303	
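// MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL): YDST = {YSRC1, YSRC2} >> RVAL bytes,
// a lane-crossing right shift of the concatenated registers; used below to
// extract the W[t-7] and W[t-15] windows from the scheduled words.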
   304	#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
   305		VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
   306		VPALIGNR   $RVAL, YSRC2, YDST, YDST
   307	
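// VPSHUFB mask that reverses the bytes within each 64-bit lane, converting the
// big-endian message words to host (little-endian) order.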
   308	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
   309	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
   310	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
   311	DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
   312	
   313	GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
   314	
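// Zero in the low 128 bits, all-ones in the high 128 bits: ANDing with this
// mask clears the lower two qwords of a register.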
   315	DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
   316	DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
   317	DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   318	DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   319	
   320	GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
   321	
   322	TEXT ·blockAVX2(SB), NOSPLIT, $56-32
   323		MOVQ dig+0(FP), SI
   324		MOVQ p_base+8(FP), DI
   325		MOVQ p_len+16(FP), DX
   326	
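	// Round the length down to a multiple of the 128-byte block size;
	// SHLQ leaves ZF set when no complete block remains.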
   327		SHRQ $7, DX
   328		SHLQ $7, DX
   329	
   330		JZ   done_hash
   331		ADDQ DI, DX
   332		MOVQ DX, frame_INPEND(SP)
   333	
   334		MOVQ (0*8)(SI), AX
   335		MOVQ (1*8)(SI), BX
   336		MOVQ (2*8)(SI), CX
   337		MOVQ (3*8)(SI), R8
   338		MOVQ (4*8)(SI), DX
   339		MOVQ (5*8)(SI), R9
   340		MOVQ (6*8)(SI), R10
   341		MOVQ (7*8)(SI), R11
   342	
   343		VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
   344	
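	// loop0 processes one 128-byte block per iteration; each iteration
	// reloads BP with the address of the round-constant table ·_K.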
   345	loop0:
   346		MOVQ ·_K+0(SB), BP
   347	
   348		// byte swap the first 16 qwords of the message block
   349		COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
   350		COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
   351		COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
   352		COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
   353	
   354		MOVQ DI, frame_INP(SP)
   355	
   356		// schedule the remaining 64 message qwords while doing the first 64 rounds: 4 passes of loop1, 16 rounds and 16 new qwords per pass
   357		MOVQ $4, frame_SRND(SP)
   358	
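	// Each pass below handles Y4..Y7 in turn: the round constants are added
	// to the scheduled words and spilled to frame_YFER, four scalar rounds
	// consume them, and the vector unit computes the next four message qwords
	// in parallel.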
   359	loop1:
   360		VPADDQ  (BP), Y4, Y0
   361		VMOVDQU Y0, frame_YFER(SP)
   362	
   363		MY_VPALIGNR(Y0, Y7, Y6, 8)
   364	
   365		VPADDQ Y4, Y0, Y0
   366	
   367		MY_VPALIGNR(Y1, Y5, Y4, 8)
   368	
   369		VPSRLQ $1, Y1, Y2
   370		VPSLLQ $(64-1), Y1, Y3
   371		VPOR   Y2, Y3, Y3
   372	
   373		VPSRLQ $7, Y1, Y8
   374	
   375		MOVQ  AX, DI
   376		RORXQ $41, DX, R13
   377		RORXQ $18, DX, R14
   378		ADDQ  frame_YFER(SP), R11
   379		ORQ   CX, DI
   380		MOVQ  R9, R15
   381		RORXQ $34, AX, R12
   382	
   383		XORQ  R14, R13
   384		XORQ  R10, R15
   385		RORXQ $14, DX, R14
   386	
   387		ANDQ  DX, R15
   388		XORQ  R14, R13
   389		RORXQ $39, AX, R14
   390		ADDQ  R11, R8
   391	
   392		ANDQ  BX, DI
   393		XORQ  R12, R14
   394		RORXQ $28, AX, R12
   395	
   396		XORQ R10, R15
   397		XORQ R12, R14
   398		MOVQ AX, R12
   399		ANDQ CX, R12
   400	
   401		ADDQ R13, R15
   402		ORQ  R12, DI
   403		ADDQ R14, R11
   404	
   405		ADDQ R15, R8
   406	
   407		ADDQ R15, R11
   408		ADDQ DI, R11
   409	
   410		VPSRLQ $8, Y1, Y2
   411		VPSLLQ $(64-8), Y1, Y1
   412		VPOR   Y2, Y1, Y1
   413	
   414		VPXOR Y8, Y3, Y3
   415		VPXOR Y1, Y3, Y1
   416	
   417		VPADDQ Y1, Y0, Y0
   418	
   419		VPERM2F128 $0x0, Y0, Y0, Y4
   420	
   421		VPAND MASK_YMM_LO<>(SB), Y0, Y0
   422	
   423		VPERM2F128 $0x11, Y7, Y7, Y2
   424		VPSRLQ     $6, Y2, Y8
   425	
   426		MOVQ  R11, DI
   427		RORXQ $41, R8, R13
   428		RORXQ $18, R8, R14
   429		ADDQ  1*8+frame_YFER(SP), R10
   430		ORQ   BX, DI
   431	
   432		MOVQ  DX, R15
   433		RORXQ $34, R11, R12
   434		XORQ  R14, R13
   435		XORQ  R9, R15
   436	
   437		RORXQ $14, R8, R14
   438		XORQ  R14, R13
   439		RORXQ $39, R11, R14
   440		ANDQ  R8, R15
   441		ADDQ  R10, CX
   442	
   443		ANDQ AX, DI
   444		XORQ R12, R14
   445	
   446		RORXQ $28, R11, R12
   447		XORQ  R9, R15
   448	
   449		XORQ R12, R14
   450		MOVQ R11, R12
   451		ANDQ BX, R12
   452		ADDQ R13, R15
   453	
   454		ORQ  R12, DI
   455		ADDQ R14, R10
   456	
   457		ADDQ R15, CX
   458		ADDQ R15, R10
   459		ADDQ DI, R10
   460	
   461		VPSRLQ $19, Y2, Y3
   462		VPSLLQ $(64-19), Y2, Y1
   463		VPOR   Y1, Y3, Y3
   464		VPXOR  Y3, Y8, Y8
   465		VPSRLQ $61, Y2, Y3
   466		VPSLLQ $(64-61), Y2, Y1
   467		VPOR   Y1, Y3, Y3
   468		VPXOR  Y3, Y8, Y8
   469	
   470		VPADDQ Y8, Y4, Y4
   471	
   472		VPSRLQ $6, Y4, Y8
   473	
   474		MOVQ  R10, DI
   475		RORXQ $41, CX, R13
   476		ADDQ  2*8+frame_YFER(SP), R9
   477	
   478		RORXQ $18, CX, R14
   479		ORQ   AX, DI
   480		MOVQ  R8, R15
   481		XORQ  DX, R15
   482	
   483		RORXQ $34, R10, R12
   484		XORQ  R14, R13
   485		ANDQ  CX, R15
   486	
   487		RORXQ $14, CX, R14
   488		ADDQ  R9, BX
   489		ANDQ  R11, DI
   490	
   491		XORQ  R14, R13
   492		RORXQ $39, R10, R14
   493		XORQ  DX, R15
   494	
   495		XORQ  R12, R14
   496		RORXQ $28, R10, R12
   497	
   498		XORQ R12, R14
   499		MOVQ R10, R12
   500		ANDQ AX, R12
   501		ADDQ R13, R15
   502	
   503		ORQ  R12, DI
   504		ADDQ R14, R9
   505		ADDQ R15, BX
   506		ADDQ R15, R9
   507	
   508		ADDQ DI, R9
   509	
   510		VPSRLQ $19, Y4, Y3
   511		VPSLLQ $(64-19), Y4, Y1
   512		VPOR   Y1, Y3, Y3
   513		VPXOR  Y3, Y8, Y8
   514		VPSRLQ $61, Y4, Y3
   515		VPSLLQ $(64-61), Y4, Y1
   516		VPOR   Y1, Y3, Y3
   517		VPXOR  Y3, Y8, Y8
   518	
   519		VPADDQ Y8, Y0, Y2
   520	
   521		VPBLENDD $0xF0, Y2, Y4, Y4
   522	
   523		MOVQ  R9, DI
   524		RORXQ $41, BX, R13
   525		RORXQ $18, BX, R14
   526		ADDQ  3*8+frame_YFER(SP), DX
   527		ORQ   R11, DI
   528	
   529		MOVQ  CX, R15
   530		RORXQ $34, R9, R12
   531		XORQ  R14, R13
   532		XORQ  R8, R15
   533	
   534		RORXQ $14, BX, R14
   535		ANDQ  BX, R15
   536		ADDQ  DX, AX
   537		ANDQ  R10, DI
   538	
   539		XORQ R14, R13
   540		XORQ R8, R15
   541	
   542		RORXQ $39, R9, R14
   543		ADDQ  R13, R15
   544	
   545		XORQ R12, R14
   546		ADDQ R15, AX
   547	
   548		RORXQ $28, R9, R12
   549	
   550		XORQ R12, R14
   551		MOVQ R9, R12
   552		ANDQ R11, R12
   553		ORQ  R12, DI
   554	
   555		ADDQ R14, DX
   556		ADDQ R15, DX
   557		ADDQ DI, DX
   558	
   559		VPADDQ  1*32(BP), Y5, Y0
   560		VMOVDQU Y0, frame_YFER(SP)
   561	
   562		MY_VPALIGNR(Y0, Y4, Y7, 8)
   563	
   564		VPADDQ Y5, Y0, Y0
   565	
   566		MY_VPALIGNR(Y1, Y6, Y5, 8)
   567	
   568		VPSRLQ $1, Y1, Y2
   569		VPSLLQ $(64-1), Y1, Y3
   570		VPOR   Y2, Y3, Y3
   571	
   572		VPSRLQ $7, Y1, Y8
   573	
   574		MOVQ  DX, DI
   575		RORXQ $41, AX, R13
   576		RORXQ $18, AX, R14
   577		ADDQ  frame_YFER(SP), R8
   578		ORQ   R10, DI
   579		MOVQ  BX, R15
   580		RORXQ $34, DX, R12
   581	
   582		XORQ  R14, R13
   583		XORQ  CX, R15
   584		RORXQ $14, AX, R14
   585	
   586		ANDQ  AX, R15
   587		XORQ  R14, R13
   588		RORXQ $39, DX, R14
   589		ADDQ  R8, R11
   590	
   591		ANDQ  R9, DI
   592		XORQ  R12, R14
   593		RORXQ $28, DX, R12
   594	
   595		XORQ CX, R15
   596		XORQ R12, R14
   597		MOVQ DX, R12
   598		ANDQ R10, R12
   599	
   600		ADDQ R13, R15
   601		ORQ  R12, DI
   602		ADDQ R14, R8
   603	
   604		ADDQ R15, R11
   605	
   606		ADDQ R15, R8
   607		ADDQ DI, R8
   608	
   609		VPSRLQ $8, Y1, Y2
   610		VPSLLQ $(64-8), Y1, Y1
   611		VPOR   Y2, Y1, Y1
   612	
   613		VPXOR Y8, Y3, Y3
   614		VPXOR Y1, Y3, Y1
   615	
   616		VPADDQ Y1, Y0, Y0
   617	
   618		VPERM2F128 $0x0, Y0, Y0, Y5
   619	
   620		VPAND MASK_YMM_LO<>(SB), Y0, Y0
   621	
   622		VPERM2F128 $0x11, Y4, Y4, Y2
   623		VPSRLQ     $6, Y2, Y8
   624	
   625		MOVQ  R8, DI
   626		RORXQ $41, R11, R13
   627		RORXQ $18, R11, R14
   628		ADDQ  1*8+frame_YFER(SP), CX
   629		ORQ   R9, DI
   630	
   631		MOVQ  AX, R15
   632		RORXQ $34, R8, R12
   633		XORQ  R14, R13
   634		XORQ  BX, R15
   635	
   636		RORXQ $14, R11, R14
   637		XORQ  R14, R13
   638		RORXQ $39, R8, R14
   639		ANDQ  R11, R15
   640		ADDQ  CX, R10
   641	
   642		ANDQ DX, DI
   643		XORQ R12, R14
   644	
   645		RORXQ $28, R8, R12
   646		XORQ  BX, R15
   647	
   648		XORQ R12, R14
   649		MOVQ R8, R12
   650		ANDQ R9, R12
   651		ADDQ R13, R15
   652	
   653		ORQ  R12, DI
   654		ADDQ R14, CX
   655	
   656		ADDQ R15, R10
   657		ADDQ R15, CX
   658		ADDQ DI, CX
   659	
   660		VPSRLQ $19, Y2, Y3
   661		VPSLLQ $(64-19), Y2, Y1
   662		VPOR   Y1, Y3, Y3
   663		VPXOR  Y3, Y8, Y8
   664		VPSRLQ $61, Y2, Y3
   665		VPSLLQ $(64-61), Y2, Y1
   666		VPOR   Y1, Y3, Y3
   667		VPXOR  Y3, Y8, Y8
   668	
   669		VPADDQ Y8, Y5, Y5
   670	
   671		VPSRLQ $6, Y5, Y8
   672	
   673		MOVQ  CX, DI
   674		RORXQ $41, R10, R13
   675		ADDQ  2*8+frame_YFER(SP), BX
   676	
   677		RORXQ $18, R10, R14
   678		ORQ   DX, DI
   679		MOVQ  R11, R15
   680		XORQ  AX, R15
   681	
   682		RORXQ $34, CX, R12
   683		XORQ  R14, R13
   684		ANDQ  R10, R15
   685	
   686		RORXQ $14, R10, R14
   687		ADDQ  BX, R9
   688		ANDQ  R8, DI
   689	
   690		XORQ  R14, R13
   691		RORXQ $39, CX, R14
   692		XORQ  AX, R15
   693	
   694		XORQ  R12, R14
   695		RORXQ $28, CX, R12
   696	
   697		XORQ R12, R14
   698		MOVQ CX, R12
   699		ANDQ DX, R12
   700		ADDQ R13, R15
   701	
   702		ORQ  R12, DI
   703		ADDQ R14, BX
   704		ADDQ R15, R9
   705		ADDQ R15, BX
   706	
   707		ADDQ DI, BX
   708	
   709		VPSRLQ $19, Y5, Y3
   710		VPSLLQ $(64-19), Y5, Y1
   711		VPOR   Y1, Y3, Y3
   712		VPXOR  Y3, Y8, Y8
   713		VPSRLQ $61, Y5, Y3
   714		VPSLLQ $(64-61), Y5, Y1
   715		VPOR   Y1, Y3, Y3
   716		VPXOR  Y3, Y8, Y8
   717	
   718		VPADDQ Y8, Y0, Y2
   719	
   720		VPBLENDD $0xF0, Y2, Y5, Y5
   721	
   722		MOVQ  BX, DI
   723		RORXQ $41, R9, R13
   724		RORXQ $18, R9, R14
   725		ADDQ  3*8+frame_YFER(SP), AX
   726		ORQ   R8, DI
   727	
   728		MOVQ  R10, R15
   729		RORXQ $34, BX, R12
   730		XORQ  R14, R13
   731		XORQ  R11, R15
   732	
   733		RORXQ $14, R9, R14
   734		ANDQ  R9, R15
   735		ADDQ  AX, DX
   736		ANDQ  CX, DI
   737	
   738		XORQ R14, R13
   739		XORQ R11, R15
   740	
   741		RORXQ $39, BX, R14
   742		ADDQ  R13, R15
   743	
   744		XORQ R12, R14
   745		ADDQ R15, DX
   746	
   747		RORXQ $28, BX, R12
   748	
   749		XORQ R12, R14
   750		MOVQ BX, R12
   751		ANDQ R8, R12
   752		ORQ  R12, DI
   753	
   754		ADDQ R14, AX
   755		ADDQ R15, AX
   756		ADDQ DI, AX
   757	
   758		VPADDQ  2*32(BP), Y6, Y0
   759		VMOVDQU Y0, frame_YFER(SP)
   760	
   761		MY_VPALIGNR(Y0, Y5, Y4, 8)
   762	
   763		VPADDQ Y6, Y0, Y0
   764	
   765		MY_VPALIGNR(Y1, Y7, Y6, 8)
   766	
   767		VPSRLQ $1, Y1, Y2
   768		VPSLLQ $(64-1), Y1, Y3
   769		VPOR   Y2, Y3, Y3
   770	
   771		VPSRLQ $7, Y1, Y8
   772	
   773		MOVQ  AX, DI
   774		RORXQ $41, DX, R13
   775		RORXQ $18, DX, R14
   776		ADDQ  frame_YFER(SP), R11
   777		ORQ   CX, DI
   778		MOVQ  R9, R15
   779		RORXQ $34, AX, R12
   780	
   781		XORQ  R14, R13
   782		XORQ  R10, R15
   783		RORXQ $14, DX, R14
   784	
   785		ANDQ  DX, R15
   786		XORQ  R14, R13
   787		RORXQ $39, AX, R14
   788		ADDQ  R11, R8
   789	
   790		ANDQ  BX, DI
   791		XORQ  R12, R14
   792		RORXQ $28, AX, R12
   793	
   794		XORQ R10, R15
   795		XORQ R12, R14
   796		MOVQ AX, R12
   797		ANDQ CX, R12
   798	
   799		ADDQ R13, R15
   800		ORQ  R12, DI
   801		ADDQ R14, R11
   802	
   803		ADDQ R15, R8
   804	
   805		ADDQ R15, R11
   806		ADDQ DI, R11
   807	
   808		VPSRLQ $8, Y1, Y2
   809		VPSLLQ $(64-8), Y1, Y1
   810		VPOR   Y2, Y1, Y1
   811	
   812		VPXOR Y8, Y3, Y3
   813		VPXOR Y1, Y3, Y1
   814	
   815		VPADDQ Y1, Y0, Y0
   816	
   817		VPERM2F128 $0x0, Y0, Y0, Y6
   818	
   819		VPAND MASK_YMM_LO<>(SB), Y0, Y0
   820	
   821		VPERM2F128 $0x11, Y5, Y5, Y2
   822		VPSRLQ     $6, Y2, Y8
   823	
   824		MOVQ  R11, DI
   825		RORXQ $41, R8, R13
   826		RORXQ $18, R8, R14
   827		ADDQ  1*8+frame_YFER(SP), R10
   828		ORQ   BX, DI
   829	
   830		MOVQ  DX, R15
   831		RORXQ $34, R11, R12
   832		XORQ  R14, R13
   833		XORQ  R9, R15
   834	
   835		RORXQ $14, R8, R14
   836		XORQ  R14, R13
   837		RORXQ $39, R11, R14
   838		ANDQ  R8, R15
   839		ADDQ  R10, CX
   840	
   841		ANDQ AX, DI
   842		XORQ R12, R14
   843	
   844		RORXQ $28, R11, R12
   845		XORQ  R9, R15
   846	
   847		XORQ R12, R14
   848		MOVQ R11, R12
   849		ANDQ BX, R12
   850		ADDQ R13, R15
   851	
   852		ORQ  R12, DI
   853		ADDQ R14, R10
   854	
   855		ADDQ R15, CX
   856		ADDQ R15, R10
   857		ADDQ DI, R10
   858	
   859		VPSRLQ $19, Y2, Y3
   860		VPSLLQ $(64-19), Y2, Y1
   861		VPOR   Y1, Y3, Y3
   862		VPXOR  Y3, Y8, Y8
   863		VPSRLQ $61, Y2, Y3
   864		VPSLLQ $(64-61), Y2, Y1
   865		VPOR   Y1, Y3, Y3
   866		VPXOR  Y3, Y8, Y8
   867	
   868		VPADDQ Y8, Y6, Y6
   869	
   870		VPSRLQ $6, Y6, Y8
   871	
   872		MOVQ  R10, DI
   873		RORXQ $41, CX, R13
   874		ADDQ  2*8+frame_YFER(SP), R9
   875	
   876		RORXQ $18, CX, R14
   877		ORQ   AX, DI
   878		MOVQ  R8, R15
   879		XORQ  DX, R15
   880	
   881		RORXQ $34, R10, R12
   882		XORQ  R14, R13
   883		ANDQ  CX, R15
   884	
   885		RORXQ $14, CX, R14
   886		ADDQ  R9, BX
   887		ANDQ  R11, DI
   888	
   889		XORQ  R14, R13
   890		RORXQ $39, R10, R14
   891		XORQ  DX, R15
   892	
   893		XORQ  R12, R14
   894		RORXQ $28, R10, R12
   895	
   896		XORQ R12, R14
   897		MOVQ R10, R12
   898		ANDQ AX, R12
   899		ADDQ R13, R15
   900	
   901		ORQ  R12, DI
   902		ADDQ R14, R9
   903		ADDQ R15, BX
   904		ADDQ R15, R9
   905	
   906		ADDQ DI, R9
   907	
   908		VPSRLQ $19, Y6, Y3
   909		VPSLLQ $(64-19), Y6, Y1
   910		VPOR   Y1, Y3, Y3
   911		VPXOR  Y3, Y8, Y8
   912		VPSRLQ $61, Y6, Y3
   913		VPSLLQ $(64-61), Y6, Y1
   914		VPOR   Y1, Y3, Y3
   915		VPXOR  Y3, Y8, Y8
   916	
   917		VPADDQ Y8, Y0, Y2
   918	
   919		VPBLENDD $0xF0, Y2, Y6, Y6
   920	
   921		MOVQ  R9, DI
   922		RORXQ $41, BX, R13
   923		RORXQ $18, BX, R14
   924		ADDQ  3*8+frame_YFER(SP), DX
   925		ORQ   R11, DI
   926	
   927		MOVQ  CX, R15
   928		RORXQ $34, R9, R12
   929		XORQ  R14, R13
   930		XORQ  R8, R15
   931	
   932		RORXQ $14, BX, R14
   933		ANDQ  BX, R15
   934		ADDQ  DX, AX
   935		ANDQ  R10, DI
   936	
   937		XORQ R14, R13
   938		XORQ R8, R15
   939	
   940		RORXQ $39, R9, R14
   941		ADDQ  R13, R15
   942	
   943		XORQ R12, R14
   944		ADDQ R15, AX
   945	
   946		RORXQ $28, R9, R12
   947	
   948		XORQ R12, R14
   949		MOVQ R9, R12
   950		ANDQ R11, R12
   951		ORQ  R12, DI
   952	
   953		ADDQ R14, DX
   954		ADDQ R15, DX
   955		ADDQ DI, DX
   956	
   957		VPADDQ  3*32(BP), Y7, Y0
   958		VMOVDQU Y0, frame_YFER(SP)
   959		ADDQ    $(4*32), BP
   960	
   961		MY_VPALIGNR(Y0, Y6, Y5, 8)
   962	
   963		VPADDQ Y7, Y0, Y0
   964	
   965		MY_VPALIGNR(Y1, Y4, Y7, 8)
   966	
   967		VPSRLQ $1, Y1, Y2
   968		VPSLLQ $(64-1), Y1, Y3
   969		VPOR   Y2, Y3, Y3
   970	
   971		VPSRLQ $7, Y1, Y8
   972	
   973		MOVQ  DX, DI
   974		RORXQ $41, AX, R13
   975		RORXQ $18, AX, R14
   976		ADDQ  frame_YFER(SP), R8
   977		ORQ   R10, DI
   978		MOVQ  BX, R15
   979		RORXQ $34, DX, R12
   980	
   981		XORQ  R14, R13
   982		XORQ  CX, R15
   983		RORXQ $14, AX, R14
   984	
   985		ANDQ  AX, R15
   986		XORQ  R14, R13
   987		RORXQ $39, DX, R14
   988		ADDQ  R8, R11
   989	
   990		ANDQ  R9, DI
   991		XORQ  R12, R14
   992		RORXQ $28, DX, R12
   993	
   994		XORQ CX, R15
   995		XORQ R12, R14
   996		MOVQ DX, R12
   997		ANDQ R10, R12
   998	
   999		ADDQ R13, R15
  1000		ORQ  R12, DI
  1001		ADDQ R14, R8
  1002	
  1003		ADDQ R15, R11
  1004	
  1005		ADDQ R15, R8
  1006		ADDQ DI, R8
  1007	
  1008		VPSRLQ $8, Y1, Y2
  1009		VPSLLQ $(64-8), Y1, Y1
  1010		VPOR   Y2, Y1, Y1
  1011	
  1012		VPXOR Y8, Y3, Y3
  1013		VPXOR Y1, Y3, Y1
  1014	
  1015		VPADDQ Y1, Y0, Y0
  1016	
  1017		VPERM2F128 $0x0, Y0, Y0, Y7
  1018	
  1019		VPAND MASK_YMM_LO<>(SB), Y0, Y0
  1020	
  1021		VPERM2F128 $0x11, Y6, Y6, Y2
  1022		VPSRLQ     $6, Y2, Y8
  1023	
  1024		MOVQ  R8, DI
  1025		RORXQ $41, R11, R13
  1026		RORXQ $18, R11, R14
  1027		ADDQ  1*8+frame_YFER(SP), CX
  1028		ORQ   R9, DI
  1029	
  1030		MOVQ  AX, R15
  1031		RORXQ $34, R8, R12
  1032		XORQ  R14, R13
  1033		XORQ  BX, R15
  1034	
  1035		RORXQ $14, R11, R14
  1036		XORQ  R14, R13
  1037		RORXQ $39, R8, R14
  1038		ANDQ  R11, R15
  1039		ADDQ  CX, R10
  1040	
  1041		ANDQ DX, DI
  1042		XORQ R12, R14
  1043	
  1044		RORXQ $28, R8, R12
  1045		XORQ  BX, R15
  1046	
  1047		XORQ R12, R14
  1048		MOVQ R8, R12
  1049		ANDQ R9, R12
  1050		ADDQ R13, R15
  1051	
  1052		ORQ  R12, DI
  1053		ADDQ R14, CX
  1054	
  1055		ADDQ R15, R10
  1056		ADDQ R15, CX
  1057		ADDQ DI, CX
  1058	
  1059		VPSRLQ $19, Y2, Y3
  1060		VPSLLQ $(64-19), Y2, Y1
  1061		VPOR   Y1, Y3, Y3
  1062		VPXOR  Y3, Y8, Y8
  1063		VPSRLQ $61, Y2, Y3
  1064		VPSLLQ $(64-61), Y2, Y1
  1065		VPOR   Y1, Y3, Y3
  1066		VPXOR  Y3, Y8, Y8
  1067	
  1068		VPADDQ Y8, Y7, Y7
  1069	
  1070		VPSRLQ $6, Y7, Y8
  1071	
  1072		MOVQ  CX, DI
  1073		RORXQ $41, R10, R13
  1074		ADDQ  2*8+frame_YFER(SP), BX
  1075	
  1076		RORXQ $18, R10, R14
  1077		ORQ   DX, DI
  1078		MOVQ  R11, R15
  1079		XORQ  AX, R15
  1080	
  1081		RORXQ $34, CX, R12
  1082		XORQ  R14, R13
  1083		ANDQ  R10, R15
  1084	
  1085		RORXQ $14, R10, R14
  1086		ADDQ  BX, R9
  1087		ANDQ  R8, DI
  1088	
  1089		XORQ  R14, R13
  1090		RORXQ $39, CX, R14
  1091		XORQ  AX, R15
  1092	
  1093		XORQ  R12, R14
  1094		RORXQ $28, CX, R12
  1095	
  1096		XORQ R12, R14
  1097		MOVQ CX, R12
  1098		ANDQ DX, R12
  1099		ADDQ R13, R15
  1100	
  1101		ORQ  R12, DI
  1102		ADDQ R14, BX
  1103		ADDQ R15, R9
  1104		ADDQ R15, BX
  1105	
  1106		ADDQ DI, BX
  1107	
  1108		VPSRLQ $19, Y7, Y3
  1109		VPSLLQ $(64-19), Y7, Y1
  1110		VPOR   Y1, Y3, Y3
  1111		VPXOR  Y3, Y8, Y8
  1112		VPSRLQ $61, Y7, Y3
  1113		VPSLLQ $(64-61), Y7, Y1
  1114		VPOR   Y1, Y3, Y3
  1115		VPXOR  Y3, Y8, Y8
  1116	
  1117		VPADDQ Y8, Y0, Y2
  1118	
  1119		VPBLENDD $0xF0, Y2, Y7, Y7
  1120	
  1121		MOVQ  BX, DI
  1122		RORXQ $41, R9, R13
  1123		RORXQ $18, R9, R14
  1124		ADDQ  3*8+frame_YFER(SP), AX
  1125		ORQ   R8, DI
  1126	
  1127		MOVQ  R10, R15
  1128		RORXQ $34, BX, R12
  1129		XORQ  R14, R13
  1130		XORQ  R11, R15
  1131	
  1132		RORXQ $14, R9, R14
  1133		ANDQ  R9, R15
  1134		ADDQ  AX, DX
  1135		ANDQ  CX, DI
  1136	
  1137		XORQ R14, R13
  1138		XORQ R11, R15
  1139	
  1140		RORXQ $39, BX, R14
  1141		ADDQ  R13, R15
  1142	
  1143		XORQ R12, R14
  1144		ADDQ R15, DX
  1145	
  1146		RORXQ $28, BX, R12
  1147	
  1148		XORQ R12, R14
  1149		MOVQ BX, R12
  1150		ANDQ R8, R12
  1151		ORQ  R12, DI
  1152	
  1153		ADDQ R14, AX
  1154		ADDQ R15, AX
  1155		ADDQ DI, AX
  1156	
  1157		SUBQ $1, frame_SRND(SP)
  1158		JNE  loop1
  1159	
  1160		MOVQ $2, frame_SRND(SP)
  1161	
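	// loop2: the final 16 rounds. The message schedule is complete, so each
	// of the two passes only adds constants to the remaining scheduled words
	// and runs eight rounds.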
  1162	loop2:
  1163		VPADDQ  (BP), Y4, Y0
  1164		VMOVDQU Y0, frame_YFER(SP)
  1165	
  1166		MOVQ  R9, R15
  1167		RORXQ $41, DX, R13
  1168		RORXQ $18, DX, R14
  1169		XORQ  R10, R15
  1170	
  1171		XORQ  R14, R13
  1172		RORXQ $14, DX, R14
  1173		ANDQ  DX, R15
  1174	
  1175		XORQ  R14, R13
  1176		RORXQ $34, AX, R12
  1177		XORQ  R10, R15
  1178		RORXQ $39, AX, R14
  1179		MOVQ  AX, DI
  1180	
  1181		XORQ  R12, R14
  1182		RORXQ $28, AX, R12
  1183		ADDQ  frame_YFER(SP), R11
  1184		ORQ   CX, DI
  1185	
  1186		XORQ R12, R14
  1187		MOVQ AX, R12
  1188		ANDQ BX, DI
  1189		ANDQ CX, R12
  1190		ADDQ R13, R15
  1191	
  1192		ADDQ R11, R8
  1193		ORQ  R12, DI
  1194		ADDQ R14, R11
  1195	
  1196		ADDQ R15, R8
  1197	
  1198		ADDQ  R15, R11
  1199		MOVQ  DX, R15
  1200		RORXQ $41, R8, R13
  1201		RORXQ $18, R8, R14
  1202		XORQ  R9, R15
  1203	
  1204		XORQ  R14, R13
  1205		RORXQ $14, R8, R14
  1206		ANDQ  R8, R15
  1207		ADDQ  DI, R11
  1208	
  1209		XORQ  R14, R13
  1210		RORXQ $34, R11, R12
  1211		XORQ  R9, R15
  1212		RORXQ $39, R11, R14
  1213		MOVQ  R11, DI
  1214	
  1215		XORQ  R12, R14
  1216		RORXQ $28, R11, R12
  1217		ADDQ  8*1+frame_YFER(SP), R10
  1218		ORQ   BX, DI
  1219	
  1220		XORQ R12, R14
  1221		MOVQ R11, R12
  1222		ANDQ AX, DI
  1223		ANDQ BX, R12
  1224		ADDQ R13, R15
  1225	
  1226		ADDQ R10, CX
  1227		ORQ  R12, DI
  1228		ADDQ R14, R10
  1229	
  1230		ADDQ R15, CX
  1231	
  1232		ADDQ  R15, R10
  1233		MOVQ  R8, R15
  1234		RORXQ $41, CX, R13
  1235		RORXQ $18, CX, R14
  1236		XORQ  DX, R15
  1237	
  1238		XORQ  R14, R13
  1239		RORXQ $14, CX, R14
  1240		ANDQ  CX, R15
  1241		ADDQ  DI, R10
  1242	
  1243		XORQ  R14, R13
  1244		RORXQ $34, R10, R12
  1245		XORQ  DX, R15
  1246		RORXQ $39, R10, R14
  1247		MOVQ  R10, DI
  1248	
  1249		XORQ  R12, R14
  1250		RORXQ $28, R10, R12
  1251		ADDQ  8*2+frame_YFER(SP), R9
  1252		ORQ   AX, DI
  1253	
  1254		XORQ R12, R14
  1255		MOVQ R10, R12
  1256		ANDQ R11, DI
  1257		ANDQ AX, R12
  1258		ADDQ R13, R15
  1259	
  1260		ADDQ R9, BX
  1261		ORQ  R12, DI
  1262		ADDQ R14, R9
  1263	
  1264		ADDQ R15, BX
  1265	
  1266		ADDQ  R15, R9
  1267		MOVQ  CX, R15
  1268		RORXQ $41, BX, R13
  1269		RORXQ $18, BX, R14
  1270		XORQ  R8, R15
  1271	
  1272		XORQ  R14, R13
  1273		RORXQ $14, BX, R14
  1274		ANDQ  BX, R15
  1275		ADDQ  DI, R9
  1276	
  1277		XORQ  R14, R13
  1278		RORXQ $34, R9, R12
  1279		XORQ  R8, R15
  1280		RORXQ $39, R9, R14
  1281		MOVQ  R9, DI
  1282	
  1283		XORQ  R12, R14
  1284		RORXQ $28, R9, R12
  1285		ADDQ  8*3+frame_YFER(SP), DX
  1286		ORQ   R11, DI
  1287	
  1288		XORQ R12, R14
  1289		MOVQ R9, R12
  1290		ANDQ R10, DI
  1291		ANDQ R11, R12
  1292		ADDQ R13, R15
  1293	
  1294		ADDQ DX, AX
  1295		ORQ  R12, DI
  1296		ADDQ R14, DX
  1297	
  1298		ADDQ R15, AX
  1299	
  1300		ADDQ R15, DX
  1301	
  1302		ADDQ DI, DX
  1303	
  1304		VPADDQ  1*32(BP), Y5, Y0
  1305		VMOVDQU Y0, frame_YFER(SP)
  1306		ADDQ    $(2*32), BP
  1307	
  1308		MOVQ  BX, R15
  1309		RORXQ $41, AX, R13
  1310		RORXQ $18, AX, R14
  1311		XORQ  CX, R15
  1312	
  1313		XORQ  R14, R13
  1314		RORXQ $14, AX, R14
  1315		ANDQ  AX, R15
  1316	
  1317		XORQ  R14, R13
  1318		RORXQ $34, DX, R12
  1319		XORQ  CX, R15
  1320		RORXQ $39, DX, R14
  1321		MOVQ  DX, DI
  1322	
  1323		XORQ  R12, R14
  1324		RORXQ $28, DX, R12
  1325		ADDQ  frame_YFER(SP), R8
  1326		ORQ   R10, DI
  1327	
  1328		XORQ R12, R14
  1329		MOVQ DX, R12
  1330		ANDQ R9, DI
  1331		ANDQ R10, R12
  1332		ADDQ R13, R15
  1333	
  1334		ADDQ R8, R11
  1335		ORQ  R12, DI
  1336		ADDQ R14, R8
  1337	
  1338		ADDQ R15, R11
  1339	
  1340		ADDQ  R15, R8
  1341		MOVQ  AX, R15
  1342		RORXQ $41, R11, R13
  1343		RORXQ $18, R11, R14
  1344		XORQ  BX, R15
  1345	
  1346		XORQ  R14, R13
  1347		RORXQ $14, R11, R14
  1348		ANDQ  R11, R15
  1349		ADDQ  DI, R8
  1350	
  1351		XORQ  R14, R13
  1352		RORXQ $34, R8, R12
  1353		XORQ  BX, R15
  1354		RORXQ $39, R8, R14
  1355		MOVQ  R8, DI
  1356	
  1357		XORQ  R12, R14
  1358		RORXQ $28, R8, R12
  1359		ADDQ  8*1+frame_YFER(SP), CX
  1360		ORQ   R9, DI
  1361	
  1362		XORQ R12, R14
  1363		MOVQ R8, R12
  1364		ANDQ DX, DI
  1365		ANDQ R9, R12
  1366		ADDQ R13, R15
  1367	
  1368		ADDQ CX, R10
  1369		ORQ  R12, DI
  1370		ADDQ R14, CX
  1371	
  1372		ADDQ R15, R10
  1373	
  1374		ADDQ  R15, CX
  1375		MOVQ  R11, R15
  1376		RORXQ $41, R10, R13
  1377		RORXQ $18, R10, R14
  1378		XORQ  AX, R15
  1379	
  1380		XORQ  R14, R13
  1381		RORXQ $14, R10, R14
  1382		ANDQ  R10, R15
  1383		ADDQ  DI, CX
  1384	
  1385		XORQ  R14, R13
  1386		RORXQ $34, CX, R12
  1387		XORQ  AX, R15
  1388		RORXQ $39, CX, R14
  1389		MOVQ  CX, DI
  1390	
  1391		XORQ  R12, R14
  1392		RORXQ $28, CX, R12
  1393		ADDQ  8*2+frame_YFER(SP), BX
  1394		ORQ   DX, DI
  1395	
  1396		XORQ R12, R14
  1397		MOVQ CX, R12
  1398		ANDQ R8, DI
  1399		ANDQ DX, R12
  1400		ADDQ R13, R15
  1401	
  1402		ADDQ BX, R9
  1403		ORQ  R12, DI
  1404		ADDQ R14, BX
  1405	
  1406		ADDQ R15, R9
  1407	
  1408		ADDQ  R15, BX
  1409		MOVQ  R10, R15
  1410		RORXQ $41, R9, R13
  1411		RORXQ $18, R9, R14
  1412		XORQ  R11, R15
  1413	
  1414		XORQ  R14, R13
  1415		RORXQ $14, R9, R14
  1416		ANDQ  R9, R15
  1417		ADDQ  DI, BX
  1418	
  1419		XORQ  R14, R13
  1420		RORXQ $34, BX, R12
  1421		XORQ  R11, R15
  1422		RORXQ $39, BX, R14
  1423		MOVQ  BX, DI
  1424	
  1425		XORQ  R12, R14
  1426		RORXQ $28, BX, R12
  1427		ADDQ  8*3+frame_YFER(SP), AX
  1428		ORQ   R8, DI
  1429	
  1430		XORQ R12, R14
  1431		MOVQ BX, R12
  1432		ANDQ CX, DI
  1433		ANDQ R8, R12
  1434		ADDQ R13, R15
  1435	
  1436		ADDQ AX, DX
  1437		ORQ  R12, DI
  1438		ADDQ R14, AX
  1439	
  1440		ADDQ R15, DX
  1441	
  1442		ADDQ R15, AX
  1443	
  1444		ADDQ DI, AX
  1445	
  1446		VMOVDQU Y6, Y4
  1447		VMOVDQU Y7, Y5
  1448	
  1449		SUBQ $1, frame_SRND(SP)
  1450		JNE  loop2
  1451	
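	// Add this block's working variables back into the digest state.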
  1452		addm(8*0(SI),AX)
  1453		addm(8*1(SI),BX)
  1454		addm(8*2(SI),CX)
  1455		addm(8*3(SI),R8)
  1456		addm(8*4(SI),DX)
  1457		addm(8*5(SI),R9)
  1458		addm(8*6(SI),R10)
  1459		addm(8*7(SI),R11)
  1460	
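	// Advance to the next 128-byte block and loop while input remains.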
  1461		MOVQ frame_INP(SP), DI
  1462		ADDQ $128, DI
  1463		CMPQ DI, frame_INPEND(SP)
  1464		JNE  loop0
  1465	
  1466	done_hash:
  1467		VZEROUPPER
  1468		RET
