...

Text file src/pkg/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
     6	
     7	// +build go1.7,amd64,!gccgo,!appengine
     8	
     9	#include "textflag.h"
    10	// General register allocation
    11	#define oup DI
    12	#define inp SI
    13	#define inl BX
    14	#define adp CX // free to reuse, after we hash the additional data
    15	#define keyp R8 // free to reuse, when we copy the key to stack
    16	#define itr2 R9 // general iterator
    17	#define itr1 CX // general iterator
    18	#define acc0 R10
    19	#define acc1 R11
    20	#define acc2 R12
    21	#define t0 R13
    22	#define t1 R14
    23	#define t2 R15
    24	#define t3 R8
    25	// Register and stack allocation for the SSE code
    26	#define rStore (0*16)(BP)
    27	#define sStore (1*16)(BP)
    28	#define state1Store (2*16)(BP)
    29	#define state2Store (3*16)(BP)
    30	#define tmpStore (4*16)(BP)
    31	#define ctr0Store (5*16)(BP)
    32	#define ctr1Store (6*16)(BP)
    33	#define ctr2Store (7*16)(BP)
    34	#define ctr3Store (8*16)(BP)
    35	#define A0 X0
    36	#define A1 X1
    37	#define A2 X2
    38	#define B0 X3
    39	#define B1 X4
    40	#define B2 X5
    41	#define C0 X6
    42	#define C1 X7
    43	#define C2 X8
    44	#define D0 X9
    45	#define D1 X10
    46	#define D2 X11
    47	#define T0 X12
    48	#define T1 X13
    49	#define T2 X14
    50	#define T3 X15
    51	#define A3 T0
    52	#define B3 T1
    53	#define C3 T2
    54	#define D3 T3
    55	// Register and stack allocation for the AVX2 code
    56	#define rsStoreAVX2 (0*32)(BP)
    57	#define state1StoreAVX2 (1*32)(BP)
    58	#define state2StoreAVX2 (2*32)(BP)
    59	#define ctr0StoreAVX2 (3*32)(BP)
    60	#define ctr1StoreAVX2 (4*32)(BP)
    61	#define ctr2StoreAVX2 (5*32)(BP)
    62	#define ctr3StoreAVX2 (6*32)(BP)
    63	#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
    64	#define AA0 Y0
    65	#define AA1 Y5
    66	#define AA2 Y6
    67	#define AA3 Y7
    68	#define BB0 Y14
    69	#define BB1 Y9
    70	#define BB2 Y10
    71	#define BB3 Y11
    72	#define CC0 Y12
    73	#define CC1 Y13
    74	#define CC2 Y8
    75	#define CC3 Y15
    76	#define DD0 Y4
    77	#define DD1 Y1
    78	#define DD2 Y2
    79	#define DD3 Y3
    80	#define TT0 DD3
    81	#define TT1 AA3
    82	#define TT2 BB3
    83	#define TT3 CC3
    84	// ChaCha20 constants
    85	DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
    86	DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
    87	DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
    88	DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
    89	DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
    90	DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
    91	DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
    92	DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
    93	// Shuffle mask to rotate each 32-bit word left by 16, used with PSHUFB
    94	DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
    95	DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
    96	DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
    97	DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
    98	// Shuffle mask to rotate each 32-bit word left by 8, used with PSHUFB
    99	DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
   100	DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
   101	DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
   102	DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
   103	
   104	DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
   105	DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
   106	DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
   107	DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
   108	
   109	DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
   110	DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
   111	DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
   112	DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
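      	// avx2InitMask adds 1 to the counter word of the upper 128-bit lane, so after a
      	// broadcast each YMM register covers two consecutive ChaCha20 blocks; avx2IncMask
      	// then advances the counters in both lanes by 2 blocks at a time.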
   113	// Poly1305 key clamp
   114	DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
   115	DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
   116	DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   117	DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
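      	// The low 16 bytes clamp r as Poly1305 requires (clear the top 4 bits of each
      	// 32-bit word and the low 2 bits of the upper three words); the high 16 bytes are
      	// all ones, so the s half of the key passes through unchanged.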
   118	
   119	DATA ·sseIncMask<>+0x00(SB)/8, $0x1
   120	DATA ·sseIncMask<>+0x08(SB)/8, $0x0
   121	// To load/store the last < 16 bytes in a buffer
   122	DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
   123	DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
   124	DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
   125	DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
   126	DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
   127	DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
   128	DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
   129	DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
   130	DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
   131	DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
   132	DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
   133	DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
   134	DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
   135	DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
   136	DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
   137	DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
   138	DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
   139	DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
   140	DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
   141	DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
   142	DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
   143	DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
   144	DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
   145	DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
   146	DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
   147	DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
   148	DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
   149	DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
   150	DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
   151	DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
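      	// Entry i of this table (16 bytes each, starting at i = 0) keeps the low i+1 bytes
      	// of an XMM lane; openSSETail16 indexes it with (length-1)*16 to mask a short tail.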
   152	
   153	GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
   154	GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
   155	GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
   156	GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
   157	GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
   158	GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
   159	GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
   160	GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
   161	// No PALIGNR in Go ASM yet (but VPALIGNR is present).
   162	#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
   163	#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
   164	#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
   165	#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
   166	#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
   167	#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
   168	#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
   169	#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
   170	#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
   171	#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
   172	#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
   173	#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
   174	#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
   175	#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
   176	#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
   177	#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
   178	#define shiftC0Right shiftC0Left
   179	#define shiftC1Right shiftC1Left
   180	#define shiftC2Right shiftC2Left
   181	#define shiftC3Right shiftC3Left
   182	#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
   183	#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
   184	#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
   185	#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
   186	// Helper macros: ChaCha20 quarter round (SSE and AVX2) and Poly1305 add/multiply
   187	#define chachaQR(A, B, C, D, T) \
   188		PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
   189		PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
   190		PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
   191		PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
   192	
   193	#define chachaQR_AVX2(A, B, C, D, T) \
   194		VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
   195		VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
   196		VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
   197		VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
   198	
   199	#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
   200	#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
   201	#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
   202	#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
   203	#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
   204	
   205	#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
   206	#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
   207	#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
   208	
   209	#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
   210	#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
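      	// Poly1305 state: the 130-bit accumulator lives in acc0:acc1:acc2 and the clamped r
      	// in rStore, i.e. (0*8)(BP) and (1*8)(BP). polyAdd folds a 16-byte block plus its
      	// 2^128 pad bit into the accumulator; polyMul/polyMulAVX2 multiply by r and reduce
      	// modulo 2^130 - 5 in polyMulReduceStage, using 2^130 = 5 (mod p).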
   211	// ----------------------------------------------------------------------------
   212	TEXT polyHashADInternal<>(SB), NOSPLIT, $0
   213		// adp points to beginning of additional data
   214		// itr2 holds ad length
   215		XORQ acc0, acc0
   216		XORQ acc1, acc1
   217		XORQ acc2, acc2
   218		CMPQ itr2, $13
   219		JNE  hashADLoop
   220	
   221	openFastTLSAD:
   222		// Special treatment for the TLS case of 13 bytes
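      	// The two overlapping 8-byte loads cover bytes 0-7 and 5-12; the shift keeps only
      	// bytes 8-12 in acc1 (the rest of the block is zero padding), and acc2 = 1 supplies
      	// the 2^128 pad bit Poly1305 adds to every 16-byte block.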
   223		MOVQ (adp), acc0
   224		MOVQ 5(adp), acc1
   225		SHRQ $24, acc1
   226		MOVQ $1, acc2
   227		polyMul
   228		RET
   229	
   230	hashADLoop:
   231		// Hash in 16 byte chunks
   232		CMPQ itr2, $16
   233		JB   hashADTail
   234		polyAdd(0(adp))
   235		LEAQ (1*16)(adp), adp
   236		SUBQ $16, itr2
   237		polyMul
   238		JMP  hashADLoop
   239	
   240	hashADTail:
   241		CMPQ itr2, $0
   242		JE   hashADDone
   243	
   244		// Hash last < 16 byte tail
   245		XORQ t0, t0
   246		XORQ t1, t1
   247		XORQ t2, t2
   248		ADDQ itr2, adp
   249	
   250	hashADTailLoop:
   251		SHLQ $8, t0, t1
   252		SHLQ $8, t0
   253		MOVB -1(adp), t2
   254		XORQ t2, t0
   255		DECQ adp
   256		DECQ itr2
   257		JNE  hashADTailLoop
   258	
   259	hashADTailFinish:
   260		ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   261		polyMul
   262	
   263		// Finished AD
   264	hashADDone:
   265		RET
   266	
   267	// ----------------------------------------------------------------------------
   268	// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
   269	TEXT ·chacha20Poly1305Open(SB), 0, $288-97
   270		// For aligned stack access
   271		MOVQ SP, BP
   272		ADDQ $32, BP
   273		ANDQ $-32, BP
   274		MOVQ dst+0(FP), oup
   275		MOVQ key+24(FP), keyp
   276		MOVQ src+48(FP), inp
   277		MOVQ src_len+56(FP), inl
   278		MOVQ ad+72(FP), adp
   279	
   280		// Check for AVX2 support
   281		CMPB ·useAVX2(SB), $1
   282		JE   chacha20Poly1305Open_AVX2
   283	
   284	// Special optimization for very short buffers
   285		CMPQ inl, $128
   286		JBE  openSSE128 // About 16% faster
   287	
   288		// For long buffers, prepare the poly key first
   289		MOVOU ·chacha20Constants<>(SB), A0
   290		MOVOU (1*16)(keyp), B0
   291		MOVOU (2*16)(keyp), C0
   292		MOVOU (3*16)(keyp), D0
   293		MOVO  D0, T1
   294	
   295		// Store state on stack for future use
   296		MOVO B0, state1Store
   297		MOVO C0, state2Store
   298		MOVO D0, ctr3Store
   299		MOVQ $10, itr2
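      	// 10 iterations, each a column round plus a diagonal round (via the shifts below),
      	// give the standard 20 ChaCha20 rounds; only block 0 is computed here, since just
      	// its first 32 bytes of keystream are needed for the Poly1305 key.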
   300	
   301	openSSEPreparePolyKey:
   302		chachaQR(A0, B0, C0, D0, T0)
   303		shiftB0Left;  shiftC0Left; shiftD0Left
   304		chachaQR(A0, B0, C0, D0, T0)
   305		shiftB0Right; shiftC0Right; shiftD0Right
   306		DECQ          itr2
   307		JNE           openSSEPreparePolyKey
   308	
   309		// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   310		PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
   311	
   312		// Clamp and store the key
   313		PAND ·polyClampMask<>(SB), A0
   314		MOVO A0, rStore; MOVO B0, sStore
   315	
   316		// Hash AAD
   317		MOVQ ad_len+80(FP), itr2
   318		CALL polyHashADInternal<>(SB)
   319	
   320	openSSEMainLoop:
   321		CMPQ inl, $256
   322		JB   openSSEMainLoopDone
   323	
   324		// Load state, increment counter blocks
   325		MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   326		MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   327		MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   328		MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   329	
   330		// Store counters
   331		MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   332	
   333		// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
   334		MOVQ $4, itr1
   335		MOVQ inp, itr2
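      	// itr1 counts double-round passes and itr2 walks the ciphertext being hashed, so the
      	// hashing of these 256 bytes is interleaved with generating the keystream that decrypts them.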
   336	
   337	openSSEInternalLoop:
   338		MOVO          C3, tmpStore
   339		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   340		MOVO          tmpStore, C3
   341		MOVO          C1, tmpStore
   342		chachaQR(A3, B3, C3, D3, C1)
   343		MOVO          tmpStore, C1
   344		polyAdd(0(itr2))
   345		shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   346		shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   347		shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   348		polyMulStage1
   349		polyMulStage2
   350		LEAQ          (2*8)(itr2), itr2
   351		MOVO          C3, tmpStore
   352		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   353		MOVO          tmpStore, C3
   354		MOVO          C1, tmpStore
   355		polyMulStage3
   356		chachaQR(A3, B3, C3, D3, C1)
   357		MOVO          tmpStore, C1
   358		polyMulReduceStage
   359		shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   360		shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   361		shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   362		DECQ          itr1
   363		JGE           openSSEInternalLoop
   364	
   365		polyAdd(0(itr2))
   366		polyMul
   367		LEAQ (2*8)(itr2), itr2
   368	
   369		CMPQ itr1, $-6
   370		JG   openSSEInternalLoop
   371	
   372		// Add in the state
   373		PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   374		PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   375		PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   376		PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   377	
   378		// Load - xor - store
   379		MOVO  D3, tmpStore
   380		MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
   381		MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
   382		MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
   383		MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
   384		MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
   385		MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
   386		MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
   387		MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
   388		MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
   389		MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
   390		MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
   391		MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
   392		MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
   393		MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
   394		MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
   395		MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
   396		LEAQ  256(inp), inp
   397		LEAQ  256(oup), oup
   398		SUBQ  $256, inl
   399		JMP   openSSEMainLoop
   400	
   401	openSSEMainLoopDone:
   402		// Handle the various tail sizes efficiently
   403		TESTQ inl, inl
   404		JE    openSSEFinalize
   405		CMPQ  inl, $64
   406		JBE   openSSETail64
   407		CMPQ  inl, $128
   408		JBE   openSSETail128
   409		CMPQ  inl, $192
   410		JBE   openSSETail192
   411		JMP   openSSETail256
   412	
   413	openSSEFinalize:
   414		// Hash in the PT, AAD lengths
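      	// (per the AEAD construction, the final Poly1305 block is len(AAD) and len(ciphertext)
      	// as two little-endian 64-bit values)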
   415		ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
   416		polyMul
   417	
   418		// Final reduce
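      	// Compute acc - (2^130 - 5); if the subtraction borrows (acc < p) the CMOVs below
      	// keep the original accumulator, leaving the canonical value mod p.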
   419		MOVQ    acc0, t0
   420		MOVQ    acc1, t1
   421		MOVQ    acc2, t2
   422		SUBQ    $-5, acc0
   423		SBBQ    $-1, acc1
   424		SBBQ    $3, acc2
   425		CMOVQCS t0, acc0
   426		CMOVQCS t1, acc1
   427		CMOVQCS t2, acc2
   428	
   429		// Add in the "s" part of the key
   430		ADDQ 0+sStore, acc0
   431		ADCQ 8+sStore, acc1
   432	
   433		// Finally, constant time compare to the tag at the end of the message
   434		XORQ    AX, AX
   435		MOVQ    $1, DX
   436		XORQ    (0*8)(inp), acc0
   437		XORQ    (1*8)(inp), acc1
   438		ORQ     acc1, acc0
   439		CMOVQEQ DX, AX
   440	
   441		// Return true iff tags are equal
   442		MOVB AX, ret+96(FP)
   443		RET
   444	
   445	// ----------------------------------------------------------------------------
   446	// Special optimization for buffers smaller than 129 bytes
   447	openSSE128:
   448	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
   449		MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
   450		MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   451		MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   452		MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
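      	// T1/T2/T3 keep copies of the initial B/C/D state so it can be added back after the
      	// rounds; T3 holds block 1's counter and is bumped once more below for block 2.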
   453		MOVQ  $10, itr2
   454	
   455	openSSE128InnerCipherLoop:
   456		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   457		shiftB0Left;  shiftB1Left; shiftB2Left
   458		shiftC0Left;  shiftC1Left; shiftC2Left
   459		shiftD0Left;  shiftD1Left; shiftD2Left
   460		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   461		shiftB0Right; shiftB1Right; shiftB2Right
   462		shiftC0Right; shiftC1Right; shiftC2Right
   463		shiftD0Right; shiftD1Right; shiftD2Right
   464		DECQ          itr2
   465		JNE           openSSE128InnerCipherLoop
   466	
   467		// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
   468		PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   469		PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
   470		PADDL T2, C1; PADDL T2, C2
   471		PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
   472	
   473		// Clamp and store the key
   474		PAND  ·polyClampMask<>(SB), A0
   475		MOVOU A0, rStore; MOVOU B0, sStore
   476	
   477		// Hash
   478		MOVQ ad_len+80(FP), itr2
   479		CALL polyHashADInternal<>(SB)
   480	
   481	openSSE128Open:
   482		CMPQ inl, $16
   483		JB   openSSETail16
   484		SUBQ $16, inl
   485	
   486		// Load for hashing
   487		polyAdd(0(inp))
   488	
   489		// Load for decryption
   490		MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
   491		LEAQ  (1*16)(inp), inp
   492		LEAQ  (1*16)(oup), oup
   493		polyMul
   494	
   495		// Shift the stream "left"
   496		MOVO B1, A1
   497		MOVO C1, B1
   498		MOVO D1, C1
   499		MOVO A2, D1
   500		MOVO B2, A2
   501		MOVO C2, B2
   502		MOVO D2, C2
   503		JMP  openSSE128Open
   504	
   505	openSSETail16:
   506		TESTQ inl, inl
   507		JE    openSSEFinalize
   508	
   509		// We can safely load the CT from the end, because it is padded with the MAC
   510		MOVQ   inl, itr2
   511		SHLQ   $4, itr2
   512		LEAQ   ·andMask<>(SB), t0
   513		MOVOU  (inp), T0
   514		ADDQ   inl, inp
   515		PAND   -16(t0)(itr2*1), T0
   516		MOVO   T0, 0+tmpStore
   517		MOVQ   T0, t0
   518		MOVQ   8+tmpStore, t1
   519		PXOR   A1, T0
   520	
   521		// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
   522	openSSETail16Store:
   523		MOVQ T0, t3
   524		MOVB t3, (oup)
   525		PSRLDQ $1, T0
   526		INCQ   oup
   527		DECQ   inl
   528		JNE    openSSETail16Store
   529		ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
   530		polyMul
   531		JMP    openSSEFinalize
   532	
   533	// ----------------------------------------------------------------------------
   534	// Special optimization for the last 64 bytes of ciphertext
   535	openSSETail64:
   536		// Need to decrypt up to 64 bytes - prepare single block
   537		MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
   538		XORQ itr2, itr2
   539		MOVQ inl, itr1
   540		CMPQ itr1, $16
   541		JB   openSSETail64LoopB
   542	
   543	openSSETail64LoopA:
   544		// Perform ChaCha rounds, while hashing the remaining input
   545		polyAdd(0(inp)(itr2*1))
   546		polyMul
   547		SUBQ $16, itr1
   548	
   549	openSSETail64LoopB:
   550		ADDQ          $16, itr2
   551		chachaQR(A0, B0, C0, D0, T0)
   552		shiftB0Left;  shiftC0Left; shiftD0Left
   553		chachaQR(A0, B0, C0, D0, T0)
   554		shiftB0Right; shiftC0Right; shiftD0Right
   555	
   556		CMPQ itr1, $16
   557		JAE  openSSETail64LoopA
   558	
   559		CMPQ itr2, $160
   560		JNE  openSSETail64LoopB
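      	// itr2 advances 16 per double round, so running to 160 always completes the 10
      	// double rounds (20 rounds) even when fewer than 160 bytes remain to be hashed.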
   561	
   562		PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
   563	
   564	openSSETail64DecLoop:
   565		CMPQ  inl, $16
   566		JB    openSSETail64DecLoopDone
   567		SUBQ  $16, inl
   568		MOVOU (inp), T0
   569		PXOR  T0, A0
   570		MOVOU A0, (oup)
   571		LEAQ  16(inp), inp
   572		LEAQ  16(oup), oup
   573		MOVO  B0, A0
   574		MOVO  C0, B0
   575		MOVO  D0, C0
   576		JMP   openSSETail64DecLoop
   577	
   578	openSSETail64DecLoopDone:
   579		MOVO A0, A1
   580		JMP  openSSETail16
   581	
   582	// ----------------------------------------------------------------------------
   583	// Special optimization for the last 128 bytes of ciphertext
   584	openSSETail128:
   585		// Need to decrypt up to 128 bytes - prepare two blocks
   586		MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
   587		MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
   588		XORQ itr2, itr2
   589		MOVQ inl, itr1
   590		ANDQ $-16, itr1
   591	
   592	openSSETail128LoopA:
   593		// Perform ChaCha rounds, while hashing the remaining input
   594		polyAdd(0(inp)(itr2*1))
   595		polyMul
   596	
   597	openSSETail128LoopB:
   598		ADDQ          $16, itr2
   599		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   600		shiftB0Left;  shiftC0Left; shiftD0Left
   601		shiftB1Left;  shiftC1Left; shiftD1Left
   602		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
   603		shiftB0Right; shiftC0Right; shiftD0Right
   604		shiftB1Right; shiftC1Right; shiftD1Right
   605	
   606		CMPQ itr2, itr1
   607		JB   openSSETail128LoopA
   608	
   609		CMPQ itr2, $160
   610		JNE  openSSETail128LoopB
   611	
   612		PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
   613		PADDL state1Store, B0; PADDL state1Store, B1
   614		PADDL state2Store, C0; PADDL state2Store, C1
   615		PADDL ctr1Store, D0; PADDL ctr0Store, D1
   616	
   617		MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   618		PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   619		MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
   620	
   621		SUBQ $64, inl
   622		LEAQ 64(inp), inp
   623		LEAQ 64(oup), oup
   624		JMP  openSSETail64DecLoop
   625	
   626	// ----------------------------------------------------------------------------
   627	// Special optimization for the last 192 bytes of ciphertext
   628	openSSETail192:
   629		// Need to decrypt up to 192 bytes - prepare three blocks
   630		MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
   631		MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
   632		MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
   633	
   634		MOVQ    inl, itr1
   635		MOVQ    $160, itr2
   636		CMPQ    itr1, $160
   637		CMOVQGT itr2, itr1
   638		ANDQ    $-16, itr1
   639		XORQ    itr2, itr2
   640	
   641	openSSETail192LoopA:
   642		// Perform ChaCha rounds, while hashing the remaining input
   643		polyAdd(0(inp)(itr2*1))
   644		polyMul
   645	
   646	openSSETail192LoopB:
   647		ADDQ         $16, itr2
   648		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   649		shiftB0Left; shiftC0Left; shiftD0Left
   650		shiftB1Left; shiftC1Left; shiftD1Left
   651		shiftB2Left; shiftC2Left; shiftD2Left
   652	
   653		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
   654		shiftB0Right; shiftC0Right; shiftD0Right
   655		shiftB1Right; shiftC1Right; shiftD1Right
   656		shiftB2Right; shiftC2Right; shiftD2Right
   657	
   658		CMPQ itr2, itr1
   659	JB   openSSETail192LoopA
   660	
   661		CMPQ itr2, $160
   662	JNE  openSSETail192LoopB
   663	
   664		CMPQ inl, $176
   665	JB   openSSETail192Store
   666	
   667		polyAdd(160(inp))
   668		polyMul
   669	
   670		CMPQ inl, $192
   671	JB   openSSETail192Store
   672	
   673		polyAdd(176(inp))
   674		polyMul
   675	
   676	openSSETail192Store:
   677		PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
   678		PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
   679		PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
   680		PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
   681	
   682		MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
   683		PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
   684		MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
   685	
   686		MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
   687		PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
   688		MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   689	
   690		SUBQ $128, inl
   691		LEAQ 128(inp), inp
   692		LEAQ 128(oup), oup
   693		JMP  openSSETail64DecLoop
   694	
   695	// ----------------------------------------------------------------------------
   696	// Special optimization for the last 256 bytes of ciphertext
   697	openSSETail256:
   698		// Need to decrypt up to 256 bytes - prepare four blocks
   699		MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
   700		MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
   701		MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
   702		MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
   703	
   704		// Store counters
   705		MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
   706		XORQ itr2, itr2
   707	
   708	openSSETail256Loop:
   709	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
   710		polyAdd(0(inp)(itr2*1))
   711		MOVO          C3, tmpStore
   712		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   713		MOVO          tmpStore, C3
   714		MOVO          C1, tmpStore
   715		chachaQR(A3, B3, C3, D3, C1)
   716		MOVO          tmpStore, C1
   717		shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
   718		shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
   719		shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
   720		polyMulStage1
   721		polyMulStage2
   722		MOVO          C3, tmpStore
   723		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
   724		MOVO          tmpStore, C3
   725		MOVO          C1, tmpStore
   726		chachaQR(A3, B3, C3, D3, C1)
   727		MOVO          tmpStore, C1
   728		polyMulStage3
   729		polyMulReduceStage
   730		shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
   731		shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
   732		shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
   733		ADDQ          $2*8, itr2
   734		CMPQ          itr2, $160
   735		JB            openSSETail256Loop
   736		MOVQ          inl, itr1
   737		ANDQ          $-16, itr1
   738	
   739	openSSETail256HashLoop:
   740		polyAdd(0(inp)(itr2*1))
   741		polyMul
   742		ADDQ $2*8, itr2
   743		CMPQ itr2, itr1
   744		JB   openSSETail256HashLoop
   745	
   746		// Add in the state
   747		PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
   748		PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
   749		PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
   750		PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
   751		MOVO  D3, tmpStore
   752	
   753		// Load - xor - store
   754		MOVOU (0*16)(inp), D3; PXOR D3, A0
   755		MOVOU (1*16)(inp), D3; PXOR D3, B0
   756		MOVOU (2*16)(inp), D3; PXOR D3, C0
   757		MOVOU (3*16)(inp), D3; PXOR D3, D0
   758		MOVOU A0, (0*16)(oup)
   759		MOVOU B0, (1*16)(oup)
   760		MOVOU C0, (2*16)(oup)
   761		MOVOU D0, (3*16)(oup)
   762		MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
   763		PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
   764		MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
   765		MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
   766		PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
   767		MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
   768		LEAQ  192(inp), inp
   769		LEAQ  192(oup), oup
   770		SUBQ  $192, inl
   771		MOVO  A3, A0
   772		MOVO  B3, B0
   773		MOVO  C3, C0
   774		MOVO  tmpStore, D0
   775	
   776		JMP openSSETail64DecLoop
   777	
   778	// ----------------------------------------------------------------------------
   779	// ------------------------- AVX2 Code ----------------------------------------
   780	chacha20Poly1305Open_AVX2:
   781		VZEROUPPER
   782		VMOVDQU ·chacha20Constants<>(SB), AA0
   783		BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
   784		BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
   785		BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
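      	// The BYTE sequences above hand-encode VBROADCASTI128 loads of the two key halves and
      	// the counter/nonce words into both 128-bit lanes of BB0, CC0 and DD0 (presumably
      	// because the assembler did not support the mnemonic when this was written).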
   786		VPADDD  ·avx2InitMask<>(SB), DD0, DD0
   787	
   788	// Special optimization for very short buffers
   789		CMPQ inl, $192
   790		JBE  openAVX2192
   791		CMPQ inl, $320
   792		JBE  openAVX2320
   793	
   794	// For the general case, prepare the poly key first; as a byproduct we get 64 bytes of cipher stream
   795		VMOVDQA BB0, state1StoreAVX2
   796		VMOVDQA CC0, state2StoreAVX2
   797		VMOVDQA DD0, ctr3StoreAVX2
   798		MOVQ    $10, itr2
   799	
   800	openAVX2PreparePolyKey:
   801		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   802		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
   803		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
   804		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
   805		DECQ     itr2
   806		JNE      openAVX2PreparePolyKey
   807	
   808		VPADDD ·chacha20Constants<>(SB), AA0, AA0
   809		VPADDD state1StoreAVX2, BB0, BB0
   810		VPADDD state2StoreAVX2, CC0, CC0
   811		VPADDD ctr3StoreAVX2, DD0, DD0
   812	
   813		VPERM2I128 $0x02, AA0, BB0, TT0
   814	
   815		// Clamp and store poly key
   816		VPAND   ·polyClampMask<>(SB), TT0, TT0
   817		VMOVDQA TT0, rsStoreAVX2
   818	
   819		// Stream for the first 64 bytes
   820		VPERM2I128 $0x13, AA0, BB0, AA0
   821		VPERM2I128 $0x13, CC0, DD0, BB0
   822	
   823		// Hash AD + first 64 bytes
   824		MOVQ ad_len+80(FP), itr2
   825		CALL polyHashADInternal<>(SB)
   826		XORQ itr1, itr1
   827	
   828	openAVX2InitialHash64:
   829		polyAdd(0(inp)(itr1*1))
   830		polyMulAVX2
   831		ADDQ $16, itr1
   832		CMPQ itr1, $64
   833		JNE  openAVX2InitialHash64
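      	// The 64 bytes hashed above are the ciphertext matching the keystream that fell out
      	// of the poly key computation; they are decrypted just below.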
   834	
   835		// Decrypt the first 64 bytes
   836		VPXOR   (0*32)(inp), AA0, AA0
   837		VPXOR   (1*32)(inp), BB0, BB0
   838		VMOVDQU AA0, (0*32)(oup)
   839		VMOVDQU BB0, (1*32)(oup)
   840		LEAQ    (2*32)(inp), inp
   841		LEAQ    (2*32)(oup), oup
   842		SUBQ    $64, inl
   843	
   844	openAVX2MainLoop:
   845		CMPQ inl, $512
   846		JB   openAVX2MainLoopDone
   847	
   848		// Load state, increment counter blocks, store the incremented counters
   849		VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
   850		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
   851		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
   852		VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
   853		VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
   854		XORQ    itr1, itr1
   855	
   856	openAVX2InternalLoop:
   857	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
   858		// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
   859		polyAdd(0*8(inp)(itr1*1))
   860		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   861		polyMulStage1_AVX2
   862		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   863		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   864		polyMulStage2_AVX2
   865		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   866		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   867		polyMulStage3_AVX2
   868		VMOVDQA  CC3, tmpStoreAVX2
   869		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   870		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   871		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   872		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   873		VMOVDQA  tmpStoreAVX2, CC3
   874		polyMulReduceStage
   875		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   876		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   877		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   878		polyAdd(2*8(inp)(itr1*1))
   879		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   880		polyMulStage1_AVX2
   881		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   882		VMOVDQA  CC3, tmpStoreAVX2
   883		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   884		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   885		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   886		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   887		VMOVDQA  tmpStoreAVX2, CC3
   888		polyMulStage2_AVX2
   889		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
   890		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   891		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
   892		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   893		polyMulStage3_AVX2
   894		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   895		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
   896		polyMulReduceStage
   897		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   898		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   899		polyAdd(4*8(inp)(itr1*1))
   900		LEAQ     (6*8)(itr1), itr1
   901		VMOVDQA  CC3, tmpStoreAVX2
   902		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
   903		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
   904		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
   905		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
   906		VMOVDQA  tmpStoreAVX2, CC3
   907		polyMulStage1_AVX2
   908		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
   909		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
   910		polyMulStage2_AVX2
   911		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
   912		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
   913		polyMulStage3_AVX2
   914		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
   915		VMOVDQA  CC3, tmpStoreAVX2
   916		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
   917		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
   918		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
   919		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
   920		VMOVDQA  tmpStoreAVX2, CC3
   921		polyMulReduceStage
   922		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
   923		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
   924		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
   925		CMPQ     itr1, $480
   926		JNE      openAVX2InternalLoop
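      	// Each pass hashes 3 x 16 = 48 bytes via the three polyAdds and performs one double
      	// round, so stopping at itr1 == 480 gives 10 double rounds and the 480 hashed bytes
      	// mentioned above.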
   927	
   928		VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
   929		VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
   930		VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
   931		VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
   932		VMOVDQA CC3, tmpStoreAVX2
   933	
   934		// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
   935		polyAdd(480(inp))
   936		polyMulAVX2
   937		VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
   938		VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
   939		VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
   940		VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
   941		VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
   942		VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
   943	
   944		// and here
   945		polyAdd(496(inp))
   946		polyMulAVX2
   947		VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
   948		VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
   949		VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
   950		VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
   951		VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
   952		VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
   953		LEAQ       (32*16)(inp), inp
   954		LEAQ       (32*16)(oup), oup
   955		SUBQ       $(32*16), inl
   956		JMP        openAVX2MainLoop
   957	
   958	openAVX2MainLoopDone:
   959		// Handle the various tail sizes efficiently
   960		TESTQ inl, inl
   961		JE    openSSEFinalize
   962		CMPQ  inl, $128
   963		JBE   openAVX2Tail128
   964		CMPQ  inl, $256
   965		JBE   openAVX2Tail256
   966		CMPQ  inl, $384
   967		JBE   openAVX2Tail384
   968		JMP   openAVX2Tail512
   969	
   970	// ----------------------------------------------------------------------------
   971	// Special optimization for buffers smaller than 193 bytes
   972	openAVX2192:
   973		// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
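      	// The four blocks live in two register sets: AA0..DD0 covers two counter values and
      	// AA1..DD1 the next two, for 256 bytes of keystream in total.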
   974		VMOVDQA AA0, AA1
   975		VMOVDQA BB0, BB1
   976		VMOVDQA CC0, CC1
   977		VPADDD  ·avx2IncMask<>(SB), DD0, DD1
   978		VMOVDQA AA0, AA2
   979		VMOVDQA BB0, BB2
   980		VMOVDQA CC0, CC2
   981		VMOVDQA DD0, DD2
   982		VMOVDQA DD1, TT3
   983		MOVQ    $10, itr2
   984	
   985	openAVX2192InnerCipherLoop:
   986		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   987		VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
   988		VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   989		VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
   990		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
   991		VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
   992		VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
   993		VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
   994		DECQ       itr2
   995		JNE        openAVX2192InnerCipherLoop
   996		VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
   997		VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
   998		VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
   999		VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  1000		VPERM2I128 $0x02, AA0, BB0, TT0
  1001	
  1002		// Clamp and store poly key
  1003		VPAND   ·polyClampMask<>(SB), TT0, TT0
  1004		VMOVDQA TT0, rsStoreAVX2
  1005	
  1006		// Stream for up to 192 bytes
  1007		VPERM2I128 $0x13, AA0, BB0, AA0
  1008		VPERM2I128 $0x13, CC0, DD0, BB0
  1009		VPERM2I128 $0x02, AA1, BB1, CC0
  1010		VPERM2I128 $0x02, CC1, DD1, DD0
  1011		VPERM2I128 $0x13, AA1, BB1, AA1
  1012		VPERM2I128 $0x13, CC1, DD1, BB1
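      	// The VPERM2I128 shuffles regroup the 128-bit lanes into sequential 32-byte keystream
      	// chunks, consumed in the order AA0, BB0, CC0, DD0, AA1, BB1 by openAVX2ShortOpenLoop.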
  1013	
  1014	openAVX2ShortOpen:
  1015		// Hash
  1016		MOVQ ad_len+80(FP), itr2
  1017		CALL polyHashADInternal<>(SB)
  1018	
  1019	openAVX2ShortOpenLoop:
  1020		CMPQ inl, $32
  1021		JB   openAVX2ShortTail32
  1022		SUBQ $32, inl
  1023	
  1024		// Load for hashing
  1025		polyAdd(0*8(inp))
  1026		polyMulAVX2
  1027		polyAdd(2*8(inp))
  1028		polyMulAVX2
  1029	
  1030		// Load for decryption
  1031		VPXOR   (inp), AA0, AA0
  1032		VMOVDQU AA0, (oup)
  1033		LEAQ    (1*32)(inp), inp
  1034		LEAQ    (1*32)(oup), oup
  1035	
  1036		// Shift stream left
  1037		VMOVDQA BB0, AA0
  1038		VMOVDQA CC0, BB0
  1039		VMOVDQA DD0, CC0
  1040		VMOVDQA AA1, DD0
  1041		VMOVDQA BB1, AA1
  1042		VMOVDQA CC1, BB1
  1043		VMOVDQA DD1, CC1
  1044		VMOVDQA AA2, DD1
  1045		VMOVDQA BB2, AA2
  1046		JMP     openAVX2ShortOpenLoop
  1047	
  1048	openAVX2ShortTail32:
  1049		CMPQ    inl, $16
  1050		VMOVDQA A0, A1
  1051		JB      openAVX2ShortDone
  1052	
  1053		SUBQ $16, inl
  1054	
  1055		// Load for hashing
  1056		polyAdd(0*8(inp))
  1057		polyMulAVX2
  1058	
  1059		// Load for decryption
  1060		VPXOR      (inp), A0, T0
  1061		VMOVDQU    T0, (oup)
  1062		LEAQ       (1*16)(inp), inp
  1063		LEAQ       (1*16)(oup), oup
  1064		VPERM2I128 $0x11, AA0, AA0, AA0
  1065		VMOVDQA    A0, A1
  1066	
  1067	openAVX2ShortDone:
  1068		VZEROUPPER
  1069		JMP openSSETail16
  1070	
  1071	// ----------------------------------------------------------------------------
  1072	// Special optimization for buffers smaller than 321 bytes
  1073	openAVX2320:
  1074		// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
  1075		VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  1076		VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  1077		VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  1078		MOVQ    $10, itr2
  1079	
  1080	openAVX2320InnerCipherLoop:
  1081		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1082		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1083		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1084		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1085		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1086		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1087		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1088		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1089		DECQ     itr2
  1090		JNE      openAVX2320InnerCipherLoop
  1091	
  1092		VMOVDQA ·chacha20Constants<>(SB), TT0
  1093		VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  1094		VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  1095		VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  1096		VMOVDQA ·avx2IncMask<>(SB), TT0
  1097		VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  1098		VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  1099		VPADDD  TT3, DD2, DD2
  1100	
  1101		// Clamp and store poly key
  1102		VPERM2I128 $0x02, AA0, BB0, TT0
  1103		VPAND      ·polyClampMask<>(SB), TT0, TT0
  1104		VMOVDQA    TT0, rsStoreAVX2
  1105	
  1106		// Stream for up to 320 bytes
  1107		VPERM2I128 $0x13, AA0, BB0, AA0
  1108		VPERM2I128 $0x13, CC0, DD0, BB0
  1109		VPERM2I128 $0x02, AA1, BB1, CC0
  1110		VPERM2I128 $0x02, CC1, DD1, DD0
  1111		VPERM2I128 $0x13, AA1, BB1, AA1
  1112		VPERM2I128 $0x13, CC1, DD1, BB1
  1113		VPERM2I128 $0x02, AA2, BB2, CC1
  1114		VPERM2I128 $0x02, CC2, DD2, DD1
  1115		VPERM2I128 $0x13, AA2, BB2, AA2
  1116		VPERM2I128 $0x13, CC2, DD2, BB2
  1117		JMP        openAVX2ShortOpen
  1118	
  1119	// ----------------------------------------------------------------------------
  1120	// Special optimization for the last 128 bytes of ciphertext
  1121	openAVX2Tail128:
  1122		// Need to decrypt up to 128 bytes - prepare two blocks
  1123		VMOVDQA ·chacha20Constants<>(SB), AA1
  1124		VMOVDQA state1StoreAVX2, BB1
  1125		VMOVDQA state2StoreAVX2, CC1
  1126		VMOVDQA ctr3StoreAVX2, DD1
  1127		VPADDD  ·avx2IncMask<>(SB), DD1, DD1
  1128		VMOVDQA DD1, DD0
  1129	
  1130		XORQ  itr2, itr2
  1131		MOVQ  inl, itr1
  1132		ANDQ  $-16, itr1
  1133		TESTQ itr1, itr1
  1134		JE    openAVX2Tail128LoopB
  1135	
  1136	openAVX2Tail128LoopA:
  1137		// Perform ChaCha rounds, while hashing the remaining input
  1138		polyAdd(0(inp)(itr2*1))
  1139		polyMulAVX2
  1140	
  1141	openAVX2Tail128LoopB:
  1142		ADDQ     $16, itr2
  1143		chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1144		VPALIGNR $4, BB1, BB1, BB1
  1145		VPALIGNR $8, CC1, CC1, CC1
  1146		VPALIGNR $12, DD1, DD1, DD1
  1147		chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1148		VPALIGNR $12, BB1, BB1, BB1
  1149		VPALIGNR $8, CC1, CC1, CC1
  1150		VPALIGNR $4, DD1, DD1, DD1
  1151		CMPQ     itr2, itr1
  1152		JB       openAVX2Tail128LoopA
  1153		CMPQ     itr2, $160
  1154		JNE      openAVX2Tail128LoopB
  1155	
  1156		VPADDD     ·chacha20Constants<>(SB), AA1, AA1
  1157		VPADDD     state1StoreAVX2, BB1, BB1
  1158		VPADDD     state2StoreAVX2, CC1, CC1
  1159		VPADDD     DD0, DD1, DD1
  1160		VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1161	
  1162	openAVX2TailLoop:
  1163		CMPQ inl, $32
  1164		JB   openAVX2Tail
  1165		SUBQ $32, inl
  1166	
  1167		// Load for decryption
  1168		VPXOR   (inp), AA0, AA0
  1169		VMOVDQU AA0, (oup)
  1170		LEAQ    (1*32)(inp), inp
  1171		LEAQ    (1*32)(oup), oup
  1172		VMOVDQA BB0, AA0
  1173		VMOVDQA CC0, BB0
  1174		VMOVDQA DD0, CC0
  1175		JMP     openAVX2TailLoop
  1176	
  1177	openAVX2Tail:
  1178		CMPQ    inl, $16
  1179		VMOVDQA A0, A1
  1180		JB      openAVX2TailDone
  1181		SUBQ    $16, inl
  1182	
  1183		// Load for decryption
  1184		VPXOR      (inp), A0, T0
  1185		VMOVDQU    T0, (oup)
  1186		LEAQ       (1*16)(inp), inp
  1187		LEAQ       (1*16)(oup), oup
  1188		VPERM2I128 $0x11, AA0, AA0, AA0
  1189		VMOVDQA    A0, A1
  1190	
  1191	openAVX2TailDone:
  1192		VZEROUPPER
  1193		JMP openSSETail16
  1194	
  1195	// ----------------------------------------------------------------------------
  1196	// Special optimization for the last 256 bytes of ciphertext
  1197	openAVX2Tail256:
  1198		// Need to decrypt up to 256 bytes - prepare four blocks
  1199		VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
  1200		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
  1201		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
  1202		VMOVDQA ctr3StoreAVX2, DD0
  1203		VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1204		VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1205		VMOVDQA DD0, TT1
  1206		VMOVDQA DD1, TT2
  1207	
  1208		// Compute the number of iterations that will hash data
  1209		MOVQ    inl, tmpStoreAVX2
  1210		MOVQ    inl, itr1
  1211		SUBQ    $128, itr1
  1212		SHRQ    $4, itr1
  1213		MOVQ    $10, itr2
  1214		CMPQ    itr1, $10
  1215		CMOVQGT itr2, itr1
  1216		MOVQ    inp, inl
  1217		XORQ    itr2, itr2
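      	// itr1 = min(10, (inl-128)/16) is the number of round-loop passes that also hash a
      	// 16-byte block; inl was stashed in tmpStoreAVX2 so the register can serve as the
      	// hashing pointer, and any leftover blocks are hashed in openAVX2Tail256Hash.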
  1218	
  1219	openAVX2Tail256LoopA:
  1220		polyAdd(0(inl))
  1221		polyMulAVX2
  1222		LEAQ 16(inl), inl
  1223	
  1224		// Perform ChaCha rounds, while hashing the remaining input
  1225	openAVX2Tail256LoopB:
  1226		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1227		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  1228		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1229		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  1230		INCQ     itr2
  1231		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  1232		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  1233		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  1234		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  1235		CMPQ     itr2, itr1
  1236		JB       openAVX2Tail256LoopA
  1237	
  1238		CMPQ itr2, $10
  1239		JNE  openAVX2Tail256LoopB
  1240	
  1241		MOVQ inl, itr2
  1242		SUBQ inp, inl
  1243		MOVQ inl, itr1
  1244		MOVQ tmpStoreAVX2, inl
  1245	
  1246		// Hash the remainder of data (if any)
  1247	openAVX2Tail256Hash:
  1248		ADDQ $16, itr1
  1249		CMPQ itr1, inl
  1250		JGT  openAVX2Tail256HashEnd
  1251	polyAdd(0(itr2))
  1252		polyMulAVX2
  1253		LEAQ 16(itr2), itr2
  1254		JMP  openAVX2Tail256Hash
  1255	
  1256	// Store 128 bytes safely, then go to store loop
  1257	openAVX2Tail256HashEnd:
  1258		VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  1259		VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  1260		VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  1261		VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  1262		VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
  1263		VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1264	
  1265		VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
  1266		VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
  1267		LEAQ    (4*32)(inp), inp
  1268		LEAQ    (4*32)(oup), oup
  1269		SUBQ    $4*32, inl
  1270	
  1271		JMP openAVX2TailLoop
  1272	
  1273	// ----------------------------------------------------------------------------
  1274	// Special optimization for the last 384 bytes of ciphertext
  1275	openAVX2Tail384:
  1276		// Need to decrypt up to 384 bytes - prepare six blocks
  1277		VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  1278		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  1279		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  1280		VMOVDQA ctr3StoreAVX2, DD0
  1281		VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  1282		VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  1283		VPADDD  ·avx2IncMask<>(SB), DD1, DD2
  1284		VMOVDQA DD0, ctr0StoreAVX2
  1285		VMOVDQA DD1, ctr1StoreAVX2
  1286		VMOVDQA DD2, ctr2StoreAVX2
  1287	
  1288		// Compute the number of iterations that will hash two blocks of data
  1289		MOVQ    inl, tmpStoreAVX2
  1290		MOVQ    inl, itr1
  1291		SUBQ    $256, itr1
  1292		SHRQ    $4, itr1
  1293		ADDQ    $6, itr1
  1294		MOVQ    $10, itr2
  1295		CMPQ    itr1, $10
  1296		CMOVQGT itr2, itr1
  1297		MOVQ    inp, inl
  1298		XORQ    itr2, itr2
  1299	
  1300		// Perform ChaCha rounds, while hashing the remaining input
  1301	openAVX2Tail384LoopB:
  1302		polyAdd(0(inl))
  1303		polyMulAVX2
  1304		LEAQ 16(inl), inl
  1305	
  1306	openAVX2Tail384LoopA:
  1307		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1308		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  1309		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1310		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  1311		polyAdd(0(inl))
  1312		polyMulAVX2
  1313		LEAQ     16(inl), inl
  1314		INCQ     itr2
  1315		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  1316		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  1317		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  1318		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  1319	
  1320		CMPQ itr2, itr1
  1321		JB   openAVX2Tail384LoopB
  1322	
  1323		CMPQ itr2, $10
  1324		JNE  openAVX2Tail384LoopA
  1325	
  1326		MOVQ inl, itr2
  1327		SUBQ inp, inl
  1328		MOVQ inl, itr1
  1329		MOVQ tmpStoreAVX2, inl
  1330	
  1331	openAVX2Tail384Hash:
  1332		ADDQ $16, itr1
  1333		CMPQ itr1, inl
  1334		JGT  openAVX2Tail384HashEnd
  1335		polyAdd(0(itr2))
  1336		polyMulAVX2
  1337		LEAQ 16(itr2), itr2
  1338		JMP  openAVX2Tail384Hash
  1339	
  1340	// Store 256 bytes safely, then go to store loop
  1341	openAVX2Tail384HashEnd:
  1342		VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  1343		VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  1344		VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  1345		VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
  1346		VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
  1347		VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  1348		VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  1349		VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
  1350		VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  1351		VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  1352		VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1353		LEAQ       (8*32)(inp), inp
  1354		LEAQ       (8*32)(oup), oup
  1355		SUBQ       $8*32, inl
  1356		JMP        openAVX2TailLoop
  1357	
  1358	// ----------------------------------------------------------------------------
  1359	// Special optimization for the last 512 bytes of ciphertext
  1360	openAVX2Tail512:
  1361		VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1362		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  1363		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  1364		VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  1365		VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  1366		XORQ    itr1, itr1
  1367		MOVQ    inp, itr2
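	// itr1 counts the ChaCha double rounds, itr2 walks the ciphertext that is hashed alongside them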
  1368	
  1369	openAVX2Tail512LoopB:
  1370		polyAdd(0(itr2))
  1371		polyMulAVX2
  1372		LEAQ (2*8)(itr2), itr2
  1373	
  1374	openAVX2Tail512LoopA:
  1375		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1376		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1377		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1378		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1379		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1380		VMOVDQA  CC3, tmpStoreAVX2
  1381		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1382		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1383		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1384		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1385		VMOVDQA  tmpStoreAVX2, CC3
  1386		polyAdd(0*8(itr2))
  1387		polyMulAVX2
  1388		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1389		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1390		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1391		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1392		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1393		VMOVDQA  CC3, tmpStoreAVX2
  1394		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1395		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1396		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1397		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1398		VMOVDQA  tmpStoreAVX2, CC3
  1399		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  1400		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1401		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  1402		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1403		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1404		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  1405		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1406		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1407		polyAdd(2*8(itr2))
  1408		polyMulAVX2
  1409		LEAQ     (4*8)(itr2), itr2
  1410		VMOVDQA  CC3, tmpStoreAVX2
  1411		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  1412		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  1413		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  1414		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  1415		VMOVDQA  tmpStoreAVX2, CC3
  1416		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  1417		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  1418		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  1419		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  1420		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  1421		VMOVDQA  CC3, tmpStoreAVX2
  1422		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  1423		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  1424		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  1425		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  1426		VMOVDQA  tmpStoreAVX2, CC3
  1427		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  1428		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  1429		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  1430		INCQ     itr1
  1431		CMPQ     itr1, $4
  1432		JLT      openAVX2Tail512LoopB
  1433	
  1434		CMPQ itr1, $10
  1435		JNE  openAVX2Tail512LoopA
  1436	
  1437		MOVQ inl, itr1
  1438		SUBQ $384, itr1
  1439		ANDQ $-16, itr1
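	// The rounds loop above hashed exactly 384 bytes; itr1 is now the number of
	// remaining bytes to hash, rounded down to a multiple of 16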
  1440	
  1441	openAVX2Tail512HashLoop:
  1442		TESTQ itr1, itr1
  1443		JE    openAVX2Tail512HashEnd
  1444		polyAdd(0(itr2))
  1445		polyMulAVX2
  1446		LEAQ  16(itr2), itr2
  1447		SUBQ  $16, itr1
  1448		JMP   openAVX2Tail512HashLoop
  1449	
  1450	openAVX2Tail512HashEnd:
  1451		VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  1452		VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  1453		VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  1454		VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  1455		VMOVDQA    CC3, tmpStoreAVX2
  1456		VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  1457		VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  1458		VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  1459		VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  1460		VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  1461		VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  1462		VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  1463		VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  1464		VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  1465		VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  1466	
  1467		LEAQ (12*32)(inp), inp
  1468		LEAQ (12*32)(oup), oup
  1469		SUBQ $12*32, inl
  1470	
  1471		JMP openAVX2TailLoop
  1472	
  1473	// ----------------------------------------------------------------------------
  1474	// ----------------------------------------------------------------------------
  1475	// func chacha20Poly1305Seal(dst, key, src, ad []byte)
  1476	TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
  1477		// For aligned stack access
  1478		MOVQ SP, BP
  1479		ADDQ $32, BP
  1480		ANDQ $-32, BP
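	// (BP is rounded up to a 32-byte boundary, so the aligned MOVO/VMOVDQA accesses to the stack slots are safe)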
  1481		MOVQ dst+0(FP), oup
  1482		MOVQ key+24(FP), keyp
  1483		MOVQ src+48(FP), inp
  1484		MOVQ src_len+56(FP), inl
  1485		MOVQ ad+72(FP), adp
  1486	
  1487		CMPB ·useAVX2(SB), $1
  1488		JE   chacha20Poly1305Seal_AVX2
  1489	
	// Special optimization for very short buffers
  1491		CMPQ inl, $128
  1492		JBE  sealSSE128 // About 15% faster
  1493	
  1494		// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
  1495		MOVOU ·chacha20Constants<>(SB), A0
  1496		MOVOU (1*16)(keyp), B0
  1497		MOVOU (2*16)(keyp), C0
  1498		MOVOU (3*16)(keyp), D0
  1499	
  1500		// Store state on stack for future use
  1501		MOVO B0, state1Store
  1502		MOVO C0, state2Store
  1503	
  1504		// Load state, increment counter blocks
  1505		MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1506		MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1507		MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1508	
  1509		// Store counters
  1510		MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1511		MOVQ $10, itr2
  1512	
  1513	sealSSEIntroLoop:
  1514		MOVO         C3, tmpStore
  1515		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1516		MOVO         tmpStore, C3
  1517		MOVO         C1, tmpStore
  1518		chachaQR(A3, B3, C3, D3, C1)
  1519		MOVO         tmpStore, C1
  1520		shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
  1521		shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
  1522		shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
  1523	
  1524		MOVO          C3, tmpStore
  1525		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1526		MOVO          tmpStore, C3
  1527		MOVO          C1, tmpStore
  1528		chachaQR(A3, B3, C3, D3, C1)
  1529		MOVO          tmpStore, C1
  1530		shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1531		shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1532		shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1533		DECQ          itr2
  1534		JNE           sealSSEIntroLoop
  1535	
  1536		// Add in the state
  1537		PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1538		PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1539		PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1540		PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
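	// (C0 and D0 are not added back - the first block is only used for the Poly1305 key, taken from A0|B0 below)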
  1541	
  1542		// Clamp and store the key
  1543		PAND ·polyClampMask<>(SB), A0
  1544		MOVO A0, rStore
  1545		MOVO B0, sStore
  1546	
  1547		// Hash AAD
  1548		MOVQ ad_len+80(FP), itr2
  1549		CALL polyHashADInternal<>(SB)
  1550	
  1551		MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1552		PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1553		MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
  1554		MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1555		PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1556		MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
  1557	
  1558		MOVQ $128, itr1
  1559		SUBQ $128, inl
  1560		LEAQ 128(inp), inp
  1561	
  1562		MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
  1563	
  1564		CMPQ inl, $64
  1565		JBE  sealSSE128SealHash
  1566	
  1567		MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1568		PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1569		MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
  1570	
  1571		ADDQ $64, itr1
  1572		SUBQ $64, inl
  1573		LEAQ 64(inp), inp
  1574	
  1575		MOVQ $2, itr1
  1576		MOVQ $8, itr2
  1577	
  1578		CMPQ inl, $64
  1579		JBE  sealSSETail64
  1580		CMPQ inl, $128
  1581		JBE  sealSSETail128
  1582		CMPQ inl, $192
  1583		JBE  sealSSETail192
  1584	
  1585	sealSSEMainLoop:
  1586		// Load state, increment counter blocks
  1587		MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
  1588		MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1589		MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1590		MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
  1591	
  1592		// Store counters
  1593		MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
  1594	
  1595	sealSSEInnerLoop:
  1596		MOVO          C3, tmpStore
  1597		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1598		MOVO          tmpStore, C3
  1599		MOVO          C1, tmpStore
  1600		chachaQR(A3, B3, C3, D3, C1)
  1601		MOVO          tmpStore, C1
  1602		polyAdd(0(oup))
  1603		shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
  1604		shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
  1605		shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
  1606		polyMulStage1
  1607		polyMulStage2
  1608		LEAQ          (2*8)(oup), oup
  1609		MOVO          C3, tmpStore
  1610		chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
  1611		MOVO          tmpStore, C3
  1612		MOVO          C1, tmpStore
  1613		polyMulStage3
  1614		chachaQR(A3, B3, C3, D3, C1)
  1615		MOVO          tmpStore, C1
  1616		polyMulReduceStage
  1617		shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
  1618		shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
  1619		shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
  1620		DECQ          itr2
  1621		JGE           sealSSEInnerLoop
  1622		polyAdd(0(oup))
  1623		polyMul
  1624		LEAQ          (2*8)(oup), oup
  1625		DECQ          itr1
  1626		JG            sealSSEInnerLoop
  1627	
  1628		// Add in the state
  1629		PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
  1630		PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
  1631		PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
  1632		PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
  1633		MOVO  D3, tmpStore
  1634	
  1635		// Load - xor - store
  1636		MOVOU (0*16)(inp), D3; PXOR D3, A0
  1637		MOVOU (1*16)(inp), D3; PXOR D3, B0
  1638		MOVOU (2*16)(inp), D3; PXOR D3, C0
  1639		MOVOU (3*16)(inp), D3; PXOR D3, D0
  1640		MOVOU A0, (0*16)(oup)
  1641		MOVOU B0, (1*16)(oup)
  1642		MOVOU C0, (2*16)(oup)
  1643		MOVOU D0, (3*16)(oup)
  1644		MOVO  tmpStore, D3
  1645	
  1646		MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
  1647		PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
  1648		MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1649		MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
  1650		PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
  1651		MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
  1652		ADDQ  $192, inp
  1653		MOVQ  $192, itr1
  1654		SUBQ  $192, inl
  1655		MOVO  A3, A1
  1656		MOVO  B3, B1
  1657		MOVO  C3, C1
  1658		MOVO  D3, D1
  1659		CMPQ  inl, $64
  1660		JBE   sealSSE128SealHash
  1661		MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
  1662		PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
  1663		MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
  1664		LEAQ  64(inp), inp
  1665		SUBQ  $64, inl
  1666		MOVQ  $6, itr1
  1667		MOVQ  $4, itr2
  1668		CMPQ  inl, $192
  1669		JG    sealSSEMainLoop
  1670	
  1671		MOVQ  inl, itr1
  1672		TESTQ inl, inl
  1673		JE    sealSSE128SealHash
  1674		MOVQ  $6, itr1
  1675		CMPQ  inl, $64
  1676		JBE   sealSSETail64
  1677		CMPQ  inl, $128
  1678		JBE   sealSSETail128
  1679		JMP   sealSSETail192
  1680	
  1681	// ----------------------------------------------------------------------------
  1682	// Special optimization for the last 64 bytes of plaintext
  1683	sealSSETail64:
  1684		// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
  1685		MOVO  ·chacha20Constants<>(SB), A1
  1686		MOVO  state1Store, B1
  1687		MOVO  state2Store, C1
  1688		MOVO  ctr3Store, D1
  1689		PADDL ·sseIncMask<>(SB), D1
  1690		MOVO  D1, ctr0Store
  1691	
  1692	sealSSETail64LoopA:
  1693		// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1694		polyAdd(0(oup))
  1695		polyMul
  1696		LEAQ 16(oup), oup
  1697	
  1698	sealSSETail64LoopB:
  1699		chachaQR(A1, B1, C1, D1, T1)
  1700		shiftB1Left;  shiftC1Left; shiftD1Left
  1701		chachaQR(A1, B1, C1, D1, T1)
  1702		shiftB1Right; shiftC1Right; shiftD1Right
  1703		polyAdd(0(oup))
  1704		polyMul
  1705		LEAQ          16(oup), oup
  1706	
  1707		DECQ itr1
  1708		JG   sealSSETail64LoopA
  1709	
  1710		DECQ  itr2
  1711		JGE   sealSSETail64LoopB
  1712		PADDL ·chacha20Constants<>(SB), A1
  1713		PADDL state1Store, B1
  1714		PADDL state2Store, C1
  1715		PADDL ctr0Store, D1
  1716	
  1717		JMP sealSSE128Seal
  1718	
  1719	// ----------------------------------------------------------------------------
  1720	// Special optimization for the last 128 bytes of plaintext
  1721	sealSSETail128:
  1722		// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
  1723		MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1724		MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1725	
  1726	sealSSETail128LoopA:
  1727		// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1728		polyAdd(0(oup))
  1729		polyMul
  1730		LEAQ 16(oup), oup
  1731	
  1732	sealSSETail128LoopB:
  1733		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1734		shiftB0Left;  shiftC0Left; shiftD0Left
  1735		shiftB1Left;  shiftC1Left; shiftD1Left
  1736		polyAdd(0(oup))
  1737		polyMul
  1738		LEAQ          16(oup), oup
  1739		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
  1740		shiftB0Right; shiftC0Right; shiftD0Right
  1741		shiftB1Right; shiftC1Right; shiftD1Right
  1742	
  1743		DECQ itr1
  1744		JG   sealSSETail128LoopA
  1745	
  1746		DECQ itr2
  1747		JGE  sealSSETail128LoopB
  1748	
  1749		PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
  1750		PADDL state1Store, B0; PADDL state1Store, B1
  1751		PADDL state2Store, C0; PADDL state2Store, C1
  1752		PADDL ctr0Store, D0; PADDL ctr1Store, D1
  1753	
  1754		MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1755		PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1756		MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1757	
  1758		MOVQ $64, itr1
  1759		LEAQ 64(inp), inp
  1760		SUBQ $64, inl
  1761	
  1762		JMP sealSSE128SealHash
  1763	
  1764	// ----------------------------------------------------------------------------
  1765	// Special optimization for the last 192 bytes of plaintext
  1766	sealSSETail192:
  1767		// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
  1768		MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
  1769		MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
  1770		MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
  1771	
  1772	sealSSETail192LoopA:
  1773		// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
  1774		polyAdd(0(oup))
  1775		polyMul
  1776		LEAQ 16(oup), oup
  1777	
  1778	sealSSETail192LoopB:
  1779		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1780		shiftB0Left; shiftC0Left; shiftD0Left
  1781		shiftB1Left; shiftC1Left; shiftD1Left
  1782		shiftB2Left; shiftC2Left; shiftD2Left
  1783	
  1784		polyAdd(0(oup))
  1785		polyMul
  1786		LEAQ 16(oup), oup
  1787	
  1788		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1789		shiftB0Right; shiftC0Right; shiftD0Right
  1790		shiftB1Right; shiftC1Right; shiftD1Right
  1791		shiftB2Right; shiftC2Right; shiftD2Right
  1792	
  1793		DECQ itr1
  1794		JG   sealSSETail192LoopA
  1795	
  1796		DECQ itr2
  1797		JGE  sealSSETail192LoopB
  1798	
  1799		PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1800		PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
  1801		PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
  1802		PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
  1803	
  1804		MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
  1805		PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
  1806		MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
  1807		MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
  1808		PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
  1809		MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
  1810	
  1811		MOVO A2, A1
  1812		MOVO B2, B1
  1813		MOVO C2, C1
  1814		MOVO D2, D1
  1815		MOVQ $128, itr1
  1816		LEAQ 128(inp), inp
  1817		SUBQ $128, inl
  1818	
  1819		JMP sealSSE128SealHash
  1820	
  1821	// ----------------------------------------------------------------------------
  1822	// Special seal optimization for buffers smaller than 129 bytes
  1823	sealSSE128:
	// For up to 128 bytes of plaintext and 64 bytes for the poly key, we need to process three blocks
  1825		MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
  1826		MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
  1827		MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
  1828		MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
  1829		MOVQ  $10, itr2
  1830	
  1831	sealSSE128InnerCipherLoop:
  1832		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1833		shiftB0Left;  shiftB1Left; shiftB2Left
  1834		shiftC0Left;  shiftC1Left; shiftC2Left
  1835		shiftD0Left;  shiftD1Left; shiftD2Left
  1836		chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
  1837		shiftB0Right; shiftB1Right; shiftB2Right
  1838		shiftC0Right; shiftC1Right; shiftC2Right
  1839		shiftD0Right; shiftD1Right; shiftD2Right
  1840		DECQ          itr2
  1841		JNE           sealSSE128InnerCipherLoop
  1842	
  1843		// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
  1844		PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
  1845		PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
  1846		PADDL T2, C1; PADDL T2, C2
  1847		PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
  1848		PAND  ·polyClampMask<>(SB), A0
  1849		MOVOU A0, rStore
  1850		MOVOU B0, sStore
  1851	
  1852		// Hash
  1853		MOVQ ad_len+80(FP), itr2
  1854		CALL polyHashADInternal<>(SB)
  1855		XORQ itr1, itr1
  1856	
  1857	sealSSE128SealHash:
  1858		// itr1 holds the number of bytes encrypted but not yet hashed
  1859		CMPQ itr1, $16
  1860		JB   sealSSE128Seal
  1861		polyAdd(0(oup))
  1862		polyMul
  1863	
  1864		SUBQ $16, itr1
  1865		ADDQ $16, oup
  1866	
  1867		JMP sealSSE128SealHash
  1868	
  1869	sealSSE128Seal:
  1870		CMPQ inl, $16
  1871		JB   sealSSETail
  1872		SUBQ $16, inl
  1873	
	// Load for encryption
  1875		MOVOU (inp), T0
  1876		PXOR  T0, A1
  1877		MOVOU A1, (oup)
  1878		LEAQ  (1*16)(inp), inp
  1879		LEAQ  (1*16)(oup), oup
  1880	
  1881		// Extract for hashing
  1882		MOVQ   A1, t0
  1883		PSRLDQ $8, A1
	MOVQ   A1, t1
  1885		ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1886		polyMul
  1887	
  1888		// Shift the stream "left"
  1889		MOVO B1, A1
  1890		MOVO C1, B1
  1891		MOVO D1, C1
  1892		MOVO A2, D1
  1893		MOVO B2, A2
  1894		MOVO C2, B2
  1895		MOVO D2, C2
  1896		JMP  sealSSE128Seal
  1897	
  1898	sealSSETail:
  1899		TESTQ inl, inl
  1900		JE    sealSSEFinalize
  1901	
	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
  1903		MOVQ inl, itr2
  1904		SHLQ $4, itr2
  1905		LEAQ ·andMask<>(SB), t0
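	// itr2 = inl*16 is the offset of the right mask in ·andMask<>; it is used below to
	// zero the bytes past the end of the message, so the final block is hashed zero-padded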
  1906		MOVQ inl, itr1
  1907		LEAQ -1(inp)(inl*1), inp
  1908		XORQ t2, t2
  1909		XORQ t3, t3
  1910		XORQ AX, AX
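	// Read the remaining plaintext bytes backwards, from the last byte down,
	// accumulating them into t3:t2 as a little-endian 16-byte block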
  1911	
  1912	sealSSETailLoadLoop:
	SHLQ   $8, t2, t3
	SHLQ   $8, t2
	MOVB   (inp), AX
	XORQ   AX, t2
	LEAQ   -1(inp), inp
	DECQ   itr1
	JNE    sealSSETailLoadLoop
	MOVQ   t2, 0+tmpStore
	MOVQ   t3, 8+tmpStore
	PXOR   0+tmpStore, A1
  1923		MOVOU  A1, (oup)
  1924		MOVOU  -16(t0)(itr2*1), T0
  1925		PAND   T0, A1
  1926		MOVQ   A1, t0
  1927		PSRLDQ $8, A1
  1928		MOVQ   A1, t1
  1929		ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
  1930		polyMul
  1931	
  1932		ADDQ inl, oup
  1933	
  1934	sealSSEFinalize:
  1935		// Hash in the buffer lengths
  1936		ADDQ ad_len+80(FP), acc0
  1937		ADCQ src_len+56(FP), acc1
  1938		ADCQ $1, acc2
  1939		polyMul
  1940	
  1941		// Final reduce
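	// Compute acc - (2^130 - 5); if the subtraction borrows, acc was already fully
	// reduced, and the CMOVQCS below restores the original value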
  1942		MOVQ    acc0, t0
  1943		MOVQ    acc1, t1
  1944		MOVQ    acc2, t2
  1945		SUBQ    $-5, acc0
  1946		SBBQ    $-1, acc1
  1947		SBBQ    $3, acc2
  1948		CMOVQCS t0, acc0
  1949		CMOVQCS t1, acc1
  1950		CMOVQCS t2, acc2
  1951	
  1952		// Add in the "s" part of the key
  1953		ADDQ 0+sStore, acc0
  1954		ADCQ 8+sStore, acc1
  1955	
  1956		// Finally store the tag at the end of the message
  1957		MOVQ acc0, (0*8)(oup)
  1958		MOVQ acc1, (1*8)(oup)
  1959		RET
  1960	
  1961	// ----------------------------------------------------------------------------
  1962	// ------------------------- AVX2 Code ----------------------------------------
  1963	chacha20Poly1305Seal_AVX2:
  1964		VZEROUPPER
  1965		VMOVDQU ·chacha20Constants<>(SB), AA0
  1966		BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
  1967		BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
  1968		BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
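	// (The BYTE sequences above encode VBROADCASTI128 loads from 16/32/48(keyp);
	// raw bytes are presumably used because the assembler lacked the mnemonic)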
  1969		VPADDD  ·avx2InitMask<>(SB), DD0, DD0
  1970	
	// Special optimizations for very short buffers
  1972		CMPQ inl, $192
  1973		JBE  seal192AVX2 // 33% faster
  1974		CMPQ inl, $320
  1975		JBE  seal320AVX2 // 17% faster
  1976	
	// For the general case, prepare the poly key first - as a byproduct we get 64 bytes of cipher stream
  1978		VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  1979		VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
  1980		VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
  1981		VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
  1982		VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
  1983		VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
  1984		VMOVDQA DD3, ctr3StoreAVX2
  1985		MOVQ    $10, itr2
  1986	
  1987	sealAVX2IntroLoop:
  1988		VMOVDQA CC3, tmpStoreAVX2
  1989		chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  1990		VMOVDQA tmpStoreAVX2, CC3
  1991		VMOVDQA CC1, tmpStoreAVX2
  1992		chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  1993		VMOVDQA tmpStoreAVX2, CC1
  1994	
  1995		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  1996		VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  1997		VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  1998		VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  1999	
  2000		VMOVDQA CC3, tmpStoreAVX2
  2001		chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2002		VMOVDQA tmpStoreAVX2, CC3
  2003		VMOVDQA CC1, tmpStoreAVX2
  2004		chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2005		VMOVDQA tmpStoreAVX2, CC1
  2006	
  2007		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2008		VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2009		VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2010		VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2011		DECQ     itr2
  2012		JNE      sealAVX2IntroLoop
  2013	
  2014		VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2015		VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2016		VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2017		VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2018	
  2019		VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
  2020		VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
  2021		VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
  2022	
  2023		// Clamp and store poly key
  2024		VPAND   ·polyClampMask<>(SB), DD0, DD0
  2025		VMOVDQA DD0, rsStoreAVX2
  2026	
  2027		// Hash AD
  2028		MOVQ ad_len+80(FP), itr2
  2029		CALL polyHashADInternal<>(SB)
  2030	
  2031		// Can store at least 320 bytes
  2032		VPXOR   (0*32)(inp), AA0, AA0
  2033		VPXOR   (1*32)(inp), CC0, CC0
  2034		VMOVDQU AA0, (0*32)(oup)
  2035		VMOVDQU CC0, (1*32)(oup)
  2036	
  2037		VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2038		VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
  2039		VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
  2040		VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2041		VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
  2042		VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
  2043	
  2044		MOVQ $320, itr1
  2045		SUBQ $320, inl
  2046		LEAQ 320(inp), inp
  2047	
  2048		VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
  2049		CMPQ       inl, $128
  2050		JBE        sealAVX2SealHash
  2051	
  2052		VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
  2053		VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
  2054		SUBQ    $128, inl
  2055		LEAQ    128(inp), inp
  2056	
  2057		MOVQ $8, itr1
  2058		MOVQ $2, itr2
  2059	
  2060		CMPQ inl, $128
  2061		JBE  sealAVX2Tail128
  2062		CMPQ inl, $256
  2063		JBE  sealAVX2Tail256
  2064		CMPQ inl, $384
  2065		JBE  sealAVX2Tail384
  2066		CMPQ inl, $512
  2067		JBE  sealAVX2Tail512
  2068	
	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
  2070		VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2071		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2072		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2073		VMOVDQA ctr3StoreAVX2, DD0
  2074		VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2075		VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2076	
  2077		VMOVDQA CC3, tmpStoreAVX2
  2078		chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2079		VMOVDQA tmpStoreAVX2, CC3
  2080		VMOVDQA CC1, tmpStoreAVX2
  2081		chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2082		VMOVDQA tmpStoreAVX2, CC1
  2083	
  2084		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
  2085		VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
  2086		VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
  2087		VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
  2088	
  2089		VMOVDQA CC3, tmpStoreAVX2
  2090		chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
  2091		VMOVDQA tmpStoreAVX2, CC3
  2092		VMOVDQA CC1, tmpStoreAVX2
  2093		chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
  2094		VMOVDQA tmpStoreAVX2, CC1
  2095	
  2096		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
  2097		VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
  2098		VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
  2099		VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
  2100		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2101		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2102		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2103		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2104		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2105		VMOVDQA  CC3, tmpStoreAVX2
  2106		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2107		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2108		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2109		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2110		VMOVDQA  tmpStoreAVX2, CC3
  2111	
  2112		SUBQ $16, oup                  // Adjust the pointer
  2113		MOVQ $9, itr1
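	// The prefix above already performed part of the first iteration, so itr1 is
	// set to 9 instead of 10 and we enter the internal loop at its second half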
  2114		JMP  sealAVX2InternalLoopStart
  2115	
  2116	sealAVX2MainLoop:
  2117		// Load state, increment counter blocks, store the incremented counters
  2118		VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2119		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2120		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2121		VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2122		VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2123		MOVQ    $10, itr1
  2124	
  2125	sealAVX2InternalLoop:
  2126		polyAdd(0*8(oup))
  2127		VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2128		polyMulStage1_AVX2
  2129		VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2130		VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2131		polyMulStage2_AVX2
  2132		VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2133		VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2134		polyMulStage3_AVX2
  2135		VMOVDQA CC3, tmpStoreAVX2
  2136		VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2137		VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2138		VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2139		VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2140		VMOVDQA tmpStoreAVX2, CC3
  2141		polyMulReduceStage
  2142	
  2143	sealAVX2InternalLoopStart:
  2144		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2145		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2146		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2147		polyAdd(2*8(oup))
  2148		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2149		polyMulStage1_AVX2
  2150		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2151		VMOVDQA  CC3, tmpStoreAVX2
  2152		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2153		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2154		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2155		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2156		VMOVDQA  tmpStoreAVX2, CC3
  2157		polyMulStage2_AVX2
  2158		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2159		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2160		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2161		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2162		polyMulStage3_AVX2
  2163		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2164		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2165		polyMulReduceStage
  2166		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2167		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2168		polyAdd(4*8(oup))
  2169		LEAQ     (6*8)(oup), oup
  2170		VMOVDQA  CC3, tmpStoreAVX2
  2171		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2172		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2173		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2174		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2175		VMOVDQA  tmpStoreAVX2, CC3
  2176		polyMulStage1_AVX2
  2177		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2178		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2179		polyMulStage2_AVX2
  2180		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2181		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2182		polyMulStage3_AVX2
  2183		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2184		VMOVDQA  CC3, tmpStoreAVX2
  2185		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2186		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2187		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2188		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2189		VMOVDQA  tmpStoreAVX2, CC3
  2190		polyMulReduceStage
  2191		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2192		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2193		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2194		DECQ     itr1
  2195		JNE      sealAVX2InternalLoop
  2196	
  2197		VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2198		VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2199		VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2200		VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2201		VMOVDQA CC3, tmpStoreAVX2
  2202	
  2203		// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
  2204		polyAdd(0*8(oup))
  2205		polyMulAVX2
  2206		LEAQ       (4*8)(oup), oup
  2207		VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
  2208		VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
  2209		VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
  2210		VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
  2211		VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2212		VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2213	
  2214		// and here
  2215		polyAdd(-2*8(oup))
  2216		polyMulAVX2
  2217		VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
  2218		VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2219		VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
  2220		VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2221		VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
  2222		VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
  2223		LEAQ       (32*16)(inp), inp
  2224		SUBQ       $(32*16), inl
  2225		CMPQ       inl, $512
  2226		JG         sealAVX2MainLoop
  2227	
  2228		// Tail can only hash 480 bytes
  2229		polyAdd(0*8(oup))
  2230		polyMulAVX2
  2231		polyAdd(2*8(oup))
  2232		polyMulAVX2
  2233		LEAQ 32(oup), oup
  2234	
  2235		MOVQ $10, itr1
  2236		MOVQ $0, itr2
  2237		CMPQ inl, $128
  2238		JBE  sealAVX2Tail128
  2239		CMPQ inl, $256
  2240		JBE  sealAVX2Tail256
  2241		CMPQ inl, $384
  2242		JBE  sealAVX2Tail384
  2243		JMP  sealAVX2Tail512
  2244	
  2245	// ----------------------------------------------------------------------------
  2246	// Special optimization for buffers smaller than 193 bytes
  2247	seal192AVX2:
	// For up to 192 bytes of plaintext and 64 bytes for the poly key, we process four blocks
  2249		VMOVDQA AA0, AA1
  2250		VMOVDQA BB0, BB1
  2251		VMOVDQA CC0, CC1
  2252		VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2253		VMOVDQA AA0, AA2
  2254		VMOVDQA BB0, BB2
  2255		VMOVDQA CC0, CC2
  2256		VMOVDQA DD0, DD2
  2257		VMOVDQA DD1, TT3
  2258		MOVQ    $10, itr2
  2259	
  2260	sealAVX2192InnerCipherLoop:
  2261		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2262		VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2263		VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2264		VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2265		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2266		VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2267		VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2268		VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2269		DECQ       itr2
  2270		JNE        sealAVX2192InnerCipherLoop
  2271		VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
  2272		VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
  2273		VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
  2274		VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
  2275		VPERM2I128 $0x02, AA0, BB0, TT0
  2276	
  2277		// Clamp and store poly key
  2278		VPAND   ·polyClampMask<>(SB), TT0, TT0
  2279		VMOVDQA TT0, rsStoreAVX2
  2280	
  2281		// Stream for up to 192 bytes
  2282		VPERM2I128 $0x13, AA0, BB0, AA0
  2283		VPERM2I128 $0x13, CC0, DD0, BB0
  2284		VPERM2I128 $0x02, AA1, BB1, CC0
  2285		VPERM2I128 $0x02, CC1, DD1, DD0
  2286		VPERM2I128 $0x13, AA1, BB1, AA1
  2287		VPERM2I128 $0x13, CC1, DD1, BB1
  2288	
  2289	sealAVX2ShortSeal:
  2290		// Hash aad
  2291		MOVQ ad_len+80(FP), itr2
  2292		CALL polyHashADInternal<>(SB)
  2293		XORQ itr1, itr1
  2294	
  2295	sealAVX2SealHash:
  2296		// itr1 holds the number of bytes encrypted but not yet hashed
  2297		CMPQ itr1, $16
  2298		JB   sealAVX2ShortSealLoop
  2299		polyAdd(0(oup))
  2300		polyMul
  2301		SUBQ $16, itr1
  2302		ADDQ $16, oup
  2303		JMP  sealAVX2SealHash
  2304	
  2305	sealAVX2ShortSealLoop:
  2306		CMPQ inl, $32
  2307		JB   sealAVX2ShortTail32
  2308		SUBQ $32, inl
  2309	
  2310		// Load for encryption
  2311		VPXOR   (inp), AA0, AA0
  2312		VMOVDQU AA0, (oup)
  2313		LEAQ    (1*32)(inp), inp
  2314	
	// Now we can hash
  2316		polyAdd(0*8(oup))
  2317		polyMulAVX2
  2318		polyAdd(2*8(oup))
  2319		polyMulAVX2
  2320		LEAQ (1*32)(oup), oup
  2321	
  2322		// Shift stream left
  2323		VMOVDQA BB0, AA0
  2324		VMOVDQA CC0, BB0
  2325		VMOVDQA DD0, CC0
  2326		VMOVDQA AA1, DD0
  2327		VMOVDQA BB1, AA1
  2328		VMOVDQA CC1, BB1
  2329		VMOVDQA DD1, CC1
  2330		VMOVDQA AA2, DD1
  2331		VMOVDQA BB2, AA2
  2332		JMP     sealAVX2ShortSealLoop
  2333	
  2334	sealAVX2ShortTail32:
  2335		CMPQ    inl, $16
  2336		VMOVDQA A0, A1
  2337		JB      sealAVX2ShortDone
  2338	
  2339		SUBQ $16, inl
  2340	
  2341		// Load for encryption
  2342		VPXOR   (inp), A0, T0
  2343		VMOVDQU T0, (oup)
  2344		LEAQ    (1*16)(inp), inp
  2345	
  2346		// Hash
  2347		polyAdd(0*8(oup))
  2348		polyMulAVX2
  2349		LEAQ       (1*16)(oup), oup
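	// Bring the upper 128 bits of AA0 down into the lower lane, so that A1 holds the next 16 bytes of keystream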
  2350		VPERM2I128 $0x11, AA0, AA0, AA0
  2351		VMOVDQA    A0, A1
  2352	
  2353	sealAVX2ShortDone:
  2354		VZEROUPPER
  2355		JMP sealSSETail
  2356	
  2357	// ----------------------------------------------------------------------------
  2358	// Special optimization for buffers smaller than 321 bytes
  2359	seal320AVX2:
	// For up to 320 bytes of plaintext and 64 bytes for the poly key, we process six blocks
  2361		VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
  2362		VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2363		VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
  2364		MOVQ    $10, itr2
  2365	
  2366	sealAVX2320InnerCipherLoop:
  2367		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2368		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2369		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2370		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2371		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2372		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2373		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2374		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2375		DECQ     itr2
  2376		JNE      sealAVX2320InnerCipherLoop
  2377	
  2378		VMOVDQA ·chacha20Constants<>(SB), TT0
  2379		VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
  2380		VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
  2381		VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
  2382		VMOVDQA ·avx2IncMask<>(SB), TT0
  2383		VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
  2384		VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
  2385		VPADDD  TT3, DD2, DD2
  2386	
  2387		// Clamp and store poly key
  2388		VPERM2I128 $0x02, AA0, BB0, TT0
  2389		VPAND      ·polyClampMask<>(SB), TT0, TT0
  2390		VMOVDQA    TT0, rsStoreAVX2
  2391	
  2392		// Stream for up to 320 bytes
  2393		VPERM2I128 $0x13, AA0, BB0, AA0
  2394		VPERM2I128 $0x13, CC0, DD0, BB0
  2395		VPERM2I128 $0x02, AA1, BB1, CC0
  2396		VPERM2I128 $0x02, CC1, DD1, DD0
  2397		VPERM2I128 $0x13, AA1, BB1, AA1
  2398		VPERM2I128 $0x13, CC1, DD1, BB1
  2399		VPERM2I128 $0x02, AA2, BB2, CC1
  2400		VPERM2I128 $0x02, CC2, DD2, DD1
  2401		VPERM2I128 $0x13, AA2, BB2, AA2
  2402		VPERM2I128 $0x13, CC2, DD2, BB2
  2403		JMP        sealAVX2ShortSeal
  2404	
  2405	// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealAVX2Tail128:
	// Need to encrypt up to 128 bytes - prepare two blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2411		VMOVDQA ·chacha20Constants<>(SB), AA0
  2412		VMOVDQA state1StoreAVX2, BB0
  2413		VMOVDQA state2StoreAVX2, CC0
  2414		VMOVDQA ctr3StoreAVX2, DD0
  2415		VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2416		VMOVDQA DD0, DD1
  2417	
  2418	sealAVX2Tail128LoopA:
  2419		polyAdd(0(oup))
  2420		polyMul
  2421		LEAQ 16(oup), oup
  2422	
  2423	sealAVX2Tail128LoopB:
  2424		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2425		polyAdd(0(oup))
  2426		polyMul
  2427		VPALIGNR $4, BB0, BB0, BB0
  2428		VPALIGNR $8, CC0, CC0, CC0
  2429		VPALIGNR $12, DD0, DD0, DD0
  2430		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
  2431		polyAdd(16(oup))
  2432		polyMul
  2433		LEAQ     32(oup), oup
  2434		VPALIGNR $12, BB0, BB0, BB0
  2435		VPALIGNR $8, CC0, CC0, CC0
  2436		VPALIGNR $4, DD0, DD0, DD0
  2437		DECQ     itr1
  2438		JG       sealAVX2Tail128LoopA
  2439		DECQ     itr2
  2440		JGE      sealAVX2Tail128LoopB
  2441	
  2442		VPADDD ·chacha20Constants<>(SB), AA0, AA1
  2443		VPADDD state1StoreAVX2, BB0, BB1
  2444		VPADDD state2StoreAVX2, CC0, CC1
  2445		VPADDD DD1, DD0, DD1
  2446	
  2447		VPERM2I128 $0x02, AA1, BB1, AA0
  2448		VPERM2I128 $0x02, CC1, DD1, BB0
  2449		VPERM2I128 $0x13, AA1, BB1, CC0
  2450		VPERM2I128 $0x13, CC1, DD1, DD0
  2451		JMP        sealAVX2ShortSealLoop
  2452	
  2453	// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of plaintext
sealAVX2Tail256:
	// Need to encrypt up to 256 bytes - prepare four blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2459		VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
  2460		VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
  2461		VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
  2462		VMOVDQA ctr3StoreAVX2, DD0
  2463		VPADDD  ·avx2IncMask<>(SB), DD0, DD0
  2464		VPADDD  ·avx2IncMask<>(SB), DD0, DD1
  2465		VMOVDQA DD0, TT1
  2466		VMOVDQA DD1, TT2
  2467	
  2468	sealAVX2Tail256LoopA:
  2469		polyAdd(0(oup))
  2470		polyMul
  2471		LEAQ 16(oup), oup
  2472	
  2473	sealAVX2Tail256LoopB:
  2474		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2475		polyAdd(0(oup))
  2476		polyMul
  2477		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
  2478		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2479		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
  2480		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
  2481		polyAdd(16(oup))
  2482		polyMul
  2483		LEAQ     32(oup), oup
  2484		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
  2485		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
  2486		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
  2487		DECQ     itr1
  2488		JG       sealAVX2Tail256LoopA
  2489		DECQ     itr2
  2490		JGE      sealAVX2Tail256LoopB
  2491	
  2492		VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
  2493		VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
  2494		VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
  2495		VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
  2496		VPERM2I128 $0x02, AA0, BB0, TT0
  2497		VPERM2I128 $0x02, CC0, DD0, TT1
  2498		VPERM2I128 $0x13, AA0, BB0, TT2
  2499		VPERM2I128 $0x13, CC0, DD0, TT3
  2500		VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2501		VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
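	// The first 128 bytes of this tail are now encrypted and written. Below, itr1 is loaded
	// with that byte count (presumably so sealAVX2SealHash can hash it before the final
	// partial block) and the remaining keystream in AA1/BB1/CC1/DD1 is repacked into
	// AA0/BB0/CC0/DD0.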
  2502		MOVQ       $128, itr1
  2503		LEAQ       128(inp), inp
  2504		SUBQ       $128, inl
  2505		VPERM2I128 $0x02, AA1, BB1, AA0
  2506		VPERM2I128 $0x02, CC1, DD1, BB0
  2507		VPERM2I128 $0x13, AA1, BB1, CC0
  2508		VPERM2I128 $0x13, CC1, DD1, DD0
  2509	
  2510		JMP sealAVX2SealHash
  2511	
  2512	// ----------------------------------------------------------------------------
  2513	// Special optimization for the last 384 bytes of ciphertext
  2514	sealAVX2Tail384:
  2515		// Need to encrypt up to 384 bytes - prepare six blocks
  2516		// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2517		// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2518		VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
  2519		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
  2520		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
  2521		VMOVDQA ctr3StoreAVX2, DD0
  2522		VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
  2523		VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
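	// ctr3StoreAVX2 holds the block counters used by the previous iteration; ·avx2IncMask is
	// assumed to add the per-lane counter increment, advancing DD0/DD1/DD2 to the next six
	// block counters. Copies are kept in TT1/TT2/TT3 for the feed-forward addition after the
	// rounds, since the DD registers are modified by the quarter rounds.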
  2524	
  2525	sealAVX2Tail384LoopA:
  2526		polyAdd(0(oup))
  2527		polyMul
  2528		LEAQ 16(oup), oup
  2529	
  2530	sealAVX2Tail384LoopB:
  2531		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2532		polyAdd(0(oup))
  2533		polyMul
  2534		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
  2535		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2536		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
  2537		chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
  2538		polyAdd(16(oup))
  2539		polyMul
  2540		LEAQ     32(oup), oup
  2541		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
  2542		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
  2543		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
  2544		DECQ     itr1
  2545		JG       sealAVX2Tail384LoopA
  2546		DECQ     itr2
  2547		JGE      sealAVX2Tail384LoopB
  2548	
  2549		VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
  2550		VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
  2551		VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
  2552		VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
  2553		VPERM2I128 $0x02, AA0, BB0, TT0
  2554		VPERM2I128 $0x02, CC0, DD0, TT1
  2555		VPERM2I128 $0x13, AA0, BB0, TT2
  2556		VPERM2I128 $0x13, CC0, DD0, TT3
  2557		VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
  2558		VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
  2559		VPERM2I128 $0x02, AA1, BB1, TT0
  2560		VPERM2I128 $0x02, CC1, DD1, TT1
  2561		VPERM2I128 $0x13, AA1, BB1, TT2
  2562		VPERM2I128 $0x13, CC1, DD1, TT3
  2563		VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
  2564		VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
  2565		MOVQ       $256, itr1
  2566		LEAQ       256(inp), inp
  2567		SUBQ       $256, inl
  2568		VPERM2I128 $0x02, AA2, BB2, AA0
  2569		VPERM2I128 $0x02, CC2, DD2, BB0
  2570		VPERM2I128 $0x13, AA2, BB2, CC0
  2571		VPERM2I128 $0x13, CC2, DD2, DD0
  2572	
  2573		JMP sealAVX2SealHash
  2574	
  2575	// ----------------------------------------------------------------------------
  2576	// Special optimization for the last 512 bytes of ciphertext
  2577	sealAVX2Tail512:
  2578		// Need to encrypt up to 512 bytes - prepare eight blocks
  2579		// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
  2580		// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
  2581		VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
  2582		VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
  2583		VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
  2584		VMOVDQA ctr3StoreAVX2, DD0
  2585		VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
  2586		VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
  2587	
  2588	sealAVX2Tail512LoopA:
  2589		polyAdd(0(oup))
  2590		polyMul
  2591		LEAQ 16(oup), oup
  2592	
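	// sealAVX2Tail512LoopB spells the ChaCha20 double round out instruction by instruction
	// rather than through chachaQR_AVX2, with the polyAdd/polyMulAVX2 updates interleaved
	// between the vector groups so hashing proceeds alongside the rounds; CC3 is spilled to
	// tmpStoreAVX2 whenever a scratch register is needed for the 12/20 and 7/25 bit rotates.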
  2593	sealAVX2Tail512LoopB:
  2594		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2595		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2596		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2597		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2598		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2599		VMOVDQA  CC3, tmpStoreAVX2
  2600		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2601		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2602		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2603		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2604		VMOVDQA  tmpStoreAVX2, CC3
  2605		polyAdd(0*8(oup))
  2606		polyMulAVX2
  2607		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2608		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2609		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2610		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2611		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2612		VMOVDQA  CC3, tmpStoreAVX2
  2613		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2614		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2615		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2616		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2617		VMOVDQA  tmpStoreAVX2, CC3
  2618		VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
  2619		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2620		VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
  2621		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2622		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2623		VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
  2624		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2625		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2626		polyAdd(2*8(oup))
  2627		polyMulAVX2
  2628		LEAQ     (4*8)(oup), oup
  2629		VMOVDQA  CC3, tmpStoreAVX2
  2630		VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
  2631		VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
  2632		VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
  2633		VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
  2634		VMOVDQA  tmpStoreAVX2, CC3
  2635		VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
  2636		VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
  2637		VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
  2638		VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
  2639		VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
  2640		VMOVDQA  CC3, tmpStoreAVX2
  2641		VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
  2642		VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
  2643		VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
  2644		VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
  2645		VMOVDQA  tmpStoreAVX2, CC3
  2646		VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
  2647		VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
  2648		VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
  2649	
  2650		DECQ itr1
  2651		JG   sealAVX2Tail512LoopA
  2652		DECQ itr2
  2653		JGE  sealAVX2Tail512LoopB
  2654	
  2655		VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
  2656		VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
  2657		VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
  2658		VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
  2659		VMOVDQA    CC3, tmpStoreAVX2
  2660		VPERM2I128 $0x02, AA0, BB0, CC3
  2661		VPXOR      (0*32)(inp), CC3, CC3
  2662		VMOVDQU    CC3, (0*32)(oup)
  2663		VPERM2I128 $0x02, CC0, DD0, CC3
  2664		VPXOR      (1*32)(inp), CC3, CC3
  2665		VMOVDQU    CC3, (1*32)(oup)
  2666		VPERM2I128 $0x13, AA0, BB0, CC3
  2667		VPXOR      (2*32)(inp), CC3, CC3
  2668		VMOVDQU    CC3, (2*32)(oup)
  2669		VPERM2I128 $0x13, CC0, DD0, CC3
  2670		VPXOR      (3*32)(inp), CC3, CC3
  2671		VMOVDQU    CC3, (3*32)(oup)
  2672	
  2673		VPERM2I128 $0x02, AA1, BB1, AA0
  2674		VPERM2I128 $0x02, CC1, DD1, BB0
  2675		VPERM2I128 $0x13, AA1, BB1, CC0
  2676		VPERM2I128 $0x13, CC1, DD1, DD0
  2677		VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
  2678		VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
  2679	
  2680		VPERM2I128 $0x02, AA2, BB2, AA0
  2681		VPERM2I128 $0x02, CC2, DD2, BB0
  2682		VPERM2I128 $0x13, AA2, BB2, CC0
  2683		VPERM2I128 $0x13, CC2, DD2, DD0
  2684		VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
  2685		VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
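	// 384 of the 512 tail bytes are now encrypted and written. itr1 records that count for
	// sealAVX2SealHash, and the last 128 bytes of keystream (AA3/BB3 and the CC3 value spilled
	// to tmpStoreAVX2 earlier, together with DD3) are repacked below for sealing the remaining
	// input.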
  2686	
  2687		MOVQ       $384, itr1
  2688		LEAQ       384(inp), inp
  2689		SUBQ       $384, inl
  2690		VPERM2I128 $0x02, AA3, BB3, AA0
  2691		VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
  2692		VPERM2I128 $0x13, AA3, BB3, CC0
  2693		VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
  2694	
  2695		JMP sealAVX2SealHash