...

Text file src/crypto/aes/gcm_amd64.s

     1	// Copyright 2015 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
     6	// The implementation uses some of the optimizations described in:
     7	// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
     8	//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
     9	// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
    10	//     Hardware
    11	
    12	#include "textflag.h"
    13	
    14	#define B0 X0
    15	#define B1 X1
    16	#define B2 X2
    17	#define B3 X3
    18	#define B4 X4
    19	#define B5 X5
    20	#define B6 X6
    21	#define B7 X7
      	// B0-B7 (X0-X7) above hold the data blocks being worked on: counter
      	// blocks going through the AES rounds and/or blocks fed to GHASH.
      	// gcmAesInit also uses them as scratch, and the single-block loops of
      	// gcmAesEnc/gcmAesDec keep round keys 1-7 in B1-B7.
    22	
      	// GHASH accumulators for the three Karatsuba partial products:
      	// ACC0 collects the low-half products (PCLMULQDQ $0x00), ACC1 the
      	// high-half products (PCLMULQDQ $0x11) and ACCM the products of the
      	// XORed halves (the middle term).
    23	#define ACC0 X8
    24	#define ACC1 X9
    25	#define ACCM X10
    26	
      	// T0-T2 are scratch registers. POLY and BSWAP are loaded by each function
      	// from gcmPoly<> and bswapMask<> respectively.
    27	#define T0 X11
    28	#define T1 X12
    29	#define T2 X13
    30	#define POLY X14
    31	#define BSWAP X15
    32	
      	// bswapMask is a PSHUFB control mask: destination byte i takes source
      	// byte 15-i, i.e. it reverses the order of all 16 bytes of a register.
    33	DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
    34	DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607
    35	
      	// gcmPoly is the constant used by the GHASH reduction steps (the
      	// PCLMULQDQ $0x01 against POLY) and by the H*2 computation in gcmAesInit.
    36	DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
    37	DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000
    38	
      	// andMask is a table of fifteen 16-byte masks. The entry at byte offset
      	// 16*(n-1) has its low n bytes set to 0xff and the remaining bytes zero,
      	// for n = 1..15. The tail code of gcmAesEnc/gcmAesDec indexes it with
      	// -16(base)(len*16) to keep only the bytes of a final partial block.
    39	DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
    40	DATA andMask<>+0x08(SB)/8, $0x0000000000000000
    41	DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
    42	DATA andMask<>+0x18(SB)/8, $0x0000000000000000
    43	DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
    44	DATA andMask<>+0x28(SB)/8, $0x0000000000000000
    45	DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
    46	DATA andMask<>+0x38(SB)/8, $0x0000000000000000
    47	DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
    48	DATA andMask<>+0x48(SB)/8, $0x0000000000000000
    49	DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
    50	DATA andMask<>+0x58(SB)/8, $0x0000000000000000
    51	DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
    52	DATA andMask<>+0x68(SB)/8, $0x0000000000000000
    53	DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
    54	DATA andMask<>+0x78(SB)/8, $0x0000000000000000
    55	DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
    56	DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
    57	DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
    58	DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
    59	DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
    60	DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
    61	DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
    62	DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
    63	DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
    64	DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
    65	DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
    66	DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
    67	DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
    68	DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
    69	
      	// All three tables are read-only and contain no pointers.
      	// Sizes: 16 bytes, 16 bytes, and 15 masks * 16 bytes = 240 bytes.
    70	GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
    71	GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
    72	GLOBL andMask<>(SB), (NOPTR+RODATA), $240
    73	
    74	// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
      	//
      	// gcmAesFinish turns the GHASH state stored in *T into the final value, in
      	// place. It builds a block from pLen*8 (low qword) and dLen*8 (high qword),
      	// XORs it into the state, multiplies by the product-table entry at offset
      	// 16*14 (the entry at 16*15 supplies the Karatsuba middle-term operand),
      	// reduces, reverses the byte order, XORs the result with *tagMask and
      	// writes the 16 bytes back to *T.
    75	TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    76	#define pTbl DI
    77	#define tMsk SI
    78	#define tPtr DX
    79	#define plen AX
    80	#define dlen CX
    81	
    82		MOVQ productTable+0(FP), pTbl
    83		MOVQ tagMask+8(FP), tMsk
    84		MOVQ T+16(FP), tPtr
    85		MOVQ pLen+24(FP), plen
    86		MOVQ dLen+32(FP), dlen
    87	
    88		MOVOU (tPtr), ACC0	// current hash state
    89		MOVOU (tMsk), T2	// tag mask, applied at the very end
    90	
    91		MOVOU bswapMask<>(SB), BSWAP
    92		MOVOU gcmPoly<>(SB), POLY
    93	
    94		SHLQ $3, plen	// lengths are multiplied by 8 (bytes -> bits)
    95		SHLQ $3, dlen
    96	
    97		MOVQ plen, B0	// low qword  = pLen*8
    98		PINSRQ $1, dlen, B0	// high qword = dLen*8
    99	
   100		PXOR ACC0, B0	// fold the length block into the hash state
   101	
   102		MOVOU (16*14)(pTbl), ACC0
   103		MOVOU (16*15)(pTbl), ACCM
   104		MOVOU ACC0, ACC1
   105	
      		// Karatsuba multiply: low*low, high*high, (low^high)*(table entry 15).
   106		PCLMULQDQ $0x00, B0, ACC0
   107		PCLMULQDQ $0x11, B0, ACC1
   108		PSHUFD $78, B0, T0	// swap the two 64-bit halves of B0
   109		PXOR B0, T0
   110		PCLMULQDQ $0x00, T0, ACCM
   111	
      		// Recover the middle term and split it across the low/high results.
   112		PXOR ACC0, ACCM
   113		PXOR ACC1, ACCM
   114		MOVOU ACCM, T0
   115		PSRLDQ $8, ACCM
   116		PSLLDQ $8, T0
   117		PXOR ACCM, ACC1
   118		PXOR T0, ACC0
   119	
      		// Two reduction steps against POLY, then combine with the high part.
   120		MOVOU POLY, T0
   121		PCLMULQDQ $0x01, ACC0, T0
   122		PSHUFD $78, ACC0, ACC0
   123		PXOR T0, ACC0
   124	
   125		MOVOU POLY, T0
   126		PCLMULQDQ $0x01, ACC0, T0
   127		PSHUFD $78, ACC0, ACC0
   128		PXOR T0, ACC0
   129	
   130		PXOR ACC1, ACC0
   131	
   132		PSHUFB BSWAP, ACC0	// reverse byte order of the result
   133		PXOR T2, ACC0	// apply *tagMask
   134		MOVOU ACC0, (tPtr)
   135	
   136		RET
   137	#undef pTbl
   138	#undef tMsk
   139	#undef tPtr
   140	#undef plen
   141	#undef dlen
   142	
   143	// func gcmAesInit(productTable *[256]byte, ks []uint32)
      	//
      	// gcmAesInit fills productTable from the expanded AES key ks. It runs the
      	// AES rounds starting from round key 0 to obtain the hash key H, reverses
      	// its bytes, computes H*2, and stores that value at offset 16*14 with the
      	// XOR of its two 64-bit halves at 16*15. Seven loop iterations then each
      	// multiply the previously stored value by the first one, reduce, and store
      	// the product (and its halves-XOR) 32 bytes lower, ending at offsets
      	// 16*0 / 16*1. The number of AES rounds is derived from len(ks).
   144	TEXT ·gcmAesInit(SB),NOSPLIT,$0
   145	#define dst DI
   146	#define KS SI
   147	#define NR DX
   148	
   149		MOVQ productTable+0(FP), dst
   150		MOVQ ks_base+8(FP), KS
   151		MOVQ ks_len+16(FP), NR
   152	
   153		SHRQ $2, NR	// len(ks)/4 = number of 16-byte round keys
   154		DECQ NR	// minus one = number of rounds
   155	
   156		MOVOU bswapMask<>(SB), BSWAP
   157		MOVOU gcmPoly<>(SB), POLY
   158	
   159		// Encrypt block 0, with the AES key to generate the hash key H
   160		MOVOU (16*0)(KS), B0	// state starts as round key 0 itself
   161		MOVOU (16*1)(KS), T0
   162		AESENC T0, B0
   163		MOVOU (16*2)(KS), T0
   164		AESENC T0, B0
   165		MOVOU (16*3)(KS), T0
   166		AESENC T0, B0
   167		MOVOU (16*4)(KS), T0
   168		AESENC T0, B0
   169		MOVOU (16*5)(KS), T0
   170		AESENC T0, B0
   171		MOVOU (16*6)(KS), T0
   172		AESENC T0, B0
   173		MOVOU (16*7)(KS), T0
   174		AESENC T0, B0
   175		MOVOU (16*8)(KS), T0
   176		AESENC T0, B0
   177		MOVOU (16*9)(KS), T0
   178		AESENC T0, B0
   179		MOVOU (16*10)(KS), T0
      		// The flags set here stay live for both branches below: the MOVOU and
      		// AESENC instructions in between do not modify them.
   180		CMPQ NR, $12
   181		JB initEncLast	// fewer than 12 rounds: round key 10 is the last
   182		AESENC T0, B0
   183		MOVOU (16*11)(KS), T0
   184		AESENC T0, B0
   185		MOVOU (16*12)(KS), T0
   186		JE initEncLast	// exactly 12 rounds: round key 12 is the last
   187		AESENC T0, B0
   188		MOVOU (16*13)(KS), T0
   189		AESENC T0, B0
   190		MOVOU (16*14)(KS), T0	// otherwise round key 14 is the last
   191	initEncLast:
   192		AESENCLAST T0, B0
   193	
   194		PSHUFB BSWAP, B0	// reverse byte order of H
   195		// H * 2
   196		PSHUFD $0xff, B0, T0	// broadcast the top dword
   197		MOVOU B0, T1
   198		PSRAL $31, T0	// all-ones if the top bit of H was set, else zero
   199		PAND POLY, T0	// conditionally select POLY
   200		PSRLL $31, T1	// top bit of each dword ...
   201		PSLLDQ $4, T1	// ... moved up to become the carry into the next dword
   202		PSLLL $1, B0	// shift every dword left by one bit
   203		PXOR T0, B0
   204		PXOR T1, B0
   205		// Karatsuba pre-computations
   206		MOVOU B0, (16*14)(dst)
   207		PSHUFD $78, B0, B1	// swap 64-bit halves
   208		PXOR B0, B1	// B1 = low half ^ high half (in both halves)
   209		MOVOU B1, (16*15)(dst)
   210	
   211		MOVOU B0, B2	// B2/B3 track the most recently computed power
   212		MOVOU B1, B3
   213		// Now prepare powers of H and pre-computations for them
   214		MOVQ $7, AX	// seven more table entries to produce
   215	
   216	initLoop:
      			// Karatsuba multiply of the previous power (B2, B3) by B0/B1.
   217			MOVOU B2, T0
   218			MOVOU B2, T1
   219			MOVOU B3, T2
   220			PCLMULQDQ $0x00, B0, T0
   221			PCLMULQDQ $0x11, B0, T1
   222			PCLMULQDQ $0x00, B1, T2
   223	
   224			PXOR T0, T2
   225			PXOR T1, T2
   226			MOVOU T2, B4
   227			PSLLDQ $8, B4
   228			PSRLDQ $8, T2
   229			PXOR B4, T0
   230			PXOR T2, T1
   231	
      			// Two reduction steps against POLY; the result ends up in B2.
   232			MOVOU POLY, B2
   233			PCLMULQDQ $0x01, T0, B2
   234			PSHUFD $78, T0, T0
   235			PXOR B2, T0
   236			MOVOU POLY, B2
   237			PCLMULQDQ $0x01, T0, B2
   238			PSHUFD $78, T0, T0
   239			PXOR T0, B2
   240			PXOR T1, B2
   241	
   242			MOVOU B2, (16*12)(dst)
   243			PSHUFD $78, B2, B3
   244			PXOR B2, B3
   245			MOVOU B3, (16*13)(dst)
   246	
   247			DECQ AX
   248			LEAQ (-16*2)(dst), dst	// step down one table pair; LEAQ leaves the DECQ flags intact
   249		JNE initLoop
   250	
   251		RET
   252	#undef NR
   253	#undef KS
   254	#undef dst
   255	
   256	// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
      	//
      	// gcmAesData computes the GHASH of data, starting from an all-zero state,
      	// and stores the resulting 16-byte state in *T. Input is consumed in three
      	// stages: 128 bytes (eight blocks) at a time, then 16 bytes at a time, then
      	// a final partial block that is loaded byte by byte into a zeroed register
      	// (i.e. zero padded). A length of exactly 13 bytes takes a dedicated
      	// fast path. The reduceRound and mulRoundAAD macros defined here are not
      	// undefined at the end of the function; reduceRound is used again by the
      	// functions that follow.
   257	TEXT ·gcmAesData(SB),NOSPLIT,$0
   258	#define pTbl DI
   259	#define aut SI
   260	#define tPtr CX
   261	#define autLen DX
   262	
      	// reduceRound(a): one reduction step of register a against POLY (clobbers T0).
      	// mulRoundAAD(X, i): Karatsuba-multiply block X by the table pair at offsets
      	// 16*(2i) / 16*(2i+1) and XOR the three partial products into
      	// ACC0 / ACC1 / ACCM (clobbers T1, T2 and X).
   263	#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
   264	#define mulRoundAAD(X ,i) \
   265		MOVOU (16*(i*2))(pTbl), T1;\
   266		MOVOU T1, T2;\
   267		PCLMULQDQ $0x00, X, T1;\
   268		PXOR T1, ACC0;\
   269		PCLMULQDQ $0x11, X, T2;\
   270		PXOR T2, ACC1;\
   271		PSHUFD $78, X, T1;\
   272		PXOR T1, X;\
   273		MOVOU (16*(i*2+1))(pTbl), T1;\
   274		PCLMULQDQ $0x00, X, T1;\
   275		PXOR T1, ACCM
   276	
   277		MOVQ productTable+0(FP), pTbl
   278		MOVQ data_base+8(FP), aut
   279		MOVQ data_len+16(FP), autLen
   280		MOVQ T+32(FP), tPtr
   281	
   282		PXOR ACC0, ACC0	// hash state starts at zero
   283		MOVOU bswapMask<>(SB), BSWAP
   284		MOVOU gcmPoly<>(SB), POLY
   285	
   286		TESTQ autLen, autLen
   287		JEQ dataBail	// empty input: store the zero state
   288	
   289		CMPQ autLen, $13	// optimize the TLS case
   290		JE dataTLS
   291		CMPQ autLen, $128
   292		JB startSinglesLoop
   293		JMP dataOctaLoop
   294	
   295	dataTLS:
      		// Exactly 13 bytes: assemble them in B0 with 8 + 4 + 1 byte loads, the
      		// remaining three bytes staying zero, then run a single multiply.
   296		MOVOU (16*14)(pTbl), T1
   297		MOVOU (16*15)(pTbl), T2
   298		PXOR B0, B0
   299		MOVQ (aut), B0
   300		PINSRD $2, 8(aut), B0
   301		PINSRB $12, 12(aut), B0
   302		XORQ autLen, autLen	// nothing left, so the loop exits after dataMul
   303		JMP dataMul
   304	
   305	dataOctaLoop:
   306			CMPQ autLen, $128
   307			JB startSinglesLoop
   308			SUBQ $128, autLen
   309	
   310			MOVOU (16*0)(aut), X0
   311			MOVOU (16*1)(aut), X1
   312			MOVOU (16*2)(aut), X2
   313			MOVOU (16*3)(aut), X3
   314			MOVOU (16*4)(aut), X4
   315			MOVOU (16*5)(aut), X5
   316			MOVOU (16*6)(aut), X6
   317			MOVOU (16*7)(aut), X7
   318			LEAQ (16*8)(aut), aut
   319			PSHUFB BSWAP, X0
   320			PSHUFB BSWAP, X1
   321			PSHUFB BSWAP, X2
   322			PSHUFB BSWAP, X3
   323			PSHUFB BSWAP, X4
   324			PSHUFB BSWAP, X5
   325			PSHUFB BSWAP, X6
   326			PSHUFB BSWAP, X7
   327			PXOR ACC0, X0	// fold the running state into the first block
   328	
      			// First block is multiplied by the table pair at offsets 16*0 / 16*1;
      			// blocks 1..7 use the following pairs via mulRoundAAD.
   329			MOVOU (16*0)(pTbl), ACC0
   330			MOVOU (16*1)(pTbl), ACCM
   331			MOVOU ACC0, ACC1
   332			PSHUFD $78, X0, T1
   333			PXOR X0, T1
   334			PCLMULQDQ $0x00, X0, ACC0
   335			PCLMULQDQ $0x11, X0, ACC1
   336			PCLMULQDQ $0x00, T1, ACCM
   337	
   338			mulRoundAAD(X1, 1)
   339			mulRoundAAD(X2, 2)
   340			mulRoundAAD(X3, 3)
   341			mulRoundAAD(X4, 4)
   342			mulRoundAAD(X5, 5)
   343			mulRoundAAD(X6, 6)
   344			mulRoundAAD(X7, 7)
   345	
      			// Combine the accumulated partial products and reduce once for all
      			// eight blocks.
   346			PXOR ACC0, ACCM
   347			PXOR ACC1, ACCM
   348			MOVOU ACCM, T0
   349			PSRLDQ $8, ACCM
   350			PSLLDQ $8, T0
   351			PXOR ACCM, ACC1
   352			PXOR T0, ACC0
   353			reduceRound(ACC0)
   354			reduceRound(ACC0)
   355			PXOR ACC1, ACC0
   356		JMP dataOctaLoop
   357	
   358	startSinglesLoop:
      		// T1/T2 cache the table pair at 16*14 / 16*15 for the one-block multiply.
   359		MOVOU (16*14)(pTbl), T1
   360		MOVOU (16*15)(pTbl), T2
   361	
   362	dataSinglesLoop:
   363	
   364			CMPQ autLen, $16
   365			JB dataEnd
   366			SUBQ $16, autLen
   367	
   368			MOVOU (aut), B0
      	// dataMul is also entered directly from dataTLS and from the partial-block
      	// path below, with B0 already holding the block and autLen already zero.
   369	dataMul:
   370			PSHUFB BSWAP, B0
   371			PXOR ACC0, B0
   372	
   373			MOVOU T1, ACC0
   374			MOVOU T2, ACCM
   375			MOVOU T1, ACC1
   376	
   377			PSHUFD $78, B0, T0
   378			PXOR B0, T0
   379			PCLMULQDQ $0x00, B0, ACC0
   380			PCLMULQDQ $0x11, B0, ACC1
   381			PCLMULQDQ $0x00, T0, ACCM
   382	
   383			PXOR ACC0, ACCM
   384			PXOR ACC1, ACCM
   385			MOVOU ACCM, T0
   386			PSRLDQ $8, ACCM
   387			PSLLDQ $8, T0
   388			PXOR ACCM, ACC1
   389			PXOR T0, ACC0
   390	
   391			MOVOU POLY, T0
   392			PCLMULQDQ $0x01, ACC0, T0
   393			PSHUFD $78, ACC0, ACC0
   394			PXOR T0, ACC0
   395	
   396			MOVOU POLY, T0
   397			PCLMULQDQ $0x01, ACC0, T0
   398			PSHUFD $78, ACC0, ACC0
   399			PXOR T0, ACC0
   400			PXOR ACC1, ACC0
   401	
   402			LEAQ 16(aut), aut
   403	
   404		JMP dataSinglesLoop
   405	
   406	dataEnd:
   407	
   408		TESTQ autLen, autLen
   409		JEQ dataBail
   410	
      		// 1..15 bytes remain: build the block in B0, the unused bytes staying zero.
   411		PXOR B0, B0
   412		LEAQ -1(aut)(autLen*1), aut	// point at the last remaining byte
   413	
   414	dataLoadLoop:
   415	
      			// Walk backwards: shift B0 up one byte and insert the next byte at
      			// position 0, so the first input byte ends up in the lowest byte.
   416			PSLLDQ $1, B0
   417			PINSRB $0, (aut), B0
   418	
   419			LEAQ -1(aut), aut
   420			DECQ autLen
   421			JNE dataLoadLoop
   422	
   423		JMP dataMul
   424	
   425	dataBail:
   426		MOVOU ACC0, (tPtr)
   427		RET
   428	#undef pTbl
   429	#undef aut
   430	#undef tPtr
   431	#undef autLen
   432	
   433	// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
      	//
      	// gcmAesEnc encrypts src into dst in counter mode and folds the produced
      	// ciphertext into the GHASH state at *T (read on entry, written on exit).
      	// The first keystream block uses the counter from *ctr with its last,
      	// big-endian 32-bit word incremented by one; *ctr itself is never written.
      	// Data is processed eight blocks at a time, then one block at a time, then
      	// a final partial block.
      	//
      	// Stack frame (256 bytes):
      	//   (0*16 .. 7*16)(SP)          eight byte-reversed ciphertext blocks that
      	//                               still have to be hashed
      	//   (8*16 + 0*16 .. 7*16)(SP)   eight counter blocks, already XORed with
      	//                               round key 0
      	// In the eight-block loop the GHASH of the previous eight ciphertext blocks
      	// is interleaved with the AES rounds of the current eight.
      	//
      	// The register aliases and the aesRnd/aesRound/aesRndLast macros defined
      	// here stay defined after the function and are reused by gcmAesDec.
   434	TEXT ·gcmAesEnc(SB),0,$256-96
   435	#define pTbl DI
   436	#define ctx DX
   437	#define ctrPtr CX
   438	#define ptx SI
   439	#define ks AX
   440	#define tPtr R8
   441	#define ptxLen R9
   442	#define aluCTR R10
   443	#define aluTMP R11
   444	#define aluK R12
   445	#define NR R13
   446	
      	// increment(i): add 1 to the 32-bit counter in aluCTR, XOR it with aluK,
      	//   byte-swap, and store it as the last 4 bytes of counter slot i.
      	// aesRnd(k) / aesRndLast(k): apply round key register k to B0-B7.
      	// aesRound(i): load round key i from ks into T0 and apply it to B0-B7.
      	// combinedRound(i): aesRound(i) interleaved with the GHASH partial products
      	//   of the saved block at (16*i)(SP) and the table pair 16*(2i) / 16*(2i+1).
      	// mulRound(i): only the GHASH part of combinedRound(i).
   447	#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
   448	#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
   449	#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
   450	#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
   451	#define combinedRound(i) \
   452		MOVOU (16*i)(ks), T0;\
   453		AESENC T0, B0;\
   454		AESENC T0, B1;\
   455		AESENC T0, B2;\
   456		AESENC T0, B3;\
   457		 MOVOU (16*(i*2))(pTbl), T1;\
   458		 MOVOU T1, T2;\
   459		AESENC T0, B4;\
   460		AESENC T0, B5;\
   461		AESENC T0, B6;\
   462		AESENC T0, B7;\
   463		 MOVOU (16*i)(SP), T0;\
   464		 PCLMULQDQ $0x00, T0, T1;\
   465		 PXOR T1, ACC0;\
   466		 PSHUFD $78, T0, T1;\
   467		 PCLMULQDQ $0x11, T0, T2;\
   468		 PXOR T1, T0;\
   469		 PXOR T2, ACC1;\
   470		 MOVOU (16*(i*2+1))(pTbl), T2;\
   471		 PCLMULQDQ $0x00, T2, T0;\
   472		 PXOR T0, ACCM
   473	#define mulRound(i) \
   474		MOVOU (16*i)(SP), T0;\
   475		MOVOU (16*(i*2))(pTbl), T1;\
   476		MOVOU T1, T2;\
   477		PCLMULQDQ $0x00, T0, T1;\
   478		PXOR T1, ACC0;\
   479		PCLMULQDQ $0x11, T0, T2;\
   480		PXOR T2, ACC1;\
   481		PSHUFD $78, T0, T1;\
   482		PXOR T1, T0;\
   483		MOVOU (16*(i*2+1))(pTbl), T1;\
   484		PCLMULQDQ $0x00, T0, T1;\
   485		PXOR T1, ACCM
   486	
   487		MOVQ productTable+0(FP), pTbl
   488		MOVQ dst+8(FP), ctx
   489		MOVQ src_base+32(FP), ptx
   490		MOVQ src_len+40(FP), ptxLen
   491		MOVQ ctr+56(FP), ctrPtr
   492		MOVQ T+64(FP), tPtr
   493		MOVQ ks_base+72(FP), ks
   494		MOVQ ks_len+80(FP), NR
   495	
   496		SHRQ $2, NR	// len(ks)/4 = number of 16-byte round keys
   497		DECQ NR	// minus one = number of rounds
   498	
   499		MOVOU bswapMask<>(SB), BSWAP
   500		MOVOU gcmPoly<>(SB), POLY
   501	
   502		MOVOU (tPtr), ACC0	// incoming hash state
   503		PXOR ACC1, ACC1
   504		PXOR ACCM, ACCM
   505		MOVOU (ctrPtr), B0
   506		MOVL (3*4)(ctrPtr), aluCTR	// last 32-bit word of the counter block
   507		MOVOU (ks), T0
   508		MOVL (3*4)(ks), aluK	// matching word of round key 0
   509		BSWAPL aluCTR	// byte-swapped so ADDL can increment it
   510		BSWAPL aluK
   511	
      		// T0 = counter block ^ round key 0. Every counter slot starts as a copy
      		// of T0 and then has its last word replaced by increment().
   512		PXOR B0, T0
   513		MOVOU T0, (8*16 + 0*16)(SP)
   514		increment(0)
   515	
   516		CMPQ ptxLen, $128
   517		JB gcmAesEncSingles
   518		SUBQ $128, ptxLen
   519	
   520		// We have at least 8 blocks to encrypt, prepare the rest of the counters
   521		MOVOU T0, (8*16 + 1*16)(SP)
   522		increment(1)
   523		MOVOU T0, (8*16 + 2*16)(SP)
   524		increment(2)
   525		MOVOU T0, (8*16 + 3*16)(SP)
   526		increment(3)
   527		MOVOU T0, (8*16 + 4*16)(SP)
   528		increment(4)
   529		MOVOU T0, (8*16 + 5*16)(SP)
   530		increment(5)
   531		MOVOU T0, (8*16 + 6*16)(SP)
   532		increment(6)
   533		MOVOU T0, (8*16 + 7*16)(SP)
   534		increment(7)
   535	
   536		MOVOU (8*16 + 0*16)(SP), B0
   537		MOVOU (8*16 + 1*16)(SP), B1
   538		MOVOU (8*16 + 2*16)(SP), B2
   539		MOVOU (8*16 + 3*16)(SP), B3
   540		MOVOU (8*16 + 4*16)(SP), B4
   541		MOVOU (8*16 + 5*16)(SP), B5
   542		MOVOU (8*16 + 6*16)(SP), B6
   543		MOVOU (8*16 + 7*16)(SP), B7
   544	
      		// First eight blocks: AES rounds only (there is no ciphertext to hash
      		// yet). Each increment() refreshes a counter slot for the next batch.
   545		aesRound(1)
   546		increment(0)
   547		aesRound(2)
   548		increment(1)
   549		aesRound(3)
   550		increment(2)
   551		aesRound(4)
   552		increment(3)
   553		aesRound(5)
   554		increment(4)
   555		aesRound(6)
   556		increment(5)
   557		aesRound(7)
   558		increment(6)
   559		aesRound(8)
   560		increment(7)
   561		aesRound(9)
   562		MOVOU (16*10)(ks), T0
      		// Flags from this CMPQ are consumed by both JB and JE below; the SSE/AES
      		// instructions in between leave them untouched.
   563		CMPQ NR, $12
   564		JB encLast1
   565		aesRnd(T0)
   566		aesRound(11)
   567		MOVOU (16*12)(ks), T0
   568		JE encLast1
   569		aesRnd(T0)
   570		aesRound(13)
   571		MOVOU (16*14)(ks), T0
   572	encLast1:
   573		aesRndLast(T0)
   574	
      		// XOR the keystream with the plaintext.
   575		MOVOU (16*0)(ptx), T0
   576		PXOR T0, B0
   577		MOVOU (16*1)(ptx), T0
   578		PXOR T0, B1
   579		MOVOU (16*2)(ptx), T0
   580		PXOR T0, B2
   581		MOVOU (16*3)(ptx), T0
   582		PXOR T0, B3
   583		MOVOU (16*4)(ptx), T0
   584		PXOR T0, B4
   585		MOVOU (16*5)(ptx), T0
   586		PXOR T0, B5
   587		MOVOU (16*6)(ptx), T0
   588		PXOR T0, B6
   589		MOVOU (16*7)(ptx), T0
   590		PXOR T0, B7
   591	
      		// Write the ciphertext out, then byte-reverse each block for hashing.
   592		MOVOU B0, (16*0)(ctx)
   593		PSHUFB BSWAP, B0
   594		PXOR ACC0, B0	// fold the running hash state into the first block
   595		MOVOU B1, (16*1)(ctx)
   596		PSHUFB BSWAP, B1
   597		MOVOU B2, (16*2)(ctx)
   598		PSHUFB BSWAP, B2
   599		MOVOU B3, (16*3)(ctx)
   600		PSHUFB BSWAP, B3
   601		MOVOU B4, (16*4)(ctx)
   602		PSHUFB BSWAP, B4
   603		MOVOU B5, (16*5)(ctx)
   604		PSHUFB BSWAP, B5
   605		MOVOU B6, (16*6)(ctx)
   606		PSHUFB BSWAP, B6
   607		MOVOU B7, (16*7)(ctx)
   608		PSHUFB BSWAP, B7
   609	
      		// Save the eight blocks; they are hashed during the next batch's AES
      		// rounds or at gcmAesEncOctetsEnd.
   610		MOVOU B0, (16*0)(SP)
   611		MOVOU B1, (16*1)(SP)
   612		MOVOU B2, (16*2)(SP)
   613		MOVOU B3, (16*3)(SP)
   614		MOVOU B4, (16*4)(SP)
   615		MOVOU B5, (16*5)(SP)
   616		MOVOU B6, (16*6)(SP)
   617		MOVOU B7, (16*7)(SP)
   618	
   619		LEAQ 128(ptx), ptx
   620		LEAQ 128(ctx), ctx
   621	
   622	gcmAesEncOctetsLoop:
   623	
   624			CMPQ ptxLen, $128
   625			JB gcmAesEncOctetsEnd
   626			SUBQ $128, ptxLen
   627	
   628			MOVOU (8*16 + 0*16)(SP), B0
   629			MOVOU (8*16 + 1*16)(SP), B1
   630			MOVOU (8*16 + 2*16)(SP), B2
   631			MOVOU (8*16 + 3*16)(SP), B3
   632			MOVOU (8*16 + 4*16)(SP), B4
   633			MOVOU (8*16 + 5*16)(SP), B5
   634			MOVOU (8*16 + 6*16)(SP), B6
   635			MOVOU (8*16 + 7*16)(SP), B7
   636	
      			// Start hashing the previous batch: saved block 0 times table pair 0/1.
   637			MOVOU (16*0)(SP), T0
   638			PSHUFD $78, T0, T1
   639			PXOR T0, T1
   640	
   641			MOVOU (16*0)(pTbl), ACC0
   642			MOVOU (16*1)(pTbl), ACCM
   643			MOVOU ACC0, ACC1
   644	
   645			PCLMULQDQ $0x00, T1, ACCM
   646			PCLMULQDQ $0x00, T0, ACC0
   647			PCLMULQDQ $0x11, T0, ACC1
   648	
      			// AES rounds 1-7 of the current batch interleaved with the multiplies
      			// of saved blocks 1-7, plus counter preparation for the next batch.
   649			combinedRound(1)
   650			increment(0)
   651			combinedRound(2)
   652			increment(1)
   653			combinedRound(3)
   654			increment(2)
   655			combinedRound(4)
   656			increment(3)
   657			combinedRound(5)
   658			increment(4)
   659			combinedRound(6)
   660			increment(5)
   661			combinedRound(7)
   662			increment(6)
   663	
   664			aesRound(8)
   665			increment(7)
   666	
      			// Combine the partial products and reduce, with AES round 9 in between.
   667			PXOR ACC0, ACCM
   668			PXOR ACC1, ACCM
   669			MOVOU ACCM, T0
   670			PSRLDQ $8, ACCM
   671			PSLLDQ $8, T0
   672			PXOR ACCM, ACC1
   673			PXOR T0, ACC0
   674	
   675			reduceRound(ACC0)
   676			aesRound(9)
   677	
   678			reduceRound(ACC0)
   679			PXOR ACC1, ACC0
   680	
   681			MOVOU (16*10)(ks), T0
   682			CMPQ NR, $12
   683			JB encLast2
   684			aesRnd(T0)
   685			aesRound(11)
   686			MOVOU (16*12)(ks), T0
   687			JE encLast2
   688			aesRnd(T0)
   689			aesRound(13)
   690			MOVOU (16*14)(ks), T0
   691	encLast2:
   692			aesRndLast(T0)
   693	
   694			MOVOU (16*0)(ptx), T0
   695			PXOR T0, B0
   696			MOVOU (16*1)(ptx), T0
   697			PXOR T0, B1
   698			MOVOU (16*2)(ptx), T0
   699			PXOR T0, B2
   700			MOVOU (16*3)(ptx), T0
   701			PXOR T0, B3
   702			MOVOU (16*4)(ptx), T0
   703			PXOR T0, B4
   704			MOVOU (16*5)(ptx), T0
   705			PXOR T0, B5
   706			MOVOU (16*6)(ptx), T0
   707			PXOR T0, B6
   708			MOVOU (16*7)(ptx), T0
   709			PXOR T0, B7
   710	
   711			MOVOU B0, (16*0)(ctx)
   712			PSHUFB BSWAP, B0
   713			PXOR ACC0, B0	// fold the running hash state into the first block
   714			MOVOU B1, (16*1)(ctx)
   715			PSHUFB BSWAP, B1
   716			MOVOU B2, (16*2)(ctx)
   717			PSHUFB BSWAP, B2
   718			MOVOU B3, (16*3)(ctx)
   719			PSHUFB BSWAP, B3
   720			MOVOU B4, (16*4)(ctx)
   721			PSHUFB BSWAP, B4
   722			MOVOU B5, (16*5)(ctx)
   723			PSHUFB BSWAP, B5
   724			MOVOU B6, (16*6)(ctx)
   725			PSHUFB BSWAP, B6
   726			MOVOU B7, (16*7)(ctx)
   727			PSHUFB BSWAP, B7
   728	
   729			MOVOU B0, (16*0)(SP)
   730			MOVOU B1, (16*1)(SP)
   731			MOVOU B2, (16*2)(SP)
   732			MOVOU B3, (16*3)(SP)
   733			MOVOU B4, (16*4)(SP)
   734			MOVOU B5, (16*5)(SP)
   735			MOVOU B6, (16*6)(SP)
   736			MOVOU B7, (16*7)(SP)
   737	
   738			LEAQ 128(ptx), ptx
   739			LEAQ 128(ctx), ctx
   740	
   741			JMP gcmAesEncOctetsLoop
   742	
   743	gcmAesEncOctetsEnd:
   744	
      		// Hash the last batch of eight saved ciphertext blocks (no AES work left
      		// to interleave with).
   745		MOVOU (16*0)(SP), T0
   746		MOVOU (16*0)(pTbl), ACC0
   747		MOVOU (16*1)(pTbl), ACCM
   748		MOVOU ACC0, ACC1
   749		PSHUFD $78, T0, T1
   750		PXOR T0, T1
   751		PCLMULQDQ $0x00, T0, ACC0
   752		PCLMULQDQ $0x11, T0, ACC1
   753		PCLMULQDQ $0x00, T1, ACCM
   754	
   755		mulRound(1)
   756		mulRound(2)
   757		mulRound(3)
   758		mulRound(4)
   759		mulRound(5)
   760		mulRound(6)
   761		mulRound(7)
   762	
   763		PXOR ACC0, ACCM
   764		PXOR ACC1, ACCM
   765		MOVOU ACCM, T0
   766		PSRLDQ $8, ACCM
   767		PSLLDQ $8, T0
   768		PXOR ACCM, ACC1
   769		PXOR T0, ACC0
   770	
   771		reduceRound(ACC0)
   772		reduceRound(ACC0)
   773		PXOR ACC1, ACC0
   774	
   775		TESTQ ptxLen, ptxLen
   776		JE gcmAesEncDone
   777	
      		// Counter slots 0-7 were already advanced for a further batch of eight.
      		// Slot 0 holds the next block to use; rewind aluCTR by 7 so that the
      		// next increment(0) produces the counter value following slot 0's.
   778		SUBQ $7, aluCTR
   779	
   780	gcmAesEncSingles:
   781	
      		// One block at a time: keep round keys 1-7 in B1-B7 and the table entry
      		// at 16*14 in T2.
   782		MOVOU (16*1)(ks), B1
   783		MOVOU (16*2)(ks), B2
   784		MOVOU (16*3)(ks), B3
   785		MOVOU (16*4)(ks), B4
   786		MOVOU (16*5)(ks), B5
   787		MOVOU (16*6)(ks), B6
   788		MOVOU (16*7)(ks), B7
   789	
   790		MOVOU (16*14)(pTbl), T2
   791	
   792	gcmAesEncSinglesLoop:
   793	
   794			CMPQ ptxLen, $16
   795			JB gcmAesEncTail
   796			SUBQ $16, ptxLen
   797	
   798			MOVOU (8*16 + 0*16)(SP), B0	// current counter block (already ^ round key 0)
   799			increment(0)	// prepare slot 0 for the following block
   800	
   801			AESENC B1, B0
   802			AESENC B2, B0
   803			AESENC B3, B0
   804			AESENC B4, B0
   805			AESENC B5, B0
   806			AESENC B6, B0
   807			AESENC B7, B0
   808			MOVOU (16*8)(ks), T0
   809			AESENC T0, B0
   810			MOVOU (16*9)(ks), T0
   811			AESENC T0, B0
   812			MOVOU (16*10)(ks), T0
   813			CMPQ NR, $12
   814			JB encLast3
   815			AESENC T0, B0
   816			MOVOU (16*11)(ks), T0
   817			AESENC T0, B0
   818			MOVOU (16*12)(ks), T0
   819			JE encLast3
   820			AESENC T0, B0
   821			MOVOU (16*13)(ks), T0
   822			AESENC T0, B0
   823			MOVOU (16*14)(ks), T0
   824	encLast3:
   825			AESENCLAST T0, B0
   826	
   827			MOVOU (ptx), T0
   828			PXOR T0, B0
   829			MOVOU B0, (ctx)
   830	
      			// Hash the ciphertext block just written.
   831			PSHUFB BSWAP, B0
   832			PXOR ACC0, B0
   833	
   834			MOVOU T2, ACC0
   835			MOVOU T2, ACC1
   836			MOVOU (16*15)(pTbl), ACCM
   837	
   838			PSHUFD $78, B0, T0
   839			PXOR B0, T0
   840			PCLMULQDQ $0x00, B0, ACC0
   841			PCLMULQDQ $0x11, B0, ACC1
   842			PCLMULQDQ $0x00, T0, ACCM
   843	
   844			PXOR ACC0, ACCM
   845			PXOR ACC1, ACCM
   846			MOVOU ACCM, T0
   847			PSRLDQ $8, ACCM
   848			PSLLDQ $8, T0
   849			PXOR ACCM, ACC1
   850			PXOR T0, ACC0
   851	
   852			reduceRound(ACC0)
   853			reduceRound(ACC0)
   854			PXOR ACC1, ACC0
   855	
   856			LEAQ (16*1)(ptx), ptx
   857			LEAQ (16*1)(ctx), ctx
   858	
   859		JMP gcmAesEncSinglesLoop
   860	
   861	gcmAesEncTail:
   862		TESTQ ptxLen, ptxLen
   863		JE gcmAesEncDone
   864	
      		// 1..15 bytes remain: produce one more keystream block.
   865		MOVOU (8*16 + 0*16)(SP), B0
   866		AESENC B1, B0
   867		AESENC B2, B0
   868		AESENC B3, B0
   869		AESENC B4, B0
   870		AESENC B5, B0
   871		AESENC B6, B0
   872		AESENC B7, B0
   873		MOVOU (16*8)(ks), T0
   874		AESENC T0, B0
   875		MOVOU (16*9)(ks), T0
   876		AESENC T0, B0
   877		MOVOU (16*10)(ks), T0
   878		CMPQ NR, $12
   879		JB encLast4
   880		AESENC T0, B0
   881		MOVOU (16*11)(ks), T0
   882		AESENC T0, B0
   883		MOVOU (16*12)(ks), T0
   884		JE encLast4
   885		AESENC T0, B0
   886		MOVOU (16*13)(ks), T0
   887		AESENC T0, B0
   888		MOVOU (16*14)(ks), T0
   889	encLast4:
   890		AESENCLAST T0, B0
   891		MOVOU B0, T0	// T0 = keystream block
   892	
   893		LEAQ -1(ptx)(ptxLen*1), ptx	// point at the last plaintext byte
   894	
   895		MOVQ ptxLen, aluTMP
   896		SHLQ $4, aluTMP	// ptxLen * 16 = offset just past the wanted mask
   897	
      		// aluCTR is no longer needed as a counter and is reused as the table base.
   898		LEAQ andMask<>(SB), aluCTR
   899		MOVOU -16(aluCTR)(aluTMP*1), T1	// T1 = mask keeping the low ptxLen bytes
   900	
   901		PXOR B0, B0
   902	ptxLoadLoop:
      			// Load the remaining plaintext one byte at a time, walking backwards,
      			// so nothing past the end of src is read.
   903			PSLLDQ $1, B0
   904			PINSRB $0, (ptx), B0
   905			LEAQ -1(ptx), ptx
   906			DECQ ptxLen
   907		JNE ptxLoadLoop
   908	
   909		PXOR T0, B0
   910		PAND T1, B0	// clear the keystream bytes beyond the data
   911		MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT
   912	
      		// Hash the final block; its bytes beyond the data are zero after the PAND.
   913		PSHUFB BSWAP, B0
   914		PXOR ACC0, B0
   915	
   916		MOVOU T2, ACC0
   917		MOVOU T2, ACC1
   918		MOVOU (16*15)(pTbl), ACCM
   919	
   920		PSHUFD $78, B0, T0
   921		PXOR B0, T0
   922		PCLMULQDQ $0x00, B0, ACC0
   923		PCLMULQDQ $0x11, B0, ACC1
   924		PCLMULQDQ $0x00, T0, ACCM
   925	
   926		PXOR ACC0, ACCM
   927		PXOR ACC1, ACCM
   928		MOVOU ACCM, T0
   929		PSRLDQ $8, ACCM
   930		PSLLDQ $8, T0
   931		PXOR ACCM, ACC1
   932		PXOR T0, ACC0
   933	
   934		reduceRound(ACC0)
   935		reduceRound(ACC0)
   936		PXOR ACC1, ACC0
   937	
   938	gcmAesEncDone:
   939		MOVOU ACC0, (tPtr)	// store the updated hash state
   940		RET
   941	#undef increment
   942	
   943	// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
      	//
      	// gcmAesDec decrypts src (ciphertext, register alias ctx) into dst
      	// (plaintext, register alias ptx) in counter mode and folds the ciphertext
      	// into the GHASH state at *T (read on entry, written on exit). The hash is
      	// computed directly from the ciphertext in src, so only the eight counter
      	// blocks live on the stack, at (0*16 .. 7*16)(SP); increment() is redefined
      	// accordingly. As in gcmAesEnc, the first keystream block uses the counter
      	// from *ctr with its last big-endian 32-bit word incremented by one.
      	//
      	// The register aliases (pTbl, ctx, ptx, ks, ...) and the aesRnd, aesRound,
      	// aesRndLast and reduceRound macros are the ones defined earlier in the file.
   944	TEXT ·gcmAesDec(SB),0,$128-96
      	// increment(i): as in gcmAesEnc, but counter slot i lives at (i*16)(SP).
      	// combinedDecRound(i): AES round i on B0-B7 interleaved with the GHASH
      	//   partial products of ciphertext block i (loaded from ctx and
      	//   byte-reversed) and the table pair 16*(2i) / 16*(2i+1).
   945	#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
   946	#define combinedDecRound(i) \
   947		MOVOU (16*i)(ks), T0;\
   948		AESENC T0, B0;\
   949		AESENC T0, B1;\
   950		AESENC T0, B2;\
   951		AESENC T0, B3;\
   952		MOVOU (16*(i*2))(pTbl), T1;\
   953		MOVOU T1, T2;\
   954		AESENC T0, B4;\
   955		AESENC T0, B5;\
   956		AESENC T0, B6;\
   957		AESENC T0, B7;\
   958		MOVOU (16*i)(ctx), T0;\
   959		PSHUFB BSWAP, T0;\
   960		PCLMULQDQ $0x00, T0, T1;\
   961		PXOR T1, ACC0;\
   962		PSHUFD $78, T0, T1;\
   963		PCLMULQDQ $0x11, T0, T2;\
   964		PXOR T1, T0;\
   965		PXOR T2, ACC1;\
   966		MOVOU (16*(i*2+1))(pTbl), T2;\
   967		PCLMULQDQ $0x00, T2, T0;\
   968		PXOR T0, ACCM
   969	
   970		MOVQ productTable+0(FP), pTbl
   971		MOVQ dst+8(FP), ptx	// output: plaintext
   972		MOVQ src_base+32(FP), ctx	// input: ciphertext
   973		MOVQ src_len+40(FP), ptxLen
   974		MOVQ ctr+56(FP), ctrPtr
   975		MOVQ T+64(FP), tPtr
   976		MOVQ ks_base+72(FP), ks
   977		MOVQ ks_len+80(FP), NR
   978	
   979		SHRQ $2, NR	// len(ks)/4 = number of 16-byte round keys
   980		DECQ NR	// minus one = number of rounds
   981	
   982		MOVOU bswapMask<>(SB), BSWAP
   983		MOVOU gcmPoly<>(SB), POLY
   984	
   985		MOVOU (tPtr), ACC0	// incoming hash state
   986		PXOR ACC1, ACC1
   987		PXOR ACCM, ACCM
   988		MOVOU (ctrPtr), B0
   989		MOVL (3*4)(ctrPtr), aluCTR	// last 32-bit word of the counter block
   990		MOVOU (ks), T0
   991		MOVL (3*4)(ks), aluK	// matching word of round key 0
   992		BSWAPL aluCTR
   993		BSWAPL aluK
   994	
      		// T0 = counter block ^ round key 0; each slot is a copy of it with the
      		// last word replaced by increment().
   995		PXOR B0, T0
   996		MOVOU T0, (0*16)(SP)
   997		increment(0)
   998	
   999		CMPQ ptxLen, $128
  1000		JB gcmAesDecSingles
  1001	
  1002		MOVOU T0, (1*16)(SP)
  1003		increment(1)
  1004		MOVOU T0, (2*16)(SP)
  1005		increment(2)
  1006		MOVOU T0, (3*16)(SP)
  1007		increment(3)
  1008		MOVOU T0, (4*16)(SP)
  1009		increment(4)
  1010		MOVOU T0, (5*16)(SP)
  1011		increment(5)
  1012		MOVOU T0, (6*16)(SP)
  1013		increment(6)
  1014		MOVOU T0, (7*16)(SP)
  1015		increment(7)
  1016	
  1017	gcmAesDecOctetsLoop:
  1018	
  1019			CMPQ ptxLen, $128
  1020			JB gcmAesDecEndOctets
  1021			SUBQ $128, ptxLen
  1022	
  1023			MOVOU (0*16)(SP), B0
  1024			MOVOU (1*16)(SP), B1
  1025			MOVOU (2*16)(SP), B2
  1026			MOVOU (3*16)(SP), B3
  1027			MOVOU (4*16)(SP), B4
  1028			MOVOU (5*16)(SP), B5
  1029			MOVOU (6*16)(SP), B6
  1030			MOVOU (7*16)(SP), B7
  1031	
      			// Hash ciphertext block 0 (with the running state folded in) against
      			// table pair 0/1; blocks 1-7 follow inside combinedDecRound.
  1032			MOVOU (16*0)(ctx), T0
  1033			PSHUFB BSWAP, T0
  1034			PXOR ACC0, T0
  1035			PSHUFD $78, T0, T1
  1036			PXOR T0, T1
  1037	
  1038			MOVOU (16*0)(pTbl), ACC0
  1039			MOVOU (16*1)(pTbl), ACCM
  1040			MOVOU ACC0, ACC1
  1041	
  1042			PCLMULQDQ $0x00, T1, ACCM
  1043			PCLMULQDQ $0x00, T0, ACC0
  1044			PCLMULQDQ $0x11, T0, ACC1
  1045	
  1046			combinedDecRound(1)
  1047			increment(0)
  1048			combinedDecRound(2)
  1049			increment(1)
  1050			combinedDecRound(3)
  1051			increment(2)
  1052			combinedDecRound(4)
  1053			increment(3)
  1054			combinedDecRound(5)
  1055			increment(4)
  1056			combinedDecRound(6)
  1057			increment(5)
  1058			combinedDecRound(7)
  1059			increment(6)
  1060	
  1061			aesRound(8)
  1062			increment(7)
  1063	
      			// Combine the partial products and reduce, with AES round 9 in between.
  1064			PXOR ACC0, ACCM
  1065			PXOR ACC1, ACCM
  1066			MOVOU ACCM, T0
  1067			PSRLDQ $8, ACCM
  1068			PSLLDQ $8, T0
  1069			PXOR ACCM, ACC1
  1070			PXOR T0, ACC0
  1071	
  1072			reduceRound(ACC0)
  1073			aesRound(9)
  1074	
  1075			reduceRound(ACC0)
  1076			PXOR ACC1, ACC0
  1077	
  1078			MOVOU (16*10)(ks), T0
      			// Flags from this CMPQ are consumed by both JB and JE below.
  1079			CMPQ NR, $12
  1080			JB decLast1
  1081			aesRnd(T0)
  1082			aesRound(11)
  1083			MOVOU (16*12)(ks), T0
  1084			JE decLast1
  1085			aesRnd(T0)
  1086			aesRound(13)
  1087			MOVOU (16*14)(ks), T0
  1088	decLast1:
  1089			aesRndLast(T0)
  1090	
      			// XOR the keystream with the ciphertext to recover the plaintext.
  1091			MOVOU (16*0)(ctx), T0
  1092			PXOR T0, B0
  1093			MOVOU (16*1)(ctx), T0
  1094			PXOR T0, B1
  1095			MOVOU (16*2)(ctx), T0
  1096			PXOR T0, B2
  1097			MOVOU (16*3)(ctx), T0
  1098			PXOR T0, B3
  1099			MOVOU (16*4)(ctx), T0
  1100			PXOR T0, B4
  1101			MOVOU (16*5)(ctx), T0
  1102			PXOR T0, B5
  1103			MOVOU (16*6)(ctx), T0
  1104			PXOR T0, B6
  1105			MOVOU (16*7)(ctx), T0
  1106			PXOR T0, B7
  1107	
  1108			MOVOU B0, (16*0)(ptx)
  1109			MOVOU B1, (16*1)(ptx)
  1110			MOVOU B2, (16*2)(ptx)
  1111			MOVOU B3, (16*3)(ptx)
  1112			MOVOU B4, (16*4)(ptx)
  1113			MOVOU B5, (16*5)(ptx)
  1114			MOVOU B6, (16*6)(ptx)
  1115			MOVOU B7, (16*7)(ptx)
  1116	
  1117			LEAQ 128(ptx), ptx
  1118			LEAQ 128(ctx), ctx
  1119	
  1120			JMP gcmAesDecOctetsLoop
  1121	
  1122	gcmAesDecEndOctets:
  1123	
      		// Slots 0-7 were prepared for a batch of eight. Slot 0 holds the next
      		// block to use; rewind aluCTR by 7 so the next increment(0) produces the
      		// counter value following slot 0's.
  1124		SUBQ $7, aluCTR
  1125	
  1126	gcmAesDecSingles:
  1127	
      		// One block at a time: keep round keys 1-7 in B1-B7 and the table entry
      		// at 16*14 in T2.
  1128		MOVOU (16*1)(ks), B1
  1129		MOVOU (16*2)(ks), B2
  1130		MOVOU (16*3)(ks), B3
  1131		MOVOU (16*4)(ks), B4
  1132		MOVOU (16*5)(ks), B5
  1133		MOVOU (16*6)(ks), B6
  1134		MOVOU (16*7)(ks), B7
  1135	
  1136		MOVOU (16*14)(pTbl), T2
  1137	
  1138	gcmAesDecSinglesLoop:
  1139	
  1140			CMPQ ptxLen, $16
  1141			JB gcmAesDecTail
  1142			SUBQ $16, ptxLen
  1143	
      			// Hash the ciphertext block first, keeping an unmodified copy in T1 for
      			// the decryption below.
  1144			MOVOU (ctx), B0
  1145			MOVOU B0, T1
  1146			PSHUFB BSWAP, B0
  1147			PXOR ACC0, B0
  1148	
  1149			MOVOU T2, ACC0
  1150			MOVOU T2, ACC1
  1151			MOVOU (16*15)(pTbl), ACCM
  1152	
  1153			PCLMULQDQ $0x00, B0, ACC0
  1154			PCLMULQDQ $0x11, B0, ACC1
  1155			PSHUFD $78, B0, T0
  1156			PXOR B0, T0
  1157			PCLMULQDQ $0x00, T0, ACCM
  1158	
  1159			PXOR ACC0, ACCM
  1160			PXOR ACC1, ACCM
  1161			MOVOU ACCM, T0
  1162			PSRLDQ $8, ACCM
  1163			PSLLDQ $8, T0
  1164			PXOR ACCM, ACC1
  1165			PXOR T0, ACC0
  1166	
  1167			reduceRound(ACC0)
  1168			reduceRound(ACC0)
  1169			PXOR ACC1, ACC0
  1170	
      			// Generate the keystream block and decrypt.
  1171			MOVOU (0*16)(SP), B0
  1172			increment(0)
  1173			AESENC B1, B0
  1174			AESENC B2, B0
  1175			AESENC B3, B0
  1176			AESENC B4, B0
  1177			AESENC B5, B0
  1178			AESENC B6, B0
  1179			AESENC B7, B0
  1180			MOVOU (16*8)(ks), T0
  1181			AESENC T0, B0
  1182			MOVOU (16*9)(ks), T0
  1183			AESENC T0, B0
  1184			MOVOU (16*10)(ks), T0
  1185			CMPQ NR, $12
  1186			JB decLast2
  1187			AESENC T0, B0
  1188			MOVOU (16*11)(ks), T0
  1189			AESENC T0, B0
  1190			MOVOU (16*12)(ks), T0
  1191			JE decLast2
  1192			AESENC T0, B0
  1193			MOVOU (16*13)(ks), T0
  1194			AESENC T0, B0
  1195			MOVOU (16*14)(ks), T0
  1196	decLast2:
  1197			AESENCLAST T0, B0
  1198	
  1199			PXOR T1, B0	// keystream ^ saved ciphertext = plaintext
  1200			MOVOU B0, (ptx)
  1201	
  1202			LEAQ (16*1)(ptx), ptx
  1203			LEAQ (16*1)(ctx), ctx
  1204	
  1205		JMP gcmAesDecSinglesLoop
  1206	
  1207	gcmAesDecTail:
  1208	
  1209		TESTQ ptxLen, ptxLen
  1210		JE gcmAesDecDone
  1211	
      		// 1..15 bytes remain. aluCTR is reused as the andMask base pointer; the
      		// increment(0) further down is therefore of no consequence.
  1212		MOVQ ptxLen, aluTMP
  1213		SHLQ $4, aluTMP	// ptxLen * 16 = offset just past the wanted mask
  1214		LEAQ andMask<>(SB), aluCTR
  1215		MOVOU -16(aluCTR)(aluTMP*1), T1	// T1 = mask keeping the low ptxLen bytes
  1216	
  1217		MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
  1218		PAND T1, B0	// drop whatever follows the ciphertext in the 16-byte load
  1219	
  1220		MOVOU B0, T1	// keep the masked ciphertext for the final XOR
  1221		PSHUFB BSWAP, B0
  1222		PXOR ACC0, B0
  1223	
  1224		MOVOU (16*14)(pTbl), ACC0
  1225		MOVOU (16*15)(pTbl), ACCM
  1226		MOVOU ACC0, ACC1
  1227	
  1228		PCLMULQDQ $0x00, B0, ACC0
  1229		PCLMULQDQ $0x11, B0, ACC1
  1230		PSHUFD $78, B0, T0
  1231		PXOR B0, T0
  1232		PCLMULQDQ $0x00, T0, ACCM
  1233	
  1234		PXOR ACC0, ACCM
  1235		PXOR ACC1, ACCM
  1236		MOVOU ACCM, T0
  1237		PSRLDQ $8, ACCM
  1238		PSLLDQ $8, T0
  1239		PXOR ACCM, ACC1
  1240		PXOR T0, ACC0
  1241	
  1242		reduceRound(ACC0)
  1243		reduceRound(ACC0)
  1244		PXOR ACC1, ACC0
  1245	
  1246		MOVOU (0*16)(SP), B0
  1247		increment(0)
  1248		AESENC B1, B0
  1249		AESENC B2, B0
  1250		AESENC B3, B0
  1251		AESENC B4, B0
  1252		AESENC B5, B0
  1253		AESENC B6, B0
  1254		AESENC B7, B0
  1255		MOVOU (16*8)(ks), T0
  1256		AESENC T0, B0
  1257		MOVOU (16*9)(ks), T0
  1258		AESENC T0, B0
  1259		MOVOU (16*10)(ks), T0
  1260		CMPQ NR, $12
  1261		JB decLast3
  1262		AESENC T0, B0
  1263		MOVOU (16*11)(ks), T0
  1264		AESENC T0, B0
  1265		MOVOU (16*12)(ks), T0
  1266		JE decLast3
  1267		AESENC T0, B0
  1268		MOVOU (16*13)(ks), T0
  1269		AESENC T0, B0
  1270		MOVOU (16*14)(ks), T0
  1271	decLast3:
  1272		AESENCLAST T0, B0
  1273		PXOR T1, B0	// keystream ^ masked ciphertext
  1274	
  1275	ptxStoreLoop:
      			// Store exactly ptxLen plaintext bytes, one at a time, lowest byte first,
      			// so nothing is written past the end of the data.
  1276			PEXTRB $0, B0, (ptx)
  1277			PSRLDQ $1, B0
  1278			LEAQ 1(ptx), ptx
  1279			DECQ ptxLen
  1280	
  1281		JNE ptxStoreLoop
  1282	
  1283	gcmAesDecDone:
  1284	
  1285		MOVOU ACC0, (tPtr)	// store the updated hash state
  1286		RET

View as plain text