...

Text file src/crypto/aes/gcm_arm64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	#define B0 V0
     8	#define B1 V1
     9	#define B2 V2
    10	#define B3 V3
    11	#define B4 V4
    12	#define B5 V5
    13	#define B6 V6
    14	#define B7 V7
    15	
    16	#define ACC0 V8
    17	#define ACC1 V9
    18	#define ACCM V10
    19	
    20	#define T0 V11
    21	#define T1 V12
    22	#define T2 V13
    23	#define T3 V14
    24	
    25	#define POLY V15
    26	#define ZERO V16
    27	#define INC V17
    28	#define CTR V18
    29	
    30	#define K0 V19
    31	#define K1 V20
    32	#define K2 V21
    33	#define K3 V22
    34	#define K4 V23
    35	#define K5 V24
    36	#define K6 V25
    37	#define K7 V26
    38	#define K8 V27
    39	#define K9 V28
    40	#define K10 V29
    41	#define K11 V30
    42	#define KLAST V31
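// Register allocation, as fixed by the #defines above: B0-B7 hold the data
// blocks in flight, ACC0/ACC1/ACCM accumulate the partial carry-less products
// for GHASH, T0-T3 are scratch, POLY holds the GHASH reduction constant,
// ZERO is an all-zero vector, CTR/INC carry the running counter and its
// increment, and K0-KLAST cache the expanded AES round keys.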
    43	
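// reduce() folds the double-width carry-less product accumulated in ACC0 and
// ACC1, together with the Karatsuba middle term in ACCM, back down to a
// single 128-bit value modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1.
// The middle term is split and merged into the two halves, then two folding
// multiplications by the constant in POLY complete the reduction; the result
// is left in ACC0.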
    44	#define reduce() \
    45		VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
    46		VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
    47		VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
    48		VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
    49		VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
    50		VEOR	T0.B16, ACC1.B16, ACC1.B16       \
    51		VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
    52		VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
    53		VEOR	T0.B16, ACC0.B16, ACC0.B16       \
    54		VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
    55		VEOR	T0.B16, ACC1.B16, ACC1.B16       \
    56		VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
    57		VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \
    58	
    59	// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
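// gcmAesFinish folds the plaintext and additional-data bit lengths into the
// running GHASH state at *T, performs one more multiplication by the hash key
// H (the last productTable entry), and XORs in tagMask (the encrypted
// pre-counter block) to produce the final tag, as GCM's tag computation
// requires. Roughly, with hypothetical helper names:
//
//	S := bytesToFieldElement(*T)
//	S = ghashMul(S ^ lengthBlock(dLen*8, pLen*8), H)
//	*T = fieldElementToBytes(S) ^ *tagMask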
    60	TEXT ·gcmAesFinish(SB),NOSPLIT,$0
    61	#define pTbl R0
    62	#define tMsk R1
    63	#define tPtr R2
    64	#define plen R3
    65	#define dlen R4
    66	
    67		MOVD	$0xC2, R1
    68		LSL	$56, R1
    69		MOVD	$1, R0
    70		VMOV	R1, POLY.D[0]
    71		VMOV	R0, POLY.D[1]
    72		VEOR	ZERO.B16, ZERO.B16, ZERO.B16
    73	
    74		MOVD	productTable+0(FP), pTbl
    75		MOVD	tagMask+8(FP), tMsk
    76		MOVD	T+16(FP), tPtr
    77		MOVD	pLen+24(FP), plen
    78		MOVD	dLen+32(FP), dlen
    79	
    80		VLD1	(tPtr), [ACC0.B16]
    81		VLD1	(tMsk), [B1.B16]
    82	
    83		LSL	$3, plen
    84		LSL	$3, dlen
    85	
    86		VMOV	dlen, B0.D[0]
    87		VMOV	plen, B0.D[1]
    88	
    89		ADD	$14*16, pTbl
    90		VLD1.P	(pTbl), [T1.B16, T2.B16]
    91	
    92		VEOR	ACC0.B16, B0.B16, B0.B16
    93	
    94		VEXT	$8, B0.B16, B0.B16, T0.B16
    95		VEOR	B0.B16, T0.B16, T0.B16
    96		VPMULL	B0.D1, T1.D1, ACC1.Q1
    97		VPMULL2	B0.D2, T1.D2, ACC0.Q1
    98		VPMULL	T0.D1, T2.D1, ACCM.Q1
    99	
   100		reduce()
   101	
   102		VREV64	ACC0.B16, ACC0.B16
   103		VEOR	B1.B16, ACC0.B16, ACC0.B16
   104	
   105		VST1	[ACC0.B16], (tPtr)
   106		RET
   107	#undef pTbl
   108	#undef tMsk
   109	#undef tPtr
   110	#undef plen
   111	#undef dlen
   112	
   113	// func gcmAesInit(productTable *[256]byte, ks []uint32)
   114	TEXT ·gcmAesInit(SB),NOSPLIT,$0
   115	#define pTbl R0
   116	#define KS R1
   117	#define NR R2
   118	#define I R3
   119		MOVD	productTable+0(FP), pTbl
   120		MOVD	ks_base+8(FP), KS
   121		MOVD	ks_len+16(FP), NR
   122	
   123		MOVD	$0xC2, I
   124		LSL	$56, I
   125		VMOV	I, POLY.D[0]
   126		MOVD	$1, I
   127		VMOV	I, POLY.D[1]
   128		VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   129	
   130		// Encrypt block 0 with the AES key to generate the hash key H
   131		VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
   132		VEOR	B0.B16, B0.B16, B0.B16
   133		AESE	T0.B16, B0.B16
   134		AESMC	B0.B16, B0.B16
   135		AESE	T1.B16, B0.B16
   136		AESMC	B0.B16, B0.B16
   137		AESE	T2.B16, B0.B16
   138		AESMC	B0.B16, B0.B16
   139		AESE	T3.B16, B0.B16
   140		AESMC	B0.B16, B0.B16
   141		VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
   142		AESE	T0.B16, B0.B16
   143		AESMC	B0.B16, B0.B16
   144		AESE	T1.B16, B0.B16
   145		AESMC	B0.B16, B0.B16
   146		AESE	T2.B16, B0.B16
   147		AESMC	B0.B16, B0.B16
   148		AESE	T3.B16, B0.B16
   149		AESMC	B0.B16, B0.B16
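	// ks_len is 44, 52 or 60 words for AES-128, -192 and -256 respectively,
	// so bit 4 of NR (tested with TBZ below) separates AES-128 from the
	// larger key sizes, and bit 3 then separates AES-192 from AES-256,
	// selecting how many rounds to run beyond the eight above.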
   150		TBZ	$4, NR, initEncFinish
   151		VLD1.P	32(KS), [T0.B16, T1.B16]
   152		AESE	T0.B16, B0.B16
   153		AESMC	B0.B16, B0.B16
   154		AESE	T1.B16, B0.B16
   155		AESMC	B0.B16, B0.B16
   156		TBZ	$3, NR, initEncFinish
   157		VLD1.P	32(KS), [T0.B16, T1.B16]
   158		AESE	T0.B16, B0.B16
   159		AESMC	B0.B16, B0.B16
   160		AESE	T1.B16, B0.B16
   161		AESMC	B0.B16, B0.B16
   162	initEncFinish:
   163		VLD1	(KS), [T0.B16, T1.B16, T2.B16]
   164		AESE	T0.B16, B0.B16
   165		AESMC	B0.B16, B0.B16
   166		AESE	T1.B16, B0.B16
   167	 	VEOR	T2.B16, B0.B16, B0.B16
   168	
   169		VREV64	B0.B16, B0.B16
   170	
   171		// Multiply by 2 modulo P
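	// Doubling H (a carry-less multiplication by x in GF(2^128)) adapts it
	// to the byte-reversed block representation used by the VPMULL-based
	// multiplications below. The ASR/VAND sequence conditionally XORs the
	// reduction polynomial back in when the bit shifted out at the top is
	// set, and the VUSHR/VEXT pair carries the bit crossing the 64-bit
	// lane boundary.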
   172		VMOV	B0.D[0], I
   173		ASR	$63, I
   174		VMOV	I, T1.D[0]
   175		VMOV	I, T1.D[1]
   176		VAND	POLY.B16, T1.B16, T1.B16
   177		VUSHR	$63, B0.D2, T2.D2
   178		VEXT	$8, ZERO.B16, T2.B16, T2.B16
   179		VSHL	$1, B0.D2, B0.D2
   180		VEOR	T1.B16, B0.B16, B0.B16
   181		VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
   182	
   183		// Karatsuba pre-computation
   184		VEXT	$8, B0.B16, B0.B16, B1.B16
   185		VEOR	B0.B16, B1.B16, B1.B16
   186	
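	// productTable holds eight 32-byte entries, each a power of H followed
	// by its Karatsuba pre-computation (the XOR of its two 64-bit halves,
	// B1 above). H itself is stored at the end of the table and initLoop
	// walks backwards, so the table reads H^8, H^7, ..., H^1 from the start;
	// the 8-block GHASH loops consume it in that order, while the
	// single-block paths index the last entry directly (ADD $14*16).
	// Rough layout, with hypothetical Go-like indexing:
	//
	//	productTable[0*32:]  H^8, hi(H^8)^lo(H^8)
	//	...
	//	productTable[7*32:]  H^1, hi(H^1)^lo(H^1)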
   187		ADD	$14*16, pTbl
   188		VST1	[B0.B16, B1.B16], (pTbl)
   189		SUB	$2*16, pTbl
   190	
   191		VMOV	B0.B16, B2.B16
   192		VMOV	B1.B16, B3.B16
   193	
   194		MOVD	$7, I
   195	
   196	initLoop:
   197		// Compute powers of H
   198		SUBS	$1, I
   199	
   200		VPMULL	B0.D1, B2.D1, T1.Q1
   201		VPMULL2	B0.D2, B2.D2, T0.Q1
   202		VPMULL	B1.D1, B3.D1, T2.Q1
   203		VEOR	T0.B16, T2.B16, T2.B16
   204		VEOR	T1.B16, T2.B16, T2.B16
   205		VEXT	$8, ZERO.B16, T2.B16, T3.B16
   206		VEXT	$8, T2.B16, ZERO.B16, T2.B16
   207		VEOR	T2.B16, T0.B16, T0.B16
   208		VEOR	T3.B16, T1.B16, T1.B16
   209		VPMULL	POLY.D1, T0.D1, T2.Q1
   210		VEXT	$8, T0.B16, T0.B16, T0.B16
   211		VEOR	T2.B16, T0.B16, T0.B16
   212		VPMULL	POLY.D1, T0.D1, T2.Q1
   213		VEXT	$8, T0.B16, T0.B16, T0.B16
   214		VEOR	T2.B16, T0.B16, T0.B16
   215		VEOR	T1.B16, T0.B16, B2.B16
   216		VMOV	B2.B16, B3.B16
   217		VEXT	$8, B2.B16, B2.B16, B2.B16
   218		VEOR	B2.B16, B3.B16, B3.B16
   219	
   220		VST1	[B2.B16, B3.B16], (pTbl)
   221		SUB	$2*16, pTbl
   222	
   223		BNE	initLoop
   224		RET
   225	#undef I
   226	#undef NR
   227	#undef KS
   228	#undef pTbl
   229	
   230	// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
   231	TEXT ·gcmAesData(SB),NOSPLIT,$0
   232	#define pTbl R0
   233	#define aut R1
   234	#define tPtr R2
   235	#define autLen R3
   236	#define H0 R4
   237	#define pTblSave R5
   238	
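// mulRound(X) byte-reverses one input block X, multiplies it by the next
// power of H from productTable (loaded together with its Karatsuba
// pre-computation), and XORs the three partial products into ACC1/ACC0/ACCM.
// The reduction is deferred: eight blocks are accumulated and then folded
// with a single reduce(). Only the first block of each group absorbs the
// previous GHASH state (the explicit VEOR with ACC0 before the first
// multiply); the remaining seven are just multiplied by successively lower
// powers of H, which is how the GHASH polynomial evaluation factors out.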
   239	#define mulRound(X) \
   240		VLD1.P	32(pTbl), [T1.B16, T2.B16] \
   241		VREV64	X.B16, X.B16               \
   242		VEXT	$8, X.B16, X.B16, T0.B16   \
   243		VEOR	X.B16, T0.B16, T0.B16      \
   244		VPMULL	X.D1, T1.D1, T3.Q1         \
   245		VEOR	T3.B16, ACC1.B16, ACC1.B16 \
   246		VPMULL2	X.D2, T1.D2, T3.Q1         \
   247		VEOR	T3.B16, ACC0.B16, ACC0.B16 \
   248		VPMULL	T0.D1, T2.D1, T3.Q1        \
   249		VEOR	T3.B16, ACCM.B16, ACCM.B16
   250	
   251		MOVD	productTable+0(FP), pTbl
   252		MOVD	data_base+8(FP), aut
   253		MOVD	data_len+16(FP), autLen
   254		MOVD	T+32(FP), tPtr
   255	
   256		VEOR	ACC0.B16, ACC0.B16, ACC0.B16
   257		CBZ	autLen, dataBail
   258	
   259		MOVD	$0xC2, H0
   260		LSL	$56, H0
   261		VMOV	H0, POLY.D[0]
   262		MOVD	$1, H0
   263		VMOV	H0, POLY.D[1]
   264		VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   265		MOVD	pTbl, pTblSave
   266	
   267		CMP	$13, autLen
   268		BEQ	dataTLS
   269		CMP	$128, autLen
   270		BLT	startSinglesLoop
   271		B	octetsLoop
   272	
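	// Fast path for 13 bytes of additional data, the length used by TLS 1.2
	// AES-GCM records (8-byte sequence number plus 5-byte record header):
	// assemble the block with one 8-, one 4- and one 1-byte load instead of
	// the byte-at-a-time tail loop, then fall into dataMul for a single
	// GHASH multiplication.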
   273	dataTLS:
   274		ADD	$14*16, pTbl
   275		VLD1.P	(pTbl), [T1.B16, T2.B16]
   276		VEOR	B0.B16, B0.B16, B0.B16
   277	
   278		MOVD	(aut), H0
   279		VMOV	H0, B0.D[0]
   280		MOVW	8(aut), H0
   281		VMOV	H0, B0.S[2]
   282		MOVB	12(aut), H0
   283		VMOV	H0, B0.B[12]
   284	
   285		MOVD	$0, autLen
   286		B	dataMul
   287	
   288	octetsLoop:
   289			CMP	$128, autLen
   290			BLT	startSinglesLoop
   291			SUB	$128, autLen
   292	
   293			VLD1.P	32(aut), [B0.B16, B1.B16]
   294	
   295			VLD1.P	32(pTbl), [T1.B16, T2.B16]
   296			VREV64	B0.B16, B0.B16
   297			VEOR	ACC0.B16, B0.B16, B0.B16
   298			VEXT	$8, B0.B16, B0.B16, T0.B16
   299			VEOR	B0.B16, T0.B16, T0.B16
   300			VPMULL	B0.D1, T1.D1, ACC1.Q1
   301			VPMULL2	B0.D2, T1.D2, ACC0.Q1
   302			VPMULL	T0.D1, T2.D1, ACCM.Q1
   303	
   304			mulRound(B1)
   305			VLD1.P  32(aut), [B2.B16, B3.B16]
   306			mulRound(B2)
   307			mulRound(B3)
   308			VLD1.P  32(aut), [B4.B16, B5.B16]
   309			mulRound(B4)
   310			mulRound(B5)
   311			VLD1.P  32(aut), [B6.B16, B7.B16]
   312			mulRound(B6)
   313			mulRound(B7)
   314	
   315			MOVD	pTblSave, pTbl
   316			reduce()
   317		B	octetsLoop
   318	
   319	startSinglesLoop:
   320	
   321		ADD	$14*16, pTbl
   322		VLD1.P	(pTbl), [T1.B16, T2.B16]
   323	
   324	singlesLoop:
   325	
   326			CMP	$16, autLen
   327			BLT	dataEnd
   328			SUB	$16, autLen
   329	
   330			VLD1.P	16(aut), [B0.B16]
   331	dataMul:
   332			VREV64	B0.B16, B0.B16
   333			VEOR	ACC0.B16, B0.B16, B0.B16
   334	
   335			VEXT	$8, B0.B16, B0.B16, T0.B16
   336			VEOR	B0.B16, T0.B16, T0.B16
   337			VPMULL	B0.D1, T1.D1, ACC1.Q1
   338			VPMULL2	B0.D2, T1.D2, ACC0.Q1
   339			VPMULL	T0.D1, T2.D1, ACCM.Q1
   340	
   341			reduce()
   342	
   343		B	singlesLoop
   344	
   345	dataEnd:
   346	
   347		CBZ	autLen, dataBail
   348		VEOR	B0.B16, B0.B16, B0.B16
   349		ADD	autLen, aut
   350	
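	// Partial final block: aut now points one past the end of the data, and
	// the loop below shifts the remaining bytes into B0 starting from the
	// last one, leaving them in the low bytes of the block with the rest
	// zeroed, before finishing with one more multiplication at dataMul.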
   351	dataLoadLoop:
   352			MOVB.W	-1(aut), H0
   353			VEXT	$15, B0.B16, ZERO.B16, B0.B16
   354			VMOV	H0, B0.B[0]
   355			SUBS	$1, autLen
   356			BNE	dataLoadLoop
   357		B	dataMul
   358	
   359	dataBail:
   360		VST1	[ACC0.B16], (tPtr)
   361		RET
   362	
   363	#undef pTbl
   364	#undef aut
   365	#undef tPtr
   366	#undef autLen
   367	#undef H0
   368	#undef pTblSave
   369	
   370	// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
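// gcmAesEnc encrypts src into dst in counter mode and folds the produced
// ciphertext into the GHASH state at *T. While at least 128 bytes remain it
// processes eight blocks per iteration, interleaving the AES rounds of all
// eight counter blocks; it then handles whole single blocks, and finally a
// sub-16-byte tail via a masked, zero-padded block.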
   371	TEXT ·gcmAesEnc(SB),NOSPLIT,$0
   372	#define pTbl R0
   373	#define dstPtr R1
   374	#define ctrPtr R2
   375	#define srcPtr R3
   376	#define ks R4
   377	#define tPtr R5
   378	#define srcPtrLen R6
   379	#define aluCTR R7
   380	#define aluTMP R8
   381	#define aluK R9
   382	#define NR R10
   383	#define H0 R11
   384	#define H1 R12
   385	#define curK R13
   386	#define pTblSave R14
   387	
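// aesrndx8 runs one AES round (AESE, i.e. AddRoundKey+SubBytes+ShiftRows,
// followed by AESMC for MixColumns) with round key K over all eight blocks
// B0-B7. aesrndlastx8 is the final round, which omits MixColumns; the last
// AddRoundKey is applied separately by XORing KLAST into each block.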
   388	#define aesrndx8(K) \
   389		AESE	K.B16, B0.B16    \
   390		AESMC	B0.B16, B0.B16   \
   391		AESE	K.B16, B1.B16    \
   392		AESMC	B1.B16, B1.B16   \
   393		AESE	K.B16, B2.B16    \
   394		AESMC	B2.B16, B2.B16   \
   395		AESE	K.B16, B3.B16    \
   396		AESMC	B3.B16, B3.B16   \
   397		AESE	K.B16, B4.B16    \
   398		AESMC	B4.B16, B4.B16   \
   399		AESE	K.B16, B5.B16    \
   400		AESMC	B5.B16, B5.B16   \
   401		AESE	K.B16, B6.B16    \
   402		AESMC	B6.B16, B6.B16   \
   403		AESE	K.B16, B7.B16    \
   404		AESMC	B7.B16, B7.B16
   405	
   406	#define aesrndlastx8(K) \
   407		AESE	K.B16, B0.B16    \
   408		AESE	K.B16, B1.B16    \
   409		AESE	K.B16, B2.B16    \
   410		AESE	K.B16, B3.B16    \
   411		AESE	K.B16, B4.B16    \
   412		AESE	K.B16, B5.B16    \
   413		AESE	K.B16, B6.B16    \
   414		AESE	K.B16, B7.B16
   415	
   416		MOVD	productTable+0(FP), pTbl
   417		MOVD	dst+8(FP), dstPtr
   418		MOVD	src_base+32(FP), srcPtr
   419		MOVD	src_len+40(FP), srcPtrLen
   420		MOVD	ctr+56(FP), ctrPtr
   421		MOVD	T+64(FP), tPtr
   422		MOVD	ks_base+72(FP), ks
   423		MOVD	ks_len+80(FP), NR
   424	
   425		MOVD	$0xC2, H1
   426		LSL	$56, H1
   427		MOVD	$1, H0
   428		VMOV	H1, POLY.D[0]
   429		VMOV	H0, POLY.D[1]
   430		VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   431		// Compute NR from len(ks)
   432		MOVD	pTbl, pTblSave
   433		// Current tag, after AAD
   434		VLD1	(tPtr), [ACC0.B16]
   435		VEOR	ACC1.B16, ACC1.B16, ACC1.B16
   436		VEOR	ACCM.B16, ACCM.B16, ACCM.B16
   437		// Prepare initial counter, and the increment vector
   438		VLD1	(ctrPtr), [CTR.B16]
   439		VEOR	INC.B16, INC.B16, INC.B16
   440		MOVD	$1, H0
   441		VMOV	H0, INC.S[3]
   442		VREV32	CTR.B16, CTR.B16
   443		VADD	CTR.S4, INC.S4, CTR.S4
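	// The counter is kept in CTR with its 32-bit words byte-swapped to host
	// order (the VREV32 above), so it can be stepped with a plain VADD of
	// INC, which holds 1 in its last word; each derived counter block is
	// swapped back with VREV32 before being encrypted. The VADD here
	// advances the counter once before the first block is generated.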
   444		// Skip to <8 blocks loop
   445		CMP	$128, srcPtrLen
   446	
   447		MOVD	ks, H0
   448		// For AES-128 round keys are stored in: K0 .. K10, KLAST
   449		VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
   450		VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
   451		VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
   452		VMOV	K10.B16, KLAST.B16
   453	
   454		BLT	startSingles
   455		// There are at least 8 blocks to encrypt
   456		TBZ	$4, NR, octetsLoop
   457	
   458		// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
   459		VMOV	K8.B16, K10.B16
   460		VMOV	K9.B16, K11.B16
   461		VMOV	KLAST.B16, K8.B16
   462		VLD1.P	16(H0), [K9.B16]
   463		VLD1.P  16(H0), [KLAST.B16]
   464		TBZ	$3, NR, octetsLoop
   465		// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
   466		VMOV	KLAST.B16, K8.B16
   467		VLD1.P	16(H0), [K9.B16]
   468		VLD1.P  16(H0), [KLAST.B16]
   469		ADD	$10*16, ks, H0
   470		MOVD	H0, curK
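	// For AES-256 two round keys do not fit in the remaining registers:
	// curK points at them inside the expanded key schedule (offset 10*16),
	// and octetsLoop reloads them into T1/T2 on every iteration.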
   471	
   472	octetsLoop:
   473			SUB	$128, srcPtrLen
   474	
   475			VMOV	CTR.B16, B0.B16
   476			VADD	B0.S4, INC.S4, B1.S4
   477			VREV32	B0.B16, B0.B16
   478			VADD	B1.S4, INC.S4, B2.S4
   479			VREV32	B1.B16, B1.B16
   480			VADD	B2.S4, INC.S4, B3.S4
   481			VREV32	B2.B16, B2.B16
   482			VADD	B3.S4, INC.S4, B4.S4
   483			VREV32	B3.B16, B3.B16
   484			VADD	B4.S4, INC.S4, B5.S4
   485			VREV32	B4.B16, B4.B16
   486			VADD	B5.S4, INC.S4, B6.S4
   487			VREV32	B5.B16, B5.B16
   488			VADD	B6.S4, INC.S4, B7.S4
   489			VREV32	B6.B16, B6.B16
   490			VADD	B7.S4, INC.S4, CTR.S4
   491			VREV32	B7.B16, B7.B16
   492	
   493			aesrndx8(K0)
   494			aesrndx8(K1)
   495			aesrndx8(K2)
   496			aesrndx8(K3)
   497			aesrndx8(K4)
   498			aesrndx8(K5)
   499			aesrndx8(K6)
   500			aesrndx8(K7)
   501			TBZ	$4, NR, octetsFinish
   502			aesrndx8(K10)
   503			aesrndx8(K11)
   504			TBZ	$3, NR, octetsFinish
   505			VLD1.P	32(curK), [T1.B16, T2.B16]
   506			aesrndx8(T1)
   507			aesrndx8(T2)
   508			MOVD	H0, curK
   509	octetsFinish:
   510			aesrndx8(K8)
   511			aesrndlastx8(K9)
   512	
   513			VEOR	KLAST.B16, B0.B16, B0.B16
   514			VEOR	KLAST.B16, B1.B16, B1.B16
   515			VEOR	KLAST.B16, B2.B16, B2.B16
   516			VEOR	KLAST.B16, B3.B16, B3.B16
   517			VEOR	KLAST.B16, B4.B16, B4.B16
   518			VEOR	KLAST.B16, B5.B16, B5.B16
   519			VEOR	KLAST.B16, B6.B16, B6.B16
   520			VEOR	KLAST.B16, B7.B16, B7.B16
   521	
   522			VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   523			VEOR	B0.B16, T1.B16, B0.B16
   524			VEOR	B1.B16, T2.B16, B1.B16
   525			VST1.P  [B0.B16, B1.B16], 32(dstPtr)
   526			VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   527			VEOR	B2.B16, T1.B16, B2.B16
   528			VEOR	B3.B16, T2.B16, B3.B16
   529			VST1.P  [B2.B16, B3.B16], 32(dstPtr)
   530			VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   531			VEOR	B4.B16, T1.B16, B4.B16
   532			VEOR	B5.B16, T2.B16, B5.B16
   533			VST1.P  [B4.B16, B5.B16], 32(dstPtr)
   534			VLD1.P	32(srcPtr), [T1.B16, T2.B16]
   535			VEOR	B6.B16, T1.B16, B6.B16
   536			VEOR	B7.B16, T2.B16, B7.B16
   537			VST1.P  [B6.B16, B7.B16], 32(dstPtr)
   538	
   539			VLD1.P	32(pTbl), [T1.B16, T2.B16]
   540			VREV64	B0.B16, B0.B16
   541			VEOR	ACC0.B16, B0.B16, B0.B16
   542			VEXT	$8, B0.B16, B0.B16, T0.B16
   543			VEOR	B0.B16, T0.B16, T0.B16
   544			VPMULL	B0.D1, T1.D1, ACC1.Q1
   545			VPMULL2	B0.D2, T1.D2, ACC0.Q1
   546			VPMULL	T0.D1, T2.D1, ACCM.Q1
   547	
   548			mulRound(B1)
   549			mulRound(B2)
   550			mulRound(B3)
   551			mulRound(B4)
   552			mulRound(B5)
   553			mulRound(B6)
   554			mulRound(B7)
   555			MOVD	pTblSave, pTbl
   556			reduce()
   557	
   558			CMP	$128, srcPtrLen
   559			BGE	octetsLoop
   560	
   561	startSingles:
   562		CBZ	srcPtrLen, done
   563		ADD	$14*16, pTbl
   564		// Preload H and its Karatsuba precomp
   565		VLD1.P	(pTbl), [T1.B16, T2.B16]
   566		// Preload AES round keys
   567		ADD	$128, ks
   568		VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
   569		VMOV	K10.B16, KLAST.B16
   570		TBZ	$4, NR, singlesLoop
   571		VLD1.P	32(ks), [B1.B16, B2.B16]
   572		VMOV	B2.B16, KLAST.B16
   573		TBZ	$3, NR, singlesLoop
   574		VLD1.P	32(ks), [B3.B16, B4.B16]
   575		VMOV	B4.B16, KLAST.B16
   576	
   577	singlesLoop:
   578			CMP	$16, srcPtrLen
   579			BLT	tail
   580			SUB	$16, srcPtrLen
   581	
   582			VLD1.P	16(srcPtr), [T0.B16]
   583			VEOR	KLAST.B16, T0.B16, T0.B16
   584	
   585			VREV32	CTR.B16, B0.B16
   586			VADD	CTR.S4, INC.S4, CTR.S4
   587	
   588			AESE	K0.B16, B0.B16
   589			AESMC	B0.B16, B0.B16
   590			AESE	K1.B16, B0.B16
   591			AESMC	B0.B16, B0.B16
   592			AESE	K2.B16, B0.B16
   593			AESMC	B0.B16, B0.B16
   594			AESE	K3.B16, B0.B16
   595			AESMC	B0.B16, B0.B16
   596			AESE	K4.B16, B0.B16
   597			AESMC	B0.B16, B0.B16
   598			AESE	K5.B16, B0.B16
   599			AESMC	B0.B16, B0.B16
   600			AESE	K6.B16, B0.B16
   601			AESMC	B0.B16, B0.B16
   602			AESE	K7.B16, B0.B16
   603			AESMC	B0.B16, B0.B16
   604			AESE	K8.B16, B0.B16
   605			AESMC	B0.B16, B0.B16
   606			AESE	K9.B16, B0.B16
   607			TBZ	$4, NR, singlesLast
   608			AESMC	B0.B16, B0.B16
   609			AESE	K10.B16, B0.B16
   610			AESMC	B0.B16, B0.B16
   611			AESE	B1.B16, B0.B16
   612			TBZ	$3, NR, singlesLast
   613			AESMC	B0.B16, B0.B16
   614			AESE	B2.B16, B0.B16
   615			AESMC	B0.B16, B0.B16
   616			AESE	B3.B16, B0.B16
   617	singlesLast:
   618			VEOR	T0.B16, B0.B16, B0.B16
   619	encReduce:
   620			VST1.P	[B0.B16], 16(dstPtr)
   621	
   622			VREV64	B0.B16, B0.B16
   623			VEOR	ACC0.B16, B0.B16, B0.B16
   624	
   625			VEXT	$8, B0.B16, B0.B16, T0.B16
   626			VEOR	B0.B16, T0.B16, T0.B16
   627			VPMULL	B0.D1, T1.D1, ACC1.Q1
   628			VPMULL2	B0.D2, T1.D2, ACC0.Q1
   629			VPMULL	T0.D1, T2.D1, ACCM.Q1
   630	
   631			reduce()
   632	
   633		B	singlesLoop
   634	tail:
   635		CBZ	srcPtrLen, done
   636	
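	// Sub-16-byte tail: gather the remaining plaintext bytes into T0,
	// working backwards from the end of src, while building a matching
	// all-ones byte mask in T3 (H1 is -1). The last round key is folded into
	// T0 up front, so XORing T0 into the not-yet-finalized keystream block
	// completes both the final AddRoundKey and the encryption; the mask then
	// clears the keystream bytes beyond the message before the block is
	// stored and hashed at encReduce.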
   637		VEOR	T0.B16, T0.B16, T0.B16
   638		VEOR	T3.B16, T3.B16, T3.B16
   639		MOVD	$0, H1
   640		SUB	$1, H1
   641		ADD	srcPtrLen, srcPtr
   642	
   643		TBZ	$3, srcPtrLen, ld4
   644		MOVD.W	-8(srcPtr), H0
   645		VMOV	H0, T0.D[0]
   646		VMOV	H1, T3.D[0]
   647	ld4:
   648		TBZ	$2, srcPtrLen, ld2
   649		MOVW.W	-4(srcPtr), H0
   650		VEXT	$12, T0.B16, ZERO.B16, T0.B16
   651		VEXT	$12, T3.B16, ZERO.B16, T3.B16
   652		VMOV	H0, T0.S[0]
   653		VMOV	H1, T3.S[0]
   654	ld2:
   655		TBZ	$1, srcPtrLen, ld1
   656		MOVH.W	-2(srcPtr), H0
   657		VEXT	$14, T0.B16, ZERO.B16, T0.B16
   658		VEXT	$14, T3.B16, ZERO.B16, T3.B16
   659		VMOV	H0, T0.H[0]
   660		VMOV	H1, T3.H[0]
   661	ld1:
   662		TBZ	$0, srcPtrLen, ld0
   663		MOVB.W	-1(srcPtr), H0
   664		VEXT	$15, T0.B16, ZERO.B16, T0.B16
   665		VEXT	$15, T3.B16, ZERO.B16, T3.B16
   666		VMOV	H0, T0.B[0]
   667		VMOV	H1, T3.B[0]
   668	ld0:
   669	
   670		MOVD	ZR, srcPtrLen
   671		VEOR	KLAST.B16, T0.B16, T0.B16
   672		VREV32	CTR.B16, B0.B16
   673	
   674		AESE	K0.B16, B0.B16
   675		AESMC	B0.B16, B0.B16
   676		AESE	K1.B16, B0.B16
   677		AESMC	B0.B16, B0.B16
   678		AESE	K2.B16, B0.B16
   679		AESMC	B0.B16, B0.B16
   680		AESE	K3.B16, B0.B16
   681		AESMC	B0.B16, B0.B16
   682		AESE	K4.B16, B0.B16
   683		AESMC	B0.B16, B0.B16
   684		AESE	K5.B16, B0.B16
   685		AESMC	B0.B16, B0.B16
   686		AESE	K6.B16, B0.B16
   687		AESMC	B0.B16, B0.B16
   688		AESE	K7.B16, B0.B16
   689		AESMC	B0.B16, B0.B16
   690		AESE	K8.B16, B0.B16
   691		AESMC	B0.B16, B0.B16
   692		AESE	K9.B16, B0.B16
   693		TBZ	$4, NR, tailLast
   694		AESMC	B0.B16, B0.B16
   695		AESE	K10.B16, B0.B16
   696		AESMC	B0.B16, B0.B16
   697		AESE	B1.B16, B0.B16
   698		TBZ	$3, NR, tailLast
   699		AESMC	B0.B16, B0.B16
   700		AESE	B2.B16, B0.B16
   701		AESMC	B0.B16, B0.B16
   702		AESE	B3.B16, B0.B16
   703	
   704	tailLast:
   705		VEOR	T0.B16, B0.B16, B0.B16
   706		VAND	T3.B16, B0.B16, B0.B16
   707		B	encReduce
   708	
   709	done:
   710		VST1	[ACC0.B16], (tPtr)
   711		RET
   712	
   713	// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
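// gcmAesDec mirrors gcmAesEnc, but GHASH is computed over the ciphertext,
// i.e. over the input blocks as they are loaded, before they are XORed with
// the keystream. The sub-16-byte tail therefore loads a full block from src
// (see the comment below about the trailing tag), masks it to the real
// length, and hashes the masked ciphertext.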
   714	TEXT ·gcmAesDec(SB),NOSPLIT,$0
   715		MOVD	productTable+0(FP), pTbl
   716		MOVD	dst+8(FP), dstPtr
   717		MOVD	src_base+32(FP), srcPtr
   718		MOVD	src_len+40(FP), srcPtrLen
   719		MOVD	ctr+56(FP), ctrPtr
   720		MOVD	T+64(FP), tPtr
   721		MOVD	ks_base+72(FP), ks
   722		MOVD	ks_len+80(FP), NR
   723	
   724		MOVD	$0xC2, H1
   725		LSL	$56, H1
   726		MOVD	$1, H0
   727		VMOV	H1, POLY.D[0]
   728		VMOV	H0, POLY.D[1]
   729		VEOR	ZERO.B16, ZERO.B16, ZERO.B16
   730		// Compute NR from len(ks)
   731		MOVD	pTbl, pTblSave
   732		// Current tag, after AAD
   733		VLD1	(tPtr), [ACC0.B16]
   734		VEOR	ACC1.B16, ACC1.B16, ACC1.B16
   735		VEOR	ACCM.B16, ACCM.B16, ACCM.B16
   736		// Prepare initial counter, and the increment vector
   737		VLD1	(ctrPtr), [CTR.B16]
   738		VEOR	INC.B16, INC.B16, INC.B16
   739		MOVD	$1, H0
   740		VMOV	H0, INC.S[3]
   741		VREV32	CTR.B16, CTR.B16
   742		VADD	CTR.S4, INC.S4, CTR.S4
   743	
   744		MOVD	ks, H0
   745		// For AES-128 round keys are stored in: K0 .. K10, KLAST
   746		VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
   747		VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
   748		VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
   749		VMOV	K10.B16, KLAST.B16
   750	
   751		// Skip to <8 blocks loop
   752		CMP	$128, srcPtrLen
   753		BLT	startSingles
    754		// There are at least 8 blocks to decrypt
   755		TBZ	$4, NR, octetsLoop
   756	
   757		// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
   758		VMOV	K8.B16, K10.B16
   759		VMOV	K9.B16, K11.B16
   760		VMOV	KLAST.B16, K8.B16
   761		VLD1.P	16(H0), [K9.B16]
   762		VLD1.P  16(H0), [KLAST.B16]
   763		TBZ	$3, NR, octetsLoop
   764		// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
   765		VMOV	KLAST.B16, K8.B16
   766		VLD1.P	16(H0), [K9.B16]
   767		VLD1.P  16(H0), [KLAST.B16]
   768		ADD	$10*16, ks, H0
   769		MOVD	H0, curK
   770	
   771	octetsLoop:
   772			SUB	$128, srcPtrLen
   773	
   774			VMOV	CTR.B16, B0.B16
   775			VADD	B0.S4, INC.S4, B1.S4
   776			VREV32	B0.B16, B0.B16
   777			VADD	B1.S4, INC.S4, B2.S4
   778			VREV32	B1.B16, B1.B16
   779			VADD	B2.S4, INC.S4, B3.S4
   780			VREV32	B2.B16, B2.B16
   781			VADD	B3.S4, INC.S4, B4.S4
   782			VREV32	B3.B16, B3.B16
   783			VADD	B4.S4, INC.S4, B5.S4
   784			VREV32	B4.B16, B4.B16
   785			VADD	B5.S4, INC.S4, B6.S4
   786			VREV32	B5.B16, B5.B16
   787			VADD	B6.S4, INC.S4, B7.S4
   788			VREV32	B6.B16, B6.B16
   789			VADD	B7.S4, INC.S4, CTR.S4
   790			VREV32	B7.B16, B7.B16
   791	
   792			aesrndx8(K0)
   793			aesrndx8(K1)
   794			aesrndx8(K2)
   795			aesrndx8(K3)
   796			aesrndx8(K4)
   797			aesrndx8(K5)
   798			aesrndx8(K6)
   799			aesrndx8(K7)
   800			TBZ	$4, NR, octetsFinish
   801			aesrndx8(K10)
   802			aesrndx8(K11)
   803			TBZ	$3, NR, octetsFinish
   804			VLD1.P	32(curK), [T1.B16, T2.B16]
   805			aesrndx8(T1)
   806			aesrndx8(T2)
   807			MOVD	H0, curK
   808	octetsFinish:
   809			aesrndx8(K8)
   810			aesrndlastx8(K9)
   811	
   812			VEOR	KLAST.B16, B0.B16, T1.B16
   813			VEOR	KLAST.B16, B1.B16, T2.B16
   814			VEOR	KLAST.B16, B2.B16, B2.B16
   815			VEOR	KLAST.B16, B3.B16, B3.B16
   816			VEOR	KLAST.B16, B4.B16, B4.B16
   817			VEOR	KLAST.B16, B5.B16, B5.B16
   818			VEOR	KLAST.B16, B6.B16, B6.B16
   819			VEOR	KLAST.B16, B7.B16, B7.B16
   820	
   821			VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   822			VEOR	B0.B16, T1.B16, T1.B16
   823			VEOR	B1.B16, T2.B16, T2.B16
   824			VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   825	
   826			VLD1.P	32(pTbl), [T1.B16, T2.B16]
   827			VREV64	B0.B16, B0.B16
   828			VEOR	ACC0.B16, B0.B16, B0.B16
   829			VEXT	$8, B0.B16, B0.B16, T0.B16
   830			VEOR	B0.B16, T0.B16, T0.B16
   831			VPMULL	B0.D1, T1.D1, ACC1.Q1
   832			VPMULL2	B0.D2, T1.D2, ACC0.Q1
   833			VPMULL	T0.D1, T2.D1, ACCM.Q1
   834			mulRound(B1)
   835	
   836			VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   837			VEOR	B2.B16, B0.B16, T1.B16
   838			VEOR	B3.B16, B1.B16, T2.B16
   839			VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   840			mulRound(B0)
   841			mulRound(B1)
   842	
   843			VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   844			VEOR	B4.B16, B0.B16, T1.B16
   845			VEOR	B5.B16, B1.B16, T2.B16
   846			VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   847			mulRound(B0)
   848			mulRound(B1)
   849	
   850			VLD1.P	32(srcPtr), [B0.B16, B1.B16]
   851			VEOR	B6.B16, B0.B16, T1.B16
   852			VEOR	B7.B16, B1.B16, T2.B16
   853			VST1.P  [T1.B16, T2.B16], 32(dstPtr)
   854			mulRound(B0)
   855			mulRound(B1)
   856	
   857			MOVD	pTblSave, pTbl
   858			reduce()
   859	
   860			CMP	$128, srcPtrLen
   861			BGE	octetsLoop
   862	
   863	startSingles:
   864		CBZ	srcPtrLen, done
   865		ADD	$14*16, pTbl
   866		// Preload H and its Karatsuba precomp
   867		VLD1.P	(pTbl), [T1.B16, T2.B16]
   868		// Preload AES round keys
   869		ADD	$128, ks
   870		VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
   871		VMOV	K10.B16, KLAST.B16
   872		TBZ	$4, NR, singlesLoop
   873		VLD1.P	32(ks), [B1.B16, B2.B16]
   874		VMOV	B2.B16, KLAST.B16
   875		TBZ	$3, NR, singlesLoop
   876		VLD1.P	32(ks), [B3.B16, B4.B16]
   877		VMOV	B4.B16, KLAST.B16
   878	
   879	singlesLoop:
   880			CMP	$16, srcPtrLen
   881			BLT	tail
   882			SUB	$16, srcPtrLen
   883	
   884			VLD1.P	16(srcPtr), [T0.B16]
   885			VREV64	T0.B16, B5.B16
   886			VEOR	KLAST.B16, T0.B16, T0.B16
   887	
   888			VREV32	CTR.B16, B0.B16
   889			VADD	CTR.S4, INC.S4, CTR.S4
   890	
   891			AESE	K0.B16, B0.B16
   892			AESMC	B0.B16, B0.B16
   893			AESE	K1.B16, B0.B16
   894			AESMC	B0.B16, B0.B16
   895			AESE	K2.B16, B0.B16
   896			AESMC	B0.B16, B0.B16
   897			AESE	K3.B16, B0.B16
   898			AESMC	B0.B16, B0.B16
   899			AESE	K4.B16, B0.B16
   900			AESMC	B0.B16, B0.B16
   901			AESE	K5.B16, B0.B16
   902			AESMC	B0.B16, B0.B16
   903			AESE	K6.B16, B0.B16
   904			AESMC	B0.B16, B0.B16
   905			AESE	K7.B16, B0.B16
   906			AESMC	B0.B16, B0.B16
   907			AESE	K8.B16, B0.B16
   908			AESMC	B0.B16, B0.B16
   909			AESE	K9.B16, B0.B16
   910			TBZ	$4, NR, singlesLast
   911			AESMC	B0.B16, B0.B16
   912			AESE	K10.B16, B0.B16
   913			AESMC	B0.B16, B0.B16
   914			AESE	B1.B16, B0.B16
   915			TBZ	$3, NR, singlesLast
   916			AESMC	B0.B16, B0.B16
   917			AESE	B2.B16, B0.B16
   918			AESMC	B0.B16, B0.B16
   919			AESE	B3.B16, B0.B16
   920	singlesLast:
   921			VEOR	T0.B16, B0.B16, B0.B16
   922	
   923			VST1.P	[B0.B16], 16(dstPtr)
   924	
   925			VEOR	ACC0.B16, B5.B16, B5.B16
   926			VEXT	$8, B5.B16, B5.B16, T0.B16
   927			VEOR	B5.B16, T0.B16, T0.B16
   928			VPMULL	B5.D1, T1.D1, ACC1.Q1
   929			VPMULL2	B5.D2, T1.D2, ACC0.Q1
   930			VPMULL	T0.D1, T2.D1, ACCM.Q1
   931			reduce()
   932	
   933		B	singlesLoop
   934	tail:
   935		CBZ	srcPtrLen, done
   936	
   937		VREV32	CTR.B16, B0.B16
   938		VADD	CTR.S4, INC.S4, CTR.S4
   939	
   940		AESE	K0.B16, B0.B16
   941		AESMC	B0.B16, B0.B16
   942		AESE	K1.B16, B0.B16
   943		AESMC	B0.B16, B0.B16
   944		AESE	K2.B16, B0.B16
   945		AESMC	B0.B16, B0.B16
   946		AESE	K3.B16, B0.B16
   947		AESMC	B0.B16, B0.B16
   948		AESE	K4.B16, B0.B16
   949		AESMC	B0.B16, B0.B16
   950		AESE	K5.B16, B0.B16
   951		AESMC	B0.B16, B0.B16
   952		AESE	K6.B16, B0.B16
   953		AESMC	B0.B16, B0.B16
   954		AESE	K7.B16, B0.B16
   955		AESMC	B0.B16, B0.B16
   956		AESE	K8.B16, B0.B16
   957		AESMC	B0.B16, B0.B16
   958		AESE	K9.B16, B0.B16
   959		TBZ	$4, NR, tailLast
   960		AESMC	B0.B16, B0.B16
   961		AESE	K10.B16, B0.B16
   962		AESMC	B0.B16, B0.B16
   963		AESE	B1.B16, B0.B16
   964		TBZ	$3, NR, tailLast
   965		AESMC	B0.B16, B0.B16
   966		AESE	B2.B16, B0.B16
   967		AESMC	B0.B16, B0.B16
   968		AESE	B3.B16, B0.B16
   969	tailLast:
   970		VEOR	KLAST.B16, B0.B16, B0.B16
   971	
    972		// Assuming it is safe to load past srcPtr due to the presence of the tag
   973		VLD1	(srcPtr), [B5.B16]
   974	
   975		VEOR	B5.B16, B0.B16, B0.B16
   976	
   977		VEOR	T3.B16, T3.B16, T3.B16
   978		MOVD	$0, H1
   979		SUB	$1, H1
   980	
   981		TBZ	$3, srcPtrLen, ld4
   982		VMOV	B0.D[0], H0
   983		MOVD.P	H0, 8(dstPtr)
   984		VMOV	H1, T3.D[0]
   985		VEXT	$8, ZERO.B16, B0.B16, B0.B16
   986	ld4:
   987		TBZ	$2, srcPtrLen, ld2
   988		VMOV	B0.S[0], H0
   989		MOVW.P	H0, 4(dstPtr)
   990		VEXT	$12, T3.B16, ZERO.B16, T3.B16
   991		VMOV	H1, T3.S[0]
   992		VEXT	$4, ZERO.B16, B0.B16, B0.B16
   993	ld2:
   994		TBZ	$1, srcPtrLen, ld1
   995		VMOV	B0.H[0], H0
   996		MOVH.P	H0, 2(dstPtr)
   997		VEXT	$14, T3.B16, ZERO.B16, T3.B16
   998		VMOV	H1, T3.H[0]
   999		VEXT	$2, ZERO.B16, B0.B16, B0.B16
  1000	ld1:
  1001		TBZ	$0, srcPtrLen, ld0
  1002		VMOV	B0.B[0], H0
  1003		MOVB.P	H0, 1(dstPtr)
  1004		VEXT	$15, T3.B16, ZERO.B16, T3.B16
  1005		VMOV	H1, T3.B[0]
  1006	ld0:
  1007	
  1008		VAND	T3.B16, B5.B16, B5.B16
  1009		VREV64	B5.B16, B5.B16
  1010	
  1011		VEOR	ACC0.B16, B5.B16, B5.B16
  1012		VEXT	$8, B5.B16, B5.B16, T0.B16
  1013		VEOR	B5.B16, T0.B16, T0.B16
  1014		VPMULL	B5.D1, T1.D1, ACC1.Q1
  1015		VPMULL2	B5.D2, T1.D2, ACC0.Q1
  1016		VPMULL	T0.D1, T2.D1, ACCM.Q1
  1017		reduce()
  1018	done:
  1019		VST1	[ACC0.B16], (tPtr)
  1020	
  1021		RET
