...

Text file src/pkg/vendor/golang.org/x/crypto/internal/chacha20/asm_arm64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build go1.11
     6	// +build !gccgo,!appengine
     7	
     8	#include "textflag.h"
     9	
    10	#define NUM_ROUNDS 10
    11	
    12	// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
    13	TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
    14		MOVD	dst+0(FP), R1
    15		MOVD	src+24(FP), R2
    16		MOVD	src_len+32(FP), R3
    17		MOVD	key+48(FP), R4
    18		MOVD	nonce+56(FP), R6
    19		MOVD	counter+64(FP), R7
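// Slice headers are 24 bytes (pointer, len, cap) on arm64, which is
// why dst sits at offset 0, src at 24 and src_len at 32, with the
// key/nonce/counter pointers following at 48/56/64. dst's length is
// never checked here; the caller presumably guarantees
// len(dst) >= len(src).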
    20	
    21		MOVD	$·constants(SB), R10
    22		MOVD	$·incRotMatrix(SB), R11
    23	
    24		MOVW	(R7), R20
    25	
    26		AND	$~255, R3, R13
    27		ADD	R2, R13, R12 // R12 for block end
    28		AND	$255, R3, R13
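// Each pass through "loop" produces four 64-byte ChaCha blocks, so
// only the 256-byte-aligned prefix of src is processed here. Worked
// example: for src_len = 1000, R12 marks src+768 and the remaining
// 232-byte tail is presumably left to generic Go code in the caller.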
    29	loop:
    30		MOVD	$NUM_ROUNDS, R21
    31		VLD1	(R11), [V30.S4, V31.S4]
    32	
    33	// load constants
    34		// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
    35		WORD	$0x4D60E940
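// The replicating loads (VLD4R/VLD3R/VLD1R) in this file are spelled
// as raw WORD opcodes, presumably because the Go assembler did not
// accept those mnemonics at the time; the intended instruction is
// kept in a comment next to each encoding.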
    36	
    37		// load keys
    38		// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
    39		WORD	$0x4DFFE884
    40		// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
    41		WORD	$0x4DFFE888
    42		SUB	$32, R4
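// The two post-indexed loads above advanced R4 by 32 bytes; undo
// that so the key pointer is intact for the next loop iteration.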
    43	
    44		// load counter + nonce
    45		// VLD1R (R7), [V12.S4]
    46		WORD	$0x4D40C8EC
    47	
    48		// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
    49		WORD	$0x4D40E8CD
    50	
    51		// update counter
    52		VADD	V30.S4, V12.S4, V12.S4
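// V30 holds {0, 1, 2, 3} from incRotMatrix, so the four interleaved
// blocks run with counters c, c+1, c+2 and c+3.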
    53	
    54	chacha:
    55		// V0..V3 += V4..V7
    56		// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
    57		VADD	V0.S4, V4.S4, V0.S4
    58		VADD	V1.S4, V5.S4, V1.S4
    59		VADD	V2.S4, V6.S4, V2.S4
    60		VADD	V3.S4, V7.S4, V3.S4
    61		VEOR	V12.B16, V0.B16, V12.B16
    62		VEOR	V13.B16, V1.B16, V13.B16
    63		VEOR	V14.B16, V2.B16, V14.B16
    64		VEOR	V15.B16, V3.B16, V15.B16
    65		VREV32	V12.H8, V12.H8
    66		VREV32	V13.H8, V13.H8
    67		VREV32	V14.H8, V14.H8
    68		VREV32	V15.H8, V15.H8
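// A 32-bit rotate by 16 is a swap of the two 16-bit halves, so
// VREV32 on .H8 lanes performs <<<16 in a single instruction.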
    69		// V8..V11 += V12..V15
    70		// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
    71		VADD	V8.S4, V12.S4, V8.S4
    72		VADD	V9.S4, V13.S4, V9.S4
    73		VADD	V10.S4, V14.S4, V10.S4
    74		VADD	V11.S4, V15.S4, V11.S4
    75		VEOR	V8.B16, V4.B16, V16.B16
    76		VEOR	V9.B16, V5.B16, V17.B16
    77		VEOR	V10.B16, V6.B16, V18.B16
    78		VEOR	V11.B16, V7.B16, V19.B16
    79		VSHL	$12, V16.S4, V4.S4
    80		VSHL	$12, V17.S4, V5.S4
    81		VSHL	$12, V18.S4, V6.S4
    82		VSHL	$12, V19.S4, V7.S4
    83		VSRI	$20, V16.S4, V4.S4
    84		VSRI	$20, V17.S4, V5.S4
    85		VSRI	$20, V18.S4, V6.S4
    86		VSRI	$20, V19.S4, V7.S4
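// There is no single-instruction rotate by 12, so it is composed
// from a shift pair: VSHL writes x<<12 and VSRI inserts x>>20 into
// the vacated low bits, giving (x<<12) | (x>>20).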
    87	
    88		// V0..V3 += V4..V7
    89		// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
    90		VADD	V0.S4, V4.S4, V0.S4
    91		VADD	V1.S4, V5.S4, V1.S4
    92		VADD	V2.S4, V6.S4, V2.S4
    93		VADD	V3.S4, V7.S4, V3.S4
    94		VEOR	V12.B16, V0.B16, V12.B16
    95		VEOR	V13.B16, V1.B16, V13.B16
    96		VEOR	V14.B16, V2.B16, V14.B16
    97		VEOR	V15.B16, V3.B16, V15.B16
    98		VTBL	V31.B16, [V12.B16], V12.B16
    99		VTBL	V31.B16, [V13.B16], V13.B16
   100		VTBL	V31.B16, [V14.B16], V14.B16
   101		VTBL	V31.B16, [V15.B16], V15.B16
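// A rotate by 8 moves whole bytes, so it is done as a VTBL byte
// shuffle: V31 (the second half of incRotMatrix) holds the per-word
// index pattern {3, 0, 1, 2}, mapping 0xAABBCCDD to 0xBBCCDDAA.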
   102	
   103		// V8..V11 += V12..V15
   104		// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
   105		VADD	V12.S4, V8.S4, V8.S4
   106		VADD	V13.S4, V9.S4, V9.S4
   107		VADD	V14.S4, V10.S4, V10.S4
   108		VADD	V15.S4, V11.S4, V11.S4
   109		VEOR	V8.B16, V4.B16, V16.B16
   110		VEOR	V9.B16, V5.B16, V17.B16
   111		VEOR	V10.B16, V6.B16, V18.B16
   112		VEOR	V11.B16, V7.B16, V19.B16
   113		VSHL	$7, V16.S4, V4.S4
   114		VSHL	$7, V17.S4, V5.S4
   115		VSHL	$7, V18.S4, V6.S4
   116		VSHL	$7, V19.S4, V7.S4
   117		VSRI	$25, V16.S4, V4.S4
   118		VSRI	$25, V17.S4, V5.S4
   119		VSRI	$25, V18.S4, V6.S4
   120		VSRI	$25, V19.S4, V7.S4
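// That completes the "column" half of the double round. The
// "diagonal" half below applies the same quarter round to the groups
// (V0,V5,V10,V15), (V1,V6,V11,V12), (V2,V7,V8,V13) and
// (V3,V4,V9,V14); since each register holds one state word across
// all four blocks, the diagonal shuffle costs nothing but register
// renaming.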
   121	
   122		// V0..V3 += V5..V7, V4
   123		// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
   124		VADD	V0.S4, V5.S4, V0.S4
   125		VADD	V1.S4, V6.S4, V1.S4
   126		VADD	V2.S4, V7.S4, V2.S4
   127		VADD	V3.S4, V4.S4, V3.S4
   128		VEOR	V15.B16, V0.B16, V15.B16
   129		VEOR	V12.B16, V1.B16, V12.B16
   130		VEOR	V13.B16, V2.B16, V13.B16
   131		VEOR	V14.B16, V3.B16, V14.B16
   132		VREV32	V12.H8, V12.H8
   133		VREV32	V13.H8, V13.H8
   134		VREV32	V14.H8, V14.H8
   135		VREV32	V15.H8, V15.H8
   136	
   137		// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
   138		// ...
   139		VADD	V15.S4, V10.S4, V10.S4
   140		VADD	V12.S4, V11.S4, V11.S4
   141		VADD	V13.S4, V8.S4, V8.S4
   142		VADD	V14.S4, V9.S4, V9.S4
   143		VEOR	V10.B16, V5.B16, V16.B16
   144		VEOR	V11.B16, V6.B16, V17.B16
   145		VEOR	V8.B16, V7.B16, V18.B16
   146		VEOR	V9.B16, V4.B16, V19.B16
   147		VSHL	$12, V16.S4, V5.S4
   148		VSHL	$12, V17.S4, V6.S4
   149		VSHL	$12, V18.S4, V7.S4
   150		VSHL	$12, V19.S4, V4.S4
   151		VSRI	$20, V16.S4, V5.S4
   152		VSRI	$20, V17.S4, V6.S4
   153		VSRI	$20, V18.S4, V7.S4
   154		VSRI	$20, V19.S4, V4.S4
   155	
   156		// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
   157		// ...
   158		VADD	V5.S4, V0.S4, V0.S4
   159		VADD	V6.S4, V1.S4, V1.S4
   160		VADD	V7.S4, V2.S4, V2.S4
   161		VADD	V4.S4, V3.S4, V3.S4
   162		VEOR	V0.B16, V15.B16, V15.B16
   163		VEOR	V1.B16, V12.B16, V12.B16
   164		VEOR	V2.B16, V13.B16, V13.B16
   165		VEOR	V3.B16, V14.B16, V14.B16
   166		VTBL	V31.B16, [V12.B16], V12.B16
   167		VTBL	V31.B16, [V13.B16], V13.B16
   168		VTBL	V31.B16, [V14.B16], V14.B16
   169		VTBL	V31.B16, [V15.B16], V15.B16
   170	
   171		// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
   172		// ...
   173		VADD	V15.S4, V10.S4, V10.S4
   174		VADD	V12.S4, V11.S4, V11.S4
   175		VADD	V13.S4, V8.S4, V8.S4
   176		VADD	V14.S4, V9.S4, V9.S4
   177		VEOR	V10.B16, V5.B16, V16.B16
   178		VEOR	V11.B16, V6.B16, V17.B16
   179		VEOR	V8.B16, V7.B16, V18.B16
   180		VEOR	V9.B16, V4.B16, V19.B16
   181		VSHL	$7, V16.S4, V5.S4
   182		VSHL	$7, V17.S4, V6.S4
   183		VSHL	$7, V18.S4, V7.S4
   184		VSHL	$7, V19.S4, V4.S4
   185		VSRI	$25, V16.S4, V5.S4
   186		VSRI	$25, V17.S4, V6.S4
   187		VSRI	$25, V18.S4, V7.S4
   188		VSRI	$25, V19.S4, V4.S4
   189	
   190		SUB	$1, R21
   191		CBNZ	R21, chacha
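// For reference, each group of four vector lanes above computes the
// standard ChaCha quarter round, which in scalar Go reads (a sketch
// following RFC 7539, not code from this package):
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = d<<16 | d>>16
//		c += d; b ^= c; b = b<<12 | b>>20
//		a += b; d ^= a; d = d<<8 | d>>24
//		c += d; b ^= c; b = b<<7 | b>>25
//		return a, b, c, d
//	}
//
// NUM_ROUNDS = 10 double rounds yields the 20 rounds of ChaCha20.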
   192	
   193		// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
   194		WORD	$0x4D60E950
   195	
   196		// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
   197		WORD	$0x4DFFE894
   198		VADD	V30.S4, V12.S4, V12.S4
   199		VADD	V16.S4, V0.S4, V0.S4
   200		VADD	V17.S4, V1.S4, V1.S4
   201		VADD	V18.S4, V2.S4, V2.S4
   202		VADD	V19.S4, V3.S4, V3.S4
   203		// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
   204		WORD	$0x4DFFE898
   205		// restore R4
   206		SUB	$32, R4
   207	
   208		// load counter + nonce
   209		// VLD1R (R7), [V28.S4]
   210		WORD	$0x4D40C8FC
   211		// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
   212		WORD	$0x4D40E8DD
   213	
   214		VADD	V20.S4, V4.S4, V4.S4
   215		VADD	V21.S4, V5.S4, V5.S4
   216		VADD	V22.S4, V6.S4, V6.S4
   217		VADD	V23.S4, V7.S4, V7.S4
   218		VADD	V24.S4, V8.S4, V8.S4
   219		VADD	V25.S4, V9.S4, V9.S4
   220		VADD	V26.S4, V10.S4, V10.S4
   221		VADD	V27.S4, V11.S4, V11.S4
   222		VADD	V28.S4, V12.S4, V12.S4
   223		VADD	V29.S4, V13.S4, V13.S4
   224		VADD	V30.S4, V14.S4, V14.S4
   225		VADD	V31.S4, V15.S4, V15.S4
   226	
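// At this point the state is block-interleaved: lane i of every
// register belongs to block i. The two-stage transpose below
// (32-bit zips, then 64-bit zips) regroups the words so that each
// register holds 16 consecutive keystream bytes of a single block,
// ready to be XORed against src.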
   227		VZIP1	V1.S4, V0.S4, V16.S4
   228		VZIP2	V1.S4, V0.S4, V17.S4
   229		VZIP1	V3.S4, V2.S4, V18.S4
   230		VZIP2	V3.S4, V2.S4, V19.S4
   231		VZIP1	V5.S4, V4.S4, V20.S4
   232		VZIP2	V5.S4, V4.S4, V21.S4
   233		VZIP1	V7.S4, V6.S4, V22.S4
   234		VZIP2	V7.S4, V6.S4, V23.S4
   235		VZIP1	V9.S4, V8.S4, V24.S4
   236		VZIP2	V9.S4, V8.S4, V25.S4
   237		VZIP1	V11.S4, V10.S4, V26.S4
   238		VZIP2	V11.S4, V10.S4, V27.S4
   239		VZIP1	V13.S4, V12.S4, V28.S4
   240		VZIP2	V13.S4, V12.S4, V29.S4
   241		VZIP1	V15.S4, V14.S4, V30.S4
   242		VZIP2	V15.S4, V14.S4, V31.S4
   243		VZIP1	V18.D2, V16.D2, V0.D2
   244		VZIP2	V18.D2, V16.D2, V4.D2
   245		VZIP1	V19.D2, V17.D2, V8.D2
   246		VZIP2	V19.D2, V17.D2, V12.D2
   247		VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
   248	
   249		VZIP1	V22.D2, V20.D2, V1.D2
   250		VZIP2	V22.D2, V20.D2, V5.D2
   251		VZIP1	V23.D2, V21.D2, V9.D2
   252		VZIP2	V23.D2, V21.D2, V13.D2
   253		VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
   254		VZIP1	V26.D2, V24.D2, V2.D2
   255		VZIP2	V26.D2, V24.D2, V6.D2
   256		VZIP1	V27.D2, V25.D2, V10.D2
   257		VZIP2	V27.D2, V25.D2, V14.D2
   258		VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
   259		VZIP1	V30.D2, V28.D2, V3.D2
   260		VZIP2	V30.D2, V28.D2, V7.D2
   261		VZIP1	V31.D2, V29.D2, V11.D2
   262		VZIP2	V31.D2, V29.D2, V15.D2
   263		VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
   264		VEOR	V0.B16, V16.B16, V16.B16
   265		VEOR	V1.B16, V17.B16, V17.B16
   266		VEOR	V2.B16, V18.B16, V18.B16
   267		VEOR	V3.B16, V19.B16, V19.B16
   268		VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
   269		VEOR	V4.B16, V20.B16, V20.B16
   270		VEOR	V5.B16, V21.B16, V21.B16
   271		VEOR	V6.B16, V22.B16, V22.B16
   272		VEOR	V7.B16, V23.B16, V23.B16
   273		VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
   274		VEOR	V8.B16, V24.B16, V24.B16
   275		VEOR	V9.B16, V25.B16, V25.B16
   276		VEOR	V10.B16, V26.B16, V26.B16
   277		VEOR	V11.B16, V27.B16, V27.B16
   278		VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
   279		VEOR	V12.B16, V28.B16, V28.B16
   280		VEOR	V13.B16, V29.B16, V29.B16
   281		VEOR	V14.B16, V30.B16, V30.B16
   282		VEOR	V15.B16, V31.B16, V31.B16
   283		VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
   284	
   285		ADD	$4, R20
   286		MOVW	R20, (R7) // update counter
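// Four blocks were emitted, so the 32-bit block counter advances by
// four; writing it back makes the state visible to the next call.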
   287	
   288		CMP	R2, R12
   289		BGT	loop
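// The post-indexed loads advanced R2 by 256 bytes per iteration;
// keep looping while R2 is still below the aligned end marker R12.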
   290	
   291		RET
   292	
   293	
   294	DATA	·constants+0x00(SB)/4, $0x61707865
   295	DATA	·constants+0x04(SB)/4, $0x3320646e
   296	DATA	·constants+0x08(SB)/4, $0x79622d32
   297	DATA	·constants+0x0c(SB)/4, $0x6b206574
   298	GLOBL	·constants(SB), NOPTR|RODATA, $32
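// The four words above are the little-endian encoding of the ChaCha
// constant "expand 32-byte k"; only the first 16 of the 32 reserved
// bytes are initialized, the rest is zero padding.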
   299	
   300	DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
   301	DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
   302	DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
   303	DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
   304	DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
   305	DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
   306	DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
   307	DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
   308	GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32
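// incRotMatrix packs two 16-byte vectors: the counter increments
// {0, 1, 2, 3} loaded into V30, followed by the VTBL byte indices
// that rotate each 32-bit word left by 8, loaded into V31.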
