Text file src/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build s390x,!gccgo,!appengine
     6	
     7	#include "go_asm.h"
     8	#include "textflag.h"
     9	
    10	// This is an implementation of the ChaCha20 encryption algorithm as
    11	// specified in RFC 7539. It uses vector instructions to compute
    12	// 4 keystream blocks in parallel (256 bytes) which are then XORed
    13	// with the bytes in the input slice.
    14	
      	// ·constants<> is a 32-byte read-only, pointer-free table. xorKeyStreamVX
      	// loads both 16-byte halves with a single VLM into the BSWAP and J0
      	// vector registers.
    15	GLOBL ·constants<>(SB), RODATA|NOPTR, $32
    16	// BSWAP: swap bytes in each 4-byte element
      	// (used as the VPERM selector by the PERMUTE macro, which XORV applies to
      	// the keystream words just before they are XORed with the input)
    17	DATA ·constants<>+0x00(SB)/4, $0x03020100
    18	DATA ·constants<>+0x04(SB)/4, $0x07060504
    19	DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
    20	DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
    21	// J0: [j0, j1, j2, j3]
      	// (the four ChaCha20 constant words from RFC 7539: the ASCII string
      	// "expand 32-byte k" read as little-endian 32-bit words)
    22	DATA ·constants<>+0x10(SB)/4, $0x61707865
    23	DATA ·constants<>+0x14(SB)/4, $0x3320646e
    24	DATA ·constants<>+0x18(SB)/4, $0x79622d32
    25	DATA ·constants<>+0x1c(SB)/4, $0x6b206574
    26	
    27	// EXRL targets:
      	//
      	// mvcSrcToBuf is not called as a function anywhere in this file; it is
      	// named as the operand of EXRL in xorKeyStreamVX. EXRL executes only the
      	// MVC below, with the low byte of its register operand (R12) ORed into
      	// the MVC length field, so the $1 here is just the base length and the
      	// number of bytes actually moved is chosen at run time.
      	//
      	// Effect when executed: copy bytes from the address in R1 to the address
      	// in R8 (R8 holds buf in xorKeyStreamVX).
    28	TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
    29		MVC $1, (R1), (R8)
    30		RET
    31	
      	// mvcBufToDst is the second EXRL target used by xorKeyStreamVX: only the
      	// MVC is executed, with the low byte of R12 ORed into its length field.
      	//
      	// Effect when executed: copy bytes from the address in R8 (buf) to the
      	// address in R9 (the saved dst pointer for the final partial block).
    32	TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
    33		MVC $1, (R8), (R9)
    34		RET
    35	
      	// Vector register roles used by xorKeyStreamVX and the macros below:
      	//   BSWAP      byte-swap permute mask (first half of ·constants<>)
      	//   J0         ChaCha20 constant words (second half of ·constants<>)
      	//   KEY0/KEY1  the eight key words, loaded from the key argument
      	//   NONCE      nonce words shifted so that element 0 is zero
      	//   CTR        block counter for each of the 4 parallel blocks
      	//   M0-M3      input data in XORV; scratch in SHUFFLE and during setup
      	//   INC        per-iteration counter increment (4 in every element)
      	//   X0-X15     working state: Xi holds state word i of all 4 blocks
      	//              while the rounds run
    36	#define BSWAP V5
    37	#define J0    V6
    38	#define KEY0  V7
    39	#define KEY1  V8
    40	#define NONCE V9
    41	#define CTR   V10
    42	#define M0    V11
    43	#define M1    V12
    44	#define M2    V13
    45	#define M3    V14
    46	#define INC   V15
    47	#define X0    V16
    48	#define X1    V17
    49	#define X2    V18
    50	#define X3    V19
    51	#define X4    V20
    52	#define X5    V21
    53	#define X6    V22
    54	#define X7    V23
    55	#define X8    V24
    56	#define X9    V25
    57	#define X10   V26
    58	#define X11   V27
    59	#define X12   V28
    60	#define X13   V29
    61	#define X14   V30
    62	#define X15   V31
    63	
      	// Total number of ChaCha rounds. The round loop in xorKeyStreamVX runs
      	// NUM_ROUNDS/2 iterations, each containing two ROUND4 invocations.
    64	#define NUM_ROUNDS 20
    65	
      	// ROUND4 performs four independent ChaCha quarter rounds, one on each of
      	// the register groups a*, b*, c* and d*, with the four instruction
      	// streams interleaved step by step. For a group (x0, x1, x2, x3) the
      	// sequence is, on every 32-bit element:
      	//
      	//   x0 += x1; x2 ^= x0; x2 = rotl(x2, 16)
      	//   x3 += x2; x1 ^= x3; x1 = rotl(x1, 12)
      	//   x0 += x1; x2 ^= x0; x2 = rotl(x2, 8)
      	//   x3 += x2; x1 ^= x3; x1 = rotl(x1, 7)
      	//
      	// Relative to RFC 7539's QUARTERROUND(a, b, c, d) the macro arguments
      	// are ordered (a, b, d, c). All sixteen registers are updated in place;
      	// no other registers are touched.
    66	#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
    67		VAF    a1, a0, a0  \
    68		VAF    b1, b0, b0  \
    69		VAF    c1, c0, c0  \
    70		VAF    d1, d0, d0  \
    71		VX     a0, a2, a2  \
    72		VX     b0, b2, b2  \
    73		VX     c0, c2, c2  \
    74		VX     d0, d2, d2  \
    75		VERLLF $16, a2, a2 \
    76		VERLLF $16, b2, b2 \
    77		VERLLF $16, c2, c2 \
    78		VERLLF $16, d2, d2 \
    79		VAF    a2, a3, a3  \
    80		VAF    b2, b3, b3  \
    81		VAF    c2, c3, c3  \
    82		VAF    d2, d3, d3  \
    83		VX     a3, a1, a1  \
    84		VX     b3, b1, b1  \
    85		VX     c3, c1, c1  \
    86		VX     d3, d1, d1  \
    87		VERLLF $12, a1, a1 \
    88		VERLLF $12, b1, b1 \
    89		VERLLF $12, c1, c1 \
    90		VERLLF $12, d1, d1 \
    91		VAF    a1, a0, a0  \
    92		VAF    b1, b0, b0  \
    93		VAF    c1, c0, c0  \
    94		VAF    d1, d0, d0  \
    95		VX     a0, a2, a2  \
    96		VX     b0, b2, b2  \
    97		VX     c0, c2, c2  \
    98		VX     d0, d2, d2  \
    99		VERLLF $8, a2, a2  \
   100		VERLLF $8, b2, b2  \
   101		VERLLF $8, c2, c2  \
   102		VERLLF $8, d2, d2  \
   103		VAF    a2, a3, a3  \
   104		VAF    b2, b3, b3  \
   105		VAF    c2, c3, c3  \
   106		VAF    d2, d3, d3  \
   107		VX     a3, a1, a1  \
   108		VX     b3, b1, b1  \
   109		VX     c3, c1, c1  \
   110		VX     d3, d1, d1  \
   111		VERLLF $7, a1, a1  \
   112		VERLLF $7, b1, b1  \
   113		VERLLF $7, c1, c1  \
   114		VERLLF $7, d1, d1
   115	
      	// PERMUTE applies the byte permutation selected by mask to each of
      	// v0..v3 in place (both VPERM source operands are the register itself).
   116	#define PERMUTE(mask, v0, v1, v2, v3) \
   117		VPERM v0, v0, mask, v0 \
   118		VPERM v1, v1, mask, v1 \
   119		VPERM v2, v2, mask, v2 \
   120		VPERM v3, v3, mask, v3
   121	
      	// ADDV adds vector x, 32-bit element by element, into each of v0..v3
      	// (v_i += x). x itself is left unchanged.
   122	#define ADDV(x, v0, v1, v2, v3) \
   123		VAF x, v0, v0 \
   124		VAF x, v1, v1 \
   125		VAF x, v2, v2 \
   126		VAF x, v3, v3
   127	
      	// XORV processes one 64-byte block: it loads 64 bytes from off(src) into
      	// M0-M3, byte-swaps every 32-bit word of the keystream vectors v0..v3
      	// with the BSWAP mask, XORs keystream into the loaded data and stores
      	// the 64 result bytes at off(dst).
      	// Clobbers M0-M3 and leaves v0..v3 byte-swapped.
   128	#define XORV(off, dst, src, v0, v1, v2, v3) \
   129		VLM  off(src), M0, M3          \
   130		PERMUTE(BSWAP, v0, v1, v2, v3) \
   131		VX   v0, M0, M0                \
   132		VX   v1, M1, M1                \
   133		VX   v2, M2, M2                \
   134		VX   v3, M3, M3                \
   135		VSTM M0, M3, off(dst)
   136	
      	// SHUFFLE transposes the 4x4 matrix of 32-bit elements held in a, b, c, d
      	// (one row per register) in place, using t, u, v, w as scratch. On exit
      	// register a holds element 0 of each input, b element 1, and so on, as
      	// spelled out by the per-line comments.
   137	#define SHUFFLE(a, b, c, d, t, u, v, w) \
   138		VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
   139		VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
   140		VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
   141		VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
   142		VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
   143		VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
   144		VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
   145		VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
   146	
   147	// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
      	//
      	// xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
      	// dst, generating 256 bytes (4 blocks) of keystream per iteration.
      	//
      	// Full 256-byte chunks are read from src and written to dst directly. If
      	// len(src) is not a multiple of 256, the trailing len(src)%256 bytes are
      	// first copied into buf, the last iteration runs in place on buf, and the
      	// same number of bytes is then copied from buf to dst. In that case
      	// 256 - len(src)%256 is stored through the len pointer; otherwise *len is
      	// not written. On return *counter holds the counter for the next block.
      	//
      	// Only the data pointer of dst is read; its length is never checked here.
      	// NOTE(review): with len(src) == 0 the code still appears to run one full
      	// iteration over buf and advance the counter by 4 - presumably callers
      	// never pass an empty src; confirm.
      	//
      	// General register use:
      	//   R0  scratch                R1  scratch / round counter
      	//   R2  dst cursor             R3  src cursor
      	//   R4  bytes remaining        R5  key
      	//   R6  nonce                  R7  counter
      	//   R8  buf                    R9  len pointer, later saved dst cursor
      	//   R12 (len(src)-1) & 255, i.e. the tail byte count minus one
   148	TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
   149		MOVD $·constants<>(SB), R1
   150		MOVD dst+0(FP), R2         // R2=&dst[0]
   151		LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
   152		MOVD key+48(FP), R5        // R5=key
   153		MOVD nonce+56(FP), R6      // R6=nonce
   154		MOVD counter+64(FP), R7    // R7=counter
   155		MOVD buf+72(FP), R8        // R8=buf
   156		MOVD len+80(FP), R9        // R9=len
   157	
   158		// load BSWAP and J0
   159		VLM (R1), BSWAP, J0
   160	
   161		// set up tail buffer
      		// R12 = (len(src)-1) & 255. A value of 255 means len(src) is a
      		// multiple of 256 and there is no partial block to buffer.
   162		ADD     $-1, R4, R12
   163		MOVBZ   R12, R12
   164		CMPUBEQ R12, $255, aligned
   165		MOVD    R4, R1
   166		AND     $~255, R1              // R1 = len(src) rounded down to a multiple of 256
   167		MOVD    $(R3)(R1*1), R1        // R1 = address of the first tail byte in src
   168		EXRL    $·mvcSrcToBuf(SB), R12 // copy the R12+1 tail bytes from src to buf
   169		MOVD    $255, R0
   170		SUB     R12, R0                // R0 = 255 - R12 = 256 - len(src)%256
   171		MOVD    R0, (R9)               // update len
   172	
   173	aligned:
   174		// setup
      		// NOTE(review): VLL takes the highest byte index to load in R0. 95 is
      		// larger than 15, so presumably all 16 bytes are read, i.e. 4 bytes
      		// beyond the 12-byte nonce. Those bytes are shifted out by the VSRLB
      		// below, so the result is unaffected, but 11 may have been intended
      		// (95 looks like a bit count, 96-1) - confirm.
   175		MOVD  $95, R0
   176		VLM   (R5), KEY0, KEY1
   177		VLL   R0, (R6), NONCE
   178		VZERO M0
   179		VLEIB $7, $32, M0              // shift amount for VSRLB: 32 bits
   180		VSRLB M0, NONCE, NONCE         // NONCE = [0, nonce[0], nonce[1], nonce[2]]
   181	
   182		// initialize counter values
      		// CTR = [c, c+1, c+2, c+3] where c = *counter, one counter per
      		// parallel block; afterwards INC = [4, 4, 4, 4].
   183		VLREPF (R7), CTR
   184		VZERO  INC
   185		VLEIF  $1, $1, INC
   186		VLEIF  $2, $2, INC
   187		VLEIF  $3, $3, INC
   188		VAF    INC, CTR, CTR
   189		VREPIF $4, INC
   190	
   191	chacha:
      		// Build the initial state for 4 blocks at once: register Xi holds
      		// state word i replicated across (or, for X12, distinct in) the four
      		// elements, one element per block.
   192		VREPF $0, J0, X0
   193		VREPF $1, J0, X1
   194		VREPF $2, J0, X2
   195		VREPF $3, J0, X3
   196		VREPF $0, KEY0, X4
   197		VREPF $1, KEY0, X5
   198		VREPF $2, KEY0, X6
   199		VREPF $3, KEY0, X7
   200		VREPF $0, KEY1, X8
   201		VREPF $1, KEY1, X9
   202		VREPF $2, KEY1, X10
   203		VREPF $3, KEY1, X11
   204		VLR   CTR, X12
   205		VREPF $1, NONCE, X13
   206		VREPF $2, NONCE, X14
   207		VREPF $3, NONCE, X15
   208	
   209		MOVD $(NUM_ROUNDS/2), R1
   210	
   211	loop:
      		// One iteration = a column round followed by a diagonal round.
   212		ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
   213		ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)
   214	
   215		ADD $-1, R1
   216		BNE loop
   217	
   218		// decrement length
      		// A negative result means fewer than 256 bytes were left: finish the
      		// final partial block through buf (see tail).
   219		ADD $-256, R4
   220		BLT tail
   221	
   222	continue:
   223		// rearrange vectors
      		// Each SHUFFLE transposes four state words so that, afterwards, a
      		// register holds four consecutive words of a single block; ADDV then
      		// adds the matching words of the initial state. The per-block counter
      		// is added to X12 before its transpose because it differs per block;
      		// NONCE has a zero in element 0, so it leaves the counter word alone.
   224		SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
   225		ADDV(J0, X0, X1, X2, X3)
   226		SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
   227		ADDV(KEY0, X4, X5, X6, X7)
   228		SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
   229		ADDV(KEY1, X8, X9, X10, X11)
   230		VAF CTR, X12, X12
   231		SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
   232		ADDV(NONCE, X12, X13, X14, X15)
   233	
   234		// increment counters
   235		VAF INC, CTR, CTR
   236	
   237		// xor keystream with plaintext
      		// One XORV per 64-byte block; block k is made of X(k), X(k+4),
      		// X(k+8), X(k+12).
   238		XORV(0*64, R2, R3, X0, X4,  X8, X12)
   239		XORV(1*64, R2, R3, X1, X5,  X9, X13)
   240		XORV(2*64, R2, R3, X2, X6, X10, X14)
   241		XORV(3*64, R2, R3, X3, X7, X11, X15)
   242	
   243		// increment pointers
   244		MOVD $256(R2), R2
   245		MOVD $256(R3), R3
   246	
   247		CMPBNE  R4, $0, chacha         // more input left: generate the next 4 blocks
   248		CMPUBEQ R12, $255, return      // no partial block, nothing to copy out of buf
   249		EXRL    $·mvcBufToDst(SB), R12 // len was updated during setup
   250	
   251	return:
      		// Element 0 of CTR is the counter of the next unused block.
   252		VSTEF $0, CTR, (R7)
   253		RET
   254	
   255	tail:
      		// Final partial block: remember where the output belongs (R9), then
      		// point both the dst and src cursors at buf so that "continue"
      		// processes buf in place, and set R4 to 0 so the main loop exits.
   256		MOVD R2, R9
   257		MOVD R8, R2
   258		MOVD R8, R3
   259		MOVD $0, R4
   260		JMP  continue
View as plain text