Text file src/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s
1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build s390x,!gccgo,!appengine
6
7 #include "go_asm.h"
8 #include "textflag.h"
9
10 // This is an implementation of the ChaCha20 encryption algorithm as
11 // specified in RFC 7539. It uses vector instructions to compute
12 // 4 keystream blocks in parallel (256 bytes) which are then XORed
13 // with the bytes in the input slice.
14
// constants is a 32-byte read-only table holding two 16-byte vector
// constants. xorKeyStreamVX loads both halves with a single
// VLM (R1), BSWAP, J0.
15 GLOBL ·constants<>(SB), RODATA|NOPTR, $32
16 // BSWAP: swap bytes in each 4-byte element
// Bytes 0-15. Used as the VPERM selector by PERMUTE/XORV: within every
// 32-bit word it picks source bytes 3, 2, 1, 0.
17 DATA ·constants<>+0x00(SB)/4, $0x03020100
18 DATA ·constants<>+0x04(SB)/4, $0x07060504
19 DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
20 DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
21 // J0: [j0, j1, j2, j3]
// Bytes 16-31. The four constant words that fill state words 0-3 of every
// block (X0-X3 below); read as little-endian bytes they spell the ASCII
// string "expand 32-byte k".
22 DATA ·constants<>+0x10(SB)/4, $0x61707865
23 DATA ·constants<>+0x14(SB)/4, $0x3320646e
24 DATA ·constants<>+0x18(SB)/4, $0x79622d32
25 DATA ·constants<>+0x1c(SB)/4, $0x6b206574
26
27 // EXRL targets:
// The routines below are not entered through a normal call in this file;
// each one exists only so that its MVC can be named as the operand of an
// EXRL in xorKeyStreamVX. EXRL executes that single MVC with the low byte
// of the EXRL register (R12) ORed into the MVC length field, so the copy
// moves R12+1 bytes. The RET is not part of what EXRL executes.
//
// mvcSrcToBuf copies from the address in R1 (start of the partial trailing
// chunk of src) to the address in R8 (buf).
28 TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
29 MVC $1, (R1), (R8)
30 RET
31
// mvcBufToDst is the target of the final EXRL in xorKeyStreamVX. It is
// executed with the low byte of R12 ORed into the MVC length field, so it
// copies R12+1 bytes from the address in R8 (buf) to the address in R9
// (the saved dst position for the partial trailing chunk). The RET is not
// part of what EXRL executes.
32 TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
33 MVC $1, (R8), (R9)
34 RET
35
// Vector register aliases used by xorKeyStreamVX and the macros below.
//
// V5-V15 hold values that stay live across the whole call.
// BSWAP and J0 are the two halves of ·constants<>.
36 #define BSWAP V5
37 #define J0 V6
// KEY0/KEY1: the 32 bytes loaded from the key pointer (state words 4-11).
38 #define KEY0 V7
39 #define KEY1 V8
// NONCE: the nonce words shifted into elements 1-3, with element 0 zero.
40 #define NONCE V9
// CTR: the block counter for each of the four parallel blocks.
41 #define CTR V10
// M0-M3: scratch. They receive 64 bytes of input in XORV, serve as the
// temporaries of SHUFFLE, and M0 carries the shift count during setup.
42 #define M0 V11
43 #define M1 V12
44 #define M2 V13
45 #define M3 V14
// INC: first the per-block counter offsets, then the per-iteration step.
46 #define INC V15
// X0-X15: working state. During the rounds, Xn holds state word n of all
// four blocks (one block per 32-bit element); after SHUFFLE each register
// holds four consecutive state words of a single block.
47 #define X0 V16
48 #define X1 V17
49 #define X2 V18
50 #define X3 V19
51 #define X4 V20
52 #define X5 V21
53 #define X6 V22
54 #define X7 V23
55 #define X8 V24
56 #define X9 V25
57 #define X10 V26
58 #define X11 V27
59 #define X12 V28
60 #define X13 V29
61 #define X14 V30
62 #define X15 V31
63
// NUM_ROUNDS is the total number of ChaCha rounds. The round loop in
// xorKeyStreamVX runs NUM_ROUNDS/2 iterations of two ROUND4 invocations
// each, so the value must be even.
64 #define NUM_ROUNDS 20
65
// ROUND4 runs one ChaCha quarter round on four independent register
// groups (a*, b*, c*, d*), interleaving the four instruction streams one
// step at a time. All operations work on 32-bit elements: VAF adds, VX
// XORs, VERLLF rotates left. For a group (x0, x1, x2, x3) the steps are:
//
//   x0 += x1; x2 ^= x0; x2 <<<= 16
//   x3 += x2; x1 ^= x3; x1 <<<= 12
//   x0 += x1; x2 ^= x0; x2 <<<= 8
//   x3 += x2; x1 ^= x3; x1 <<<= 7
//
// In the RFC 7539 quarter-round notation the four arguments of a group are
// therefore passed in the order a, b, d, c. Every argument register is
// updated in place; no other registers are touched.
66 #define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
67 VAF a1, a0, a0 \
68 VAF b1, b0, b0 \
69 VAF c1, c0, c0 \
70 VAF d1, d0, d0 \
71 VX a0, a2, a2 \
72 VX b0, b2, b2 \
73 VX c0, c2, c2 \
74 VX d0, d2, d2 \
75 VERLLF $16, a2, a2 \
76 VERLLF $16, b2, b2 \
77 VERLLF $16, c2, c2 \
78 VERLLF $16, d2, d2 \
79 VAF a2, a3, a3 \
80 VAF b2, b3, b3 \
81 VAF c2, c3, c3 \
82 VAF d2, d3, d3 \
83 VX a3, a1, a1 \
84 VX b3, b1, b1 \
85 VX c3, c1, c1 \
86 VX d3, d1, d1 \
87 VERLLF $12, a1, a1 \
88 VERLLF $12, b1, b1 \
89 VERLLF $12, c1, c1 \
90 VERLLF $12, d1, d1 \
91 VAF a1, a0, a0 \
92 VAF b1, b0, b0 \
93 VAF c1, c0, c0 \
94 VAF d1, d0, d0 \
95 VX a0, a2, a2 \
96 VX b0, b2, b2 \
97 VX c0, c2, c2 \
98 VX d0, d2, d2 \
99 VERLLF $8, a2, a2 \
100 VERLLF $8, b2, b2 \
101 VERLLF $8, c2, c2 \
102 VERLLF $8, d2, d2 \
103 VAF a2, a3, a3 \
104 VAF b2, b3, b3 \
105 VAF c2, c3, c3 \
106 VAF d2, d3, d3 \
107 VX a3, a1, a1 \
108 VX b3, b1, b1 \
109 VX c3, c1, c1 \
110 VX d3, d1, d1 \
111 VERLLF $7, a1, a1 \
112 VERLLF $7, b1, b1 \
113 VERLLF $7, c1, c1 \
114 VERLLF $7, d1, d1
115
// PERMUTE rearranges the bytes of four vectors in place. Each vector is
// passed to VPERM as both source operands, with sel as the byte selector,
// and the result overwrites the same vector.
116 #define PERMUTE(sel, a, b, c, d) \
117 VPERM a, a, sel, a \
118 VPERM b, b, sel, b \
119 VPERM c, c, sel, c \
120 VPERM d, d, sel, d
121
// ADDV adds the vector term, 32-bit element by element (VAF), to each of
// the four vectors a, b, c and d in place. term itself is not modified.
122 #define ADDV(term, a, b, c, d) \
123 VAF term, a, a \
124 VAF term, b, b \
125 VAF term, c, c \
126 VAF term, d, d
127
// XORV processes one 64-byte chunk: it loads 64 bytes from disp(from) into
// M0-M3, byte-swaps every 32-bit word of the four keystream vectors
// k0-k3 (PERMUTE with BSWAP), XORs them into M0-M3 and stores the 64
// result bytes at disp(to).
// Clobbers M0-M3 and leaves k0-k3 byte-swapped.
128 #define XORV(disp, to, from, k0, k1, k2, k3) \
129 VLM disp(from), M0, M3 \
130 PERMUTE(BSWAP, k0, k1, k2, k3) \
131 VX k0, M0, M0 \
132 VX k1, M1, M1 \
133 VX k2, M2, M2 \
134 VX k3, M3, M3 \
135 VSTM M0, M3, disp(to)
136
// SHUFFLE transposes the 4x4 matrix of 32-bit words held in row0-row3, in
// place: afterwards rowN holds element N of each of the original four rows.
// hi02, hi13, lo02 and lo13 are temporaries and are overwritten.
137 #define SHUFFLE(row0, row1, row2, row3, hi02, hi13, lo02, lo13) \
138 VMRHF row0, row2, hi02 \ // hi02 = {row0[0], row2[0], row0[1], row2[1]}
139 VMRHF row1, row3, hi13 \ // hi13 = {row1[0], row3[0], row1[1], row3[1]}
140 VMRLF row0, row2, lo02 \ // lo02 = {row0[2], row2[2], row0[3], row2[3]}
141 VMRLF row1, row3, lo13 \ // lo13 = {row1[2], row3[2], row1[3], row3[3]}
142 VMRHF hi02, hi13, row0 \ // row0 = element 0 of each original row
143 VMRLF hi02, hi13, row1 \ // row1 = element 1 of each original row
144 VMRHF lo02, lo13, row2 \ // row2 = element 2 of each original row
145 VMRLF lo02, lo13, row3 // row3 = element 3 of each original row
146
147 // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
//
// xorKeyStreamVX XORs src with the ChaCha20 keystream and writes the result
// to dst. Each pass of the outer loop computes four 64-byte blocks in
// parallel (one block per 32-bit vector element) and consumes 256 bytes.
//
// Trailing partial chunk: when len(src) is not a multiple of 256, the last
// len(src)%256 bytes of src are first copied into buf. The final pass then
// runs over buf in place and the same number of bytes is copied from buf to
// dst. In that case *len is set to 256 - len(src)%256, the number of bytes
// of buf that were processed but not copied out. When len(src) is a
// multiple of 256, buf and *len are not written.
//
// *counter is read once at entry and written once at exit, advanced by 4
// for every pass of the outer loop (including the pass over buf).
//
// Register roles:
//   R1  constants address, then tail source address, then round counter
//   R2  current dst position     R3  current src position
//   R4  bytes remaining          R5  key      R6  nonce     R7  counter
//   R8  buf                      R9  len pointer, later the saved dst
//                                    position for the trailing chunk
//   R12 (len(src)-1) & 255; the value 255 means "no trailing chunk"
//   R0  scratch
//
// NOTE(review): with len(src) == 0, R12 is 255 and the first length
// decrement goes negative, so one pass runs over buf and the counter
// advances by 4 although nothing is written to dst. Callers presumably
// never pass an empty src - verify against the Go wrapper.
148 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
149 MOVD $·constants<>(SB), R1
150 MOVD dst+0(FP), R2 // R2=&dst[0]
151 LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
152 MOVD key+48(FP), R5 // R5=key
153 MOVD nonce+56(FP), R6 // R6=nonce
154 MOVD counter+64(FP), R7 // R7=counter
155 MOVD buf+72(FP), R8 // R8=buf
156 MOVD len+80(FP), R9 // R9=len
157
158 // load BSWAP and J0
159 VLM (R1), BSWAP, J0
160
161 // set up tail buffer
// R12 = (len(src)-1) & 255, i.e. one less than the size of the trailing
// partial chunk; 255 means the length is a multiple of 256.
162 ADD $-1, R4, R12
163 MOVBZ R12, R12
164 CMPUBEQ R12, $255, aligned
// R1 = &src[len(src) &^ 255], the start of the trailing chunk.
165 MOVD R4, R1
166 AND $~255, R1
167 MOVD $(R3)(R1*1), R1
// copy the R12+1 trailing bytes of src into buf
168 EXRL $·mvcSrcToBuf(SB), R12
169 MOVD $255, R0
170 SUB R12, R0
171 MOVD R0, (R9) // update len
172
173 aligned:
174 // setup
// R0 is the highest byte index VLL loads: 11 reads exactly the 12 nonce
// bytes. A value of 15 or more would load a full 16 bytes and read 4 bytes
// beyond *nonce; bytes 12-15 are shifted out by VSRLB below either way.
175 MOVD $11, R0
176 VLM (R5), KEY0, KEY1
177 VLL R0, (R6), NONCE
// Shift NONCE right by 4 bytes (shift count 32 bits in byte 7 of M0) so
// that it becomes [0, n0, n1, n2].
178 VZERO M0
179 VLEIB $7, $32, M0
180 VSRLB M0, NONCE, NONCE
181
182 // initialize counter values
// CTR = [c, c+1, c+2, c+3] where c = *counter; INC then becomes
// [4, 4, 4, 4], the step applied after every pass.
183 VLREPF (R7), CTR
184 VZERO INC
185 VLEIF $1, $1, INC
186 VLEIF $2, $2, INC
187 VLEIF $3, $3, INC
188 VAF INC, CTR, CTR
189 VREPIF $4, INC
190
191 chacha:
// Build the initial state for four blocks: Xn holds state word n of every
// block, one block per element.
192 VREPF $0, J0, X0
193 VREPF $1, J0, X1
194 VREPF $2, J0, X2
195 VREPF $3, J0, X3
196 VREPF $0, KEY0, X4
197 VREPF $1, KEY0, X5
198 VREPF $2, KEY0, X6
199 VREPF $3, KEY0, X7
200 VREPF $0, KEY1, X8
201 VREPF $1, KEY1, X9
202 VREPF $2, KEY1, X10
203 VREPF $3, KEY1, X11
204 VLR CTR, X12
205 VREPF $1, NONCE, X13
206 VREPF $2, NONCE, X14
207 VREPF $3, NONCE, X15
208
209 MOVD $(NUM_ROUNDS/2), R1
210
211 loop:
// One iteration is a double round: a column round followed by a diagonal
// round. ROUND4 takes each quarter round's registers in the order a, b, d, c.
212 ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
213 ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
214
// BNE tests the condition code set by the ADD.
215 ADD $-1, R1
216 BNE loop
217
218 // decrement length
// A negative result means fewer than 256 bytes were left: switch to buf.
219 ADD $-256, R4
220 BLT tail
221
222 continue:
223 // rearrange vectors
// Transpose each group of four state words so that every register holds
// four consecutive words of one block, then add the initial state words.
// The per-block counters are added to X12 before its transpose; NONCE has
// a zero in element 0, so the ADDV below leaves the counter word alone.
224 SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
225 ADDV(J0, X0, X1, X2, X3)
226 SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
227 ADDV(KEY0, X4, X5, X6, X7)
228 SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
229 ADDV(KEY1, X8, X9, X10, X11)
230 VAF CTR, X12, X12
231 SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
232 ADDV(NONCE, X12, X13, X14, X15)
233
234 // increment counters
235 VAF INC, CTR, CTR
236
237 // xor keystream with plaintext
// One XORV per block: registers Xk, Xk+4, Xk+8, Xk+12 are the 64 bytes of
// block k.
238 XORV(0*64, R2, R3, X0, X4, X8, X12)
239 XORV(1*64, R2, R3, X1, X5, X9, X13)
240 XORV(2*64, R2, R3, X2, X6, X10, X14)
241 XORV(3*64, R2, R3, X3, X7, X11, X15)
242
243 // increment pointers
244 MOVD $256(R2), R2
245 MOVD $256(R3), R3
246
// Loop while input remains. Once done, copy the trailing chunk from buf to
// the saved dst position unless the length was a multiple of 256.
247 CMPBNE R4, $0, chacha
248 CMPUBEQ R12, $255, return
249 EXRL $·mvcBufToDst(SB), R12 // len was updated during setup
250
251 return:
// store the counter for the next block (element 0 of CTR) back to *counter
252 VSTEF $0, CTR, (R7)
253 RET
254
255 tail:
// Fewer than 256 bytes remain. Save the dst position in R9 (the len pointer
// is no longer needed), process buf in place and mark this as the last pass.
256 MOVD R2, R9
257 MOVD R8, R2
258 MOVD R8, R3
259 MOVD $0, R4
260 JMP continue
View as plain text