Text file src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build s390x,go1.11,!gccgo,!appengine
6
7 #include "textflag.h"
8
9 // Implementation of Poly1305 using the vector facility (vx).
10
11 // constants (loaded from ·constants<> at the start of poly1305vx)
12 #define MOD26 V0 // mask keeping the low 26 bits of each doubleword
13 #define EX0 V1 // VPERM byte selectors used by EXPAND
14 #define EX1 V2 // VPERM byte selectors used by EXPAND
15 #define EX2 V3 // VPERM byte selectors used by EXPAND
16
17 // temporaries (clobbered by the MULTIPLY and REDUCE macros)
18 #define T_0 V4
19 #define T_1 V5
20 #define T_2 V6
21 #define T_3 V7
22 #define T_4 V8
23
24 // key (r), split into five limbs by EXPAND; R5_n is used as 5*R_n
25 #define R_0 V9
26 #define R_1 V10
27 #define R_2 V11
28 #define R_3 V12
29 #define R_4 V13
30 #define R5_1 V14
31 #define R5_2 V15
32 #define R5_3 V16
33 #define R5_4 V17
34 #define RSAVE_0 R5 // RSAVE_n: general-purpose copy of element 0 of R_n (the limbs of r)
35 #define RSAVE_1 R6
36 #define RSAVE_2 R7
37 #define RSAVE_3 R8
38 #define RSAVE_4 R9
39 #define R5SAVE_1 V28 // R5SAVE_n: 5 * limb n of r, kept for the final block(s)
40 #define R5SAVE_2 V29
41 #define R5SAVE_3 V30
42 #define R5SAVE_4 V31
43
44 // message block (five limbs per lane, produced by EXPAND)
45 #define F_0 V18
46 #define F_1 V19
47 #define F_2 V20
48 #define F_3 V21
49 #define F_4 V22
50
51 // accumulator (five limbs per lane)
52 #define H_0 V23
53 #define H_1 V24
54 #define H_2 V25
55 #define H_3 V26
56 #define H_4 V27
57
58 GLOBL ·keyMask<>(SB), RODATA, $16 // ANDed with the first 16 key bytes before they are expanded into r
59 DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
60 DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
61
62 GLOBL ·bswapMask<>(SB), RODATA, $16 // VPERM selectors 15..0: reverses the 16 bytes of a vector
63 DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
64 DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
65
66 GLOBL ·constants<>(SB), RODATA, $64 // four 16-byte vectors, loaded into MOD26, EX0, EX1, EX2 by one VLM
67 // MOD26: low 26 bits set in each doubleword
68 DATA ·constants<>+0(SB)/8, $0x3ffffff
69 DATA ·constants<>+8(SB)/8, $0x3ffffff
70 // EX0: VPERM selectors for input bytes 0-6 in reversed order (0x0n picks from in0, 0x1n from in1)
71 DATA ·constants<>+16(SB)/8, $0x0006050403020100
72 DATA ·constants<>+24(SB)/8, $0x1016151413121110
73 // EX1: VPERM selectors for input bytes 6-12 in reversed order
74 DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
75 DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
76 // EX2: VPERM selectors for input bytes 13-15 in reversed order (upper bytes are masked off in EXPAND)
77 DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
78 DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
79
80 // h = (f*g) % (2**130-5) [partial reduction]; g5n is used as 5*gn; clobbers T_0-T_4; h0-h4 are written before all inputs are read, so they must not alias f or g
81 #define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
82 VMLOF f0, g0, h0 \ // h0..h4 = f0 * (g0, g1, g2, g3, g4)
83 VMLOF f0, g1, h1 \
84 VMLOF f0, g2, h2 \
85 VMLOF f0, g3, h3 \
86 VMLOF f0, g4, h4 \
87 VMLOF f1, g54, T_0 \ // T_0..T_4 = f1 * (5*g4, g0, g1, g2, g3)
88 VMLOF f1, g0, T_1 \
89 VMLOF f1, g1, T_2 \
90 VMLOF f1, g2, T_3 \
91 VMLOF f1, g3, T_4 \
92 VMALOF f2, g53, h0, h0 \ // h0..h4 += f2 * (5*g3, 5*g4, g0, g1, g2)
93 VMALOF f2, g54, h1, h1 \
94 VMALOF f2, g0, h2, h2 \
95 VMALOF f2, g1, h3, h3 \
96 VMALOF f2, g2, h4, h4 \
97 VMALOF f3, g52, T_0, T_0 \ // T_0..T_4 += f3 * (5*g2, 5*g3, 5*g4, g0, g1)
98 VMALOF f3, g53, T_1, T_1 \
99 VMALOF f3, g54, T_2, T_2 \
100 VMALOF f3, g0, T_3, T_3 \
101 VMALOF f3, g1, T_4, T_4 \
102 VMALOF f4, g51, h0, h0 \ // h0..h4 += f4 * (5*g1, 5*g2, 5*g3, 5*g4, g0)
103 VMALOF f4, g52, h1, h1 \
104 VMALOF f4, g53, h2, h2 \
105 VMALOF f4, g54, h3, h3 \
106 VMALOF f4, g0, h4, h4 \
107 VAG T_0, h0, h0 \ // merge the two partial sums: hn += T_n
108 VAG T_1, h1, h1 \
109 VAG T_2, h2, h2 \
110 VAG T_3, h3, h3 \
111 VAG T_4, h4, h4
112
113 // carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4 (in place; uses MOD26, clobbers T_0-T_4)
114 #define REDUCE(h0, h1, h2, h3, h4) \
115 VESRLG $26, h0, T_0 \ // T_0, T_1 = carries out of h0, h3
116 VESRLG $26, h3, T_1 \
117 VN MOD26, h0, h0 \ // keep the low 26 bits of each limb
118 VN MOD26, h3, h3 \
119 VAG T_0, h1, h1 \ // h0->h1
120 VAG T_1, h4, h4 \ // h3->h4
121 VESRLG $26, h1, T_2 \ // T_2, T_3 = carries out of h1, h4
122 VESRLG $26, h4, T_3 \
123 VN MOD26, h1, h1 \
124 VN MOD26, h4, h4 \
125 VESLG $2, T_3, T_4 \ // T_4 = 4 * (carry out of h4)
126 VAG T_3, T_4, T_4 \ // T_4 = 5 * (carry out of h4): 2**130 is congruent to 5 mod 2**130-5
127 VAG T_2, h2, h2 \ // h1->h2
128 VAG T_4, h0, h0 \ // h4->h0
129 VESRLG $26, h2, T_0 \ // T_0, T_1 = carries out of h2, h0
130 VESRLG $26, h0, T_1 \
131 VN MOD26, h2, h2 \
132 VN MOD26, h0, h0 \
133 VAG T_0, h3, h3 \ // h2->h3
134 VAG T_1, h1, h1 \ // h0->h1
135 VESRLG $26, h3, T_2 \
136 VN MOD26, h3, h3 \
137 VAG T_2, h4, h4 // h3->h4
138
139 // expand in0 into d[0] and in1 into d[1]: each 16-byte input becomes limbs d0-d3 (26 bits) and d4 (24 bits); uses MOD26, EX0, EX1, EX2
140 #define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
141 VGBM $0x0707, d1 \ // d1=tmp: mask for the low 3 bytes of each doubleword
142 VPERM in0, in1, EX2, d4 \
143 VPERM in0, in1, EX0, d0 \
144 VPERM in0, in1, EX1, d2 \
145 VN d1, d4, d4 \ // d4 = bits 104-127
146 VESRLG $26, d0, d1 \ // d1 = bits 26-51 (after masking below)
147 VESRLG $30, d2, d3 \ // d3 = bits 78-103 (after masking below)
148 VESRLG $4, d2, d2 \ // d2 = bits 52-77 (after masking below)
149 VN MOD26, d0, d0 \ // d0 = bits 0-25
150 VN MOD26, d1, d1 \
151 VN MOD26, d2, d2 \
152 VN MOD26, d3, d3
153
154 // pack h4:h0 into h1:h0 (no carry); on exit h0 = low 128 bits, h1 = remaining high bits, h2 and h3 are clobbered
155 #define PACK(h0, h1, h2, h3, h4) \
156 VESLG $26, h1, h1 \
157 VESLG $26, h3, h3 \
158 VO h0, h1, h0 \ // h0 = h0 | h1<<26
159 VO h2, h3, h2 \ // h2 = h2 | h3<<26
160 VESLG $4, h2, h2 \
161 VLEIB $7, $48, h1 \ // h1 now only carries the shift count for VSLB: 48 bits
162 VSLB h1, h2, h2 \ // h2 has been shifted left by 4+48 = 52 bits in total
163 VO h0, h2, h0 \
164 VLEIB $7, $104, h1 \ // shift count: 104 bits
165 VSLB h1, h4, h3 \ // h3 = h4<<104 (bits that still fit in the low 128)
166 VO h3, h0, h0 \
167 VLEIB $7, $24, h1 \ // shift count: 24 bits
168 VSRLB h1, h4, h1 // h1 = h4>>24 (bits 128 and above)
169
170 // if h >= 2**130-5 then h -= 2**130-5 (h is h1:h0; the 128-bit result is left in h0; t0-t2 are scratch)
171 #define MOD(h0, h1, t0, t1, t2) \
172 VZERO t0 \
173 VLEIG $1, $5, t0 \ // t0 = 5
174 VACCQ h0, t0, t1 \ // t1 = carry out of h0+5
175 VAQ h0, t0, t0 \ // t0 = h0+5 (low 128 bits)
176 VONE t2 \
177 VLEIG $1, $-4, t2 \ // t2 = -4 as a 128-bit value
178 VAQ t2, t1, t1 \ // t1 = carry-4
179 VACCQ h1, t1, t1 \ // t1 = 1 if h1+carry >= 4, i.e. h+5 >= 2**130
180 VONE t2 \
181 VAQ t2, t1, t1 \ // t1 = 0 when reducing, all ones otherwise
182 VN h0, t1, t2 \ // t2 = h0 when not reducing
183 VNC t0, t1, t1 \ // t1 = h0+5 when reducing
184 VO t1, t2, h0
185
186 // func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
187 TEXT ·poly1305vx(SB), $0-32
188 // This code processes up to 2 blocks (32 bytes) per iteration
189 // using the algorithm described in:
190 // NEON crypto, Daniel J. Bernstein & Peter Schwabe
191 // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
192 LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
193
194 // load MOD26, EX0, EX1 and EX2
195 MOVD $·constants<>(SB), R5
196 VLM (R5), MOD26, EX2
197
198 // setup r
199 VL (R4), T_0 // first 16 bytes of the key
200 MOVD $·keyMask<>(SB), R6
201 VL (R6), T_1
202 VN T_0, T_1, T_0
203 EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4) // same input for both lanes: R_n = [r, r]
204
205 // setup r*5
206 VLEIG $0, $5, T_0
207 VLEIG $1, $5, T_0
208
209 // store r and r*5 (for final block)
210 VMLOF T_0, R_1, R5SAVE_1
211 VMLOF T_0, R_2, R5SAVE_2
212 VMLOF T_0, R_3, R5SAVE_3
213 VMLOF T_0, R_4, R5SAVE_4
214 VLGVG $0, R_0, RSAVE_0
215 VLGVG $0, R_1, RSAVE_1
216 VLGVG $0, R_2, RSAVE_2
217 VLGVG $0, R_3, RSAVE_3
218 VLGVG $0, R_4, RSAVE_4
219
220 // skip r**2 calculation (message is at most one block)
221 CMPBLE R3, $16, skip
222
223 // calculate r**2
224 MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
225 REDUCE(H_0, H_1, H_2, H_3, H_4)
226 VLEIG $0, $5, T_0 // T_0 was clobbered by the macros: reload 5 into both lanes
227 VLEIG $1, $5, T_0
228 VMLOF T_0, H_1, R5_1 // R5_n = 5 * limb n of r**2
229 VMLOF T_0, H_2, R5_2
230 VMLOF T_0, H_3, R5_3
231 VMLOF T_0, H_4, R5_4
232 VLR H_0, R_0 // R_n = limbs of r**2 in both lanes
233 VLR H_1, R_1
234 VLR H_2, R_2
235 VLR H_3, R_3
236 VLR H_4, R_4
237
238 // initialize h
239 VZERO H_0
240 VZERO H_1
241 VZERO H_2
242 VZERO H_3
243 VZERO H_4
244
245 loop:
246 CMPBLE R3, $32, b2 // 32 bytes or fewer left: handle them as the final block(s)
247 VLM (R2), T_0, T_1
248 SUB $32, R3
249 MOVD $32(R2), R2
250 EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
251 VLEIB $4, $1, F_4 // add the 2**128 bit to the block in lane 0
252 VLEIB $12, $1, F_4 // add the 2**128 bit to the block in lane 1
253
254 multiply:
255 VAG H_0, F_0, F_0 // F += H
256 VAG H_1, F_1, F_1
257 VAG H_2, F_2, F_2
258 VAG H_3, F_3, F_3
259 VAG H_4, F_4, F_4
260 MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
261 REDUCE(H_0, H_1, H_2, H_3, H_4)
262 CMPBNE R3, $0, loop // b1 and b2 set R3 to 0, so they fall through to finish
263
264 finish:
265 // sum vectors (add the two lanes of each limb together)
266 VZERO T_0
267 VSUMQG H_0, T_0, H_0
268 VSUMQG H_1, T_0, H_1
269 VSUMQG H_2, T_0, H_2
270 VSUMQG H_3, T_0, H_3
271 VSUMQG H_4, T_0, H_4
272
273 // h may be >= 2*(2**130-5) so we need to reduce it again
274 REDUCE(H_0, H_1, H_2, H_3, H_4)
275
276 // carry h1->h4
277 VESRLG $26, H_1, T_1
278 VN MOD26, H_1, H_1
279 VAQ T_1, H_2, H_2
280 VESRLG $26, H_2, T_2
281 VN MOD26, H_2, H_2
282 VAQ T_2, H_3, H_3
283 VESRLG $26, H_3, T_3
284 VN MOD26, H_3, H_3
285 VAQ T_3, H_4, H_4
286
287 // h is now < 2*(2**130-5)
288 // pack h into h1 (hi) and h0 (lo)
289 PACK(H_0, H_1, H_2, H_3, H_4)
290
291 // if h >= 2**130-5 then h -= 2**130-5
292 MOD(H_0, H_1, T_0, T_1, T_2)
293
294 // h += s
295 MOVD $·bswapMask<>(SB), R5
296 VL (R5), T_1
297 VL 16(R4), T_0 // s = second 16 bytes of the key
298 VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
299 VAQ T_0, H_0, H_0
300 VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
301 VST H_0, (R1)
302
303 RET
304
305 b2:
306 CMPBLE R3, $16, b1
307
308 // 2 blocks remaining
309 SUB $17, R3 // R3 = index of the last byte of the second block
310 VL (R2), T_0
311 VLL R3, 16(R2), T_1 // load only the bytes that remain in the second block
312 ADD $1, R3 // R3 = length of the second block
313 MOVBZ $1, R0
314 CMPBEQ R3, $16, 2(PC) // full block: skip the padding byte
315 VLVGB R3, R0, T_1 // partial block: insert a 1 byte right after the data
316 EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
317 CMPBNE R3, $16, 2(PC)
318 VLEIB $12, $1, F_4 // second block is full: add its 2**128 bit
319 VLEIB $4, $1, F_4 // first block is always full: add its 2**128 bit
320
321 // setup [r²,r]
322 VLVGG $1, RSAVE_0, R_0
323 VLVGG $1, RSAVE_1, R_1
324 VLVGG $1, RSAVE_2, R_2
325 VLVGG $1, RSAVE_3, R_3
326 VLVGG $1, RSAVE_4, R_4
327 VPDI $0, R5_1, R5SAVE_1, R5_1
328 VPDI $0, R5_2, R5SAVE_2, R5_2
329 VPDI $0, R5_3, R5SAVE_3, R5_3
330 VPDI $0, R5_4, R5SAVE_4, R5_4
331
332 MOVD $0, R3 // nothing left after this multiply
333 BR multiply
334
335 skip:
336 VZERO H_0
337 VZERO H_1
338 VZERO H_2
339 VZERO H_3
340 VZERO H_4
341
342 CMPBEQ R3, $0, finish // empty message: h stays 0
343
344 b1:
345 // 1 block remaining
346 SUB $1, R3 // R3 = index of the last byte of the block
347 VLL R3, (R2), T_0 // load only the bytes that remain
348 ADD $1, R3 // R3 = length of the block
349 MOVBZ $1, R0
350 CMPBEQ R3, $16, 2(PC) // full block: skip the padding byte
351 VLVGB R3, R0, T_0 // partial block: insert a 1 byte right after the data
352 VZERO T_1 // no second block: lane 1 of F is zero before H is added
353 EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
354 CMPBNE R3, $16, 2(PC)
355 VLEIB $4, $1, F_4 // block is full: add its 2**128 bit
356 VLEIG $1, $1, R_0 // lane 1 multiplier = 1 (limb 0 = 1, other limbs zeroed below)
357 VZERO R_1
358 VZERO R_2
359 VZERO R_3
360 VZERO R_4
361 VZERO R5_1
362 VZERO R5_2
363 VZERO R5_3
364 VZERO R5_4
365
366 // setup [r, 1]
367 VLVGG $0, RSAVE_0, R_0
368 VLVGG $0, RSAVE_1, R_1
369 VLVGG $0, RSAVE_2, R_2
370 VLVGG $0, RSAVE_3, R_3
371 VLVGG $0, RSAVE_4, R_4
372 VPDI $0, R5SAVE_1, R5_1, R5_1
373 VPDI $0, R5SAVE_2, R5_2, R5_2
374 VPDI $0, R5SAVE_3, R5_3, R5_3
375 VPDI $0, R5SAVE_4, R5_4, R5_4
376
377 MOVD $0, R3 // nothing left after this multiply
378 BR multiply
View as plain text