Text file src/vendor/golang.org/x/crypto/poly1305/sum_arm.s
1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build arm,!gccgo,!appengine,!nacl
6
7 #include "textflag.h"
8
9 // This code was translated into a form compatible with 5a from the public
10 // domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
11
// Five 32-bit masks that poly1305_init_ext_armv6 ANDs with the five words it
// derives from the first 16 key bytes. Each mask keeps at most 26 bits and
// also clears the bits that the Poly1305 clamp of r
// (0x0ffffffc0ffffffc0ffffffc0fffffff) clears, re-expressed in radix 2^26.
12 DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff // limb 0: key bits 0-25
13 DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03 // limb 1: key bits 26-51, clamp clears limb bits 2-7
14 DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff // limb 2: key bits 52-77, clamp clears limb bits 8-13
15 DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff // limb 3: key bits 78-103, clamp clears limb bits 14-19
16 DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff // limb 4: key bits 104-127, clamp clears limb bits 20-23
17 GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20 // 5 words = 20 bytes; flag 8 is presumably RODATA from textflag.h - confirm
18
19 // Warning: the linker may use R11 to synthesize certain instructions. Please
20 // take care and verify that no synthetic instructions use it.
21
// poly1305_init_ext_armv6 fills in the context at R0 from the 32 key bytes
// at R1. Layout written here (byte offsets from R0):
//   0-16   five masked 26-bit limbs built from key bytes 0-15
//   20-36  five zero words (the accumulator used by poly1305_blocks_armv6)
//   40-52  key bytes 16-31, copied as four words
//   56     zero (tested by poly1305_blocks_armv6)
// On return R0 points at offset 40 of the context and R1 at key+32.
// Clobbers R0-R3, R8, R9, g, R11, R12; R4-R7 are saved and restored.
22 TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
23 // Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It
24 // might look like it's only 60 bytes of space but the final four bytes
25 // will be written by another function.) We need to skip over four
26 // bytes of stack because that's saving the value of 'g'.
27 ADD $4, R13, R8 // base = R13+4; MOVM.IB pre-increments, so the first store lands at 8(R13)
28 MOVM.IB [R4-R7], (R8) // save R4-R7 at 8(R13)..20(R13)
29 MOVM.IA.W (R1), [R2-R5] // R2-R5 = key bytes 0-15 as four words; R1 += 16
30 MOVW $·poly1305_init_constants_armv6<>(SB), R7 // R7 = address of the mask table
31 MOVW R2, R8 // limb 0 source: word 0 (masked to 26 bits below)
32 MOVW R2>>26, R9 // limb 1 source: top 6 bits of word 0 ...
33 MOVW R3>>20, g // limb 2 source: top 12 bits of word 1 ...
34 MOVW R4>>14, R11 // limb 3 source: top 18 bits of word 2 ...
35 MOVW R5>>8, R12 // limb 4 source: top 24 bits of word 3
36 ORR R3<<6, R9, R9 // ... joined with the low bits of word 1
37 ORR R4<<12, g, g // ... joined with the low bits of word 2
38 ORR R5<<18, R11, R11 // ... joined with the low bits of word 3
39 MOVM.IA (R7), [R2-R6] // R2-R6 = the five masks (key words in R2-R5 are no longer needed)
40 AND R8, R2, R2 // R2 = limb 0
41 AND R9, R3, R3 // R3 = limb 1
42 AND g, R4, R4 // R4 = limb 2
43 AND R11, R5, R5 // R5 = limb 3
44 AND R12, R6, R6 // R6 = limb 4
45 MOVM.IA.W [R2-R6], (R0) // context[0..16] = limbs; R0 += 20
46 EOR R2, R2, R2 // zero R2-R6
47 EOR R3, R3, R3
48 EOR R4, R4, R4
49 EOR R5, R5, R5
50 EOR R6, R6, R6
51 MOVM.IA.W [R2-R6], (R0) // context[20..36] = 0; R0 += 20
52 MOVM.IA.W (R1), [R2-R5] // R2-R5 = key bytes 16-31; R1 += 16
53 MOVM.IA [R2-R6], (R0) // context[40..52] = key bytes 16-31; context[56] = 0 (R6 is still zero)
54 ADD $20, R13, R0 // MOVM.DA reads downwards starting at 20(R13)
55 MOVM.DA (R0), [R4-R7] // restore R4-R7 from 8(R13)..20(R13)
56 RET
57
// MOVW_UNALIGNED copies the four bytes at offset+0..offset+3 from Rsrc to the
// same offsets from Rdst, one byte at a time, so neither address needs to be
// word aligned. Rsrc and Rdst are left unchanged; Rtmp is clobbered.
58 #define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
59 MOVBU (offset+0)(Rsrc), Rtmp; \
60 MOVBU Rtmp, (offset+0)(Rdst); \
61 MOVBU (offset+1)(Rsrc), Rtmp; \
62 MOVBU Rtmp, (offset+1)(Rdst); \
63 MOVBU (offset+2)(Rsrc), Rtmp; \
64 MOVBU Rtmp, (offset+2)(Rdst); \
65 MOVBU (offset+3)(Rsrc), Rtmp; \
66 MOVBU Rtmp, (offset+3)(Rdst)
67
68 TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
69 // Needs 24 bytes of stack for saved registers and then 88 bytes of
70 // scratch space after that. We assume that 24 bytes at (R13) have
71 // already been used: four bytes for the link register saved in the
72 // prelude of poly1305_auth_armv6, four bytes for saving the value of g
73 // in that function and 16 bytes of scratch space used around
74 // poly1305_finish_ext_armv6_skip1.
//
// Arguments: R0 = context, R1 = message pointer, R2 = length in bytes.
// Only whole 16-byte blocks are consumed; nothing is processed if R2 < 16.
// For each block, with h = context[20..36] and r = context[0..16] held as
// five 26-bit limbs, the loop computes h = (h + block + pad) * r, folding the
// overflow of the top limb back into limb 0 multiplied by 5.
// Clobbers R0-R3, R9, g, R11, R12 and the flags; R4-R8 and R14 are saved
// and restored.
//
// Offsets from R13 used below:
//   28-48     saved R4-R8, R14
//   52-80     64-bit sums d1, d2, d3, d4 (low word, then high word)
//   84        pad bit for limb 4: 1<<24, or 0 when context[56] is non-zero
//   88/92/96  context pointer / message pointer / remaining length
//   100-115   bounce buffer for a block that is not word aligned
//   116-132   r0..r4, copied from the start of the context
75 ADD $24, R13, R12 // base = R13+24; MOVM.IB pre-increments, so the first store lands at 28(R13)
76 MOVM.IB [R4-R8, R14], (R12) // save R4-R8 and the link register at 28(R13)..48(R13)
77 MOVW R0, 88(R13) // spill context pointer
78 MOVW R1, 92(R13) // spill message pointer
79 MOVW R2, 96(R13) // spill length
80 MOVW R1, R14 // R14 = message pointer for the loop
81 MOVW R2, R12 // R12 = length
82 MOVW 56(R0), R8 // R8 = context[56]
83 WORD $0xe1180008 // TST R8, R8 not working see issue 5921
84 EOR R6, R6, R6 // R6 = 0 (no .S suffix, so the TST flags survive)
85 MOVW.EQ $(1<<24), R6 // if context[56] == 0: R6 = 1<<24, i.e. bit 128 expressed in limb 4
86 MOVW R6, 84(R13) // spill the pad bit
87 ADD $116, R13, g // g = address of the r0..r4 scratch copy
88 MOVM.IA (R0), [R0-R9] // R0-R4 = r0..r4 (context[0..16]); R5-R9 = h0..h4 (context[20..36])
89 MOVM.IA [R0-R4], (g) // copy r0..r4 to 116(R13)..132(R13)
90 CMP $16, R12
91 BLO poly1305_blocks_armv6_done // fewer than 16 bytes: store h back unchanged
92
93 poly1305_blocks_armv6_mainloop:
94 WORD $0xe31e0003 // TST R14, #3 not working see issue 5921
95 BEQ poly1305_blocks_armv6_mainloop_aligned
96 ADD $100, R13, g // unaligned input: copy the block bytewise into the bounce buffer
97 MOVW_UNALIGNED(R14, g, R0, 0)
98 MOVW_UNALIGNED(R14, g, R0, 4)
99 MOVW_UNALIGNED(R14, g, R0, 8)
100 MOVW_UNALIGNED(R14, g, R0, 12)
101 MOVM.IA (g), [R0-R3] // R0-R3 = the four words of the block
102 ADD $16, R14 // advance the message pointer past this block
103 B poly1305_blocks_armv6_mainloop_loaded
104
105 poly1305_blocks_armv6_mainloop_aligned:
106 MOVM.IA.W (R14), [R0-R3] // R0-R3 = the four words of the block; R14 += 16
107
108 poly1305_blocks_armv6_mainloop_loaded:
109 MOVW R0>>26, g // start splitting the 128-bit block into 26-bit limbs: limb 1 ...
110 MOVW R1>>20, R11 // limb 2 ...
111 MOVW R2>>14, R12 // limb 3 ...
112 MOVW R14, 92(R13) // spill the advanced message pointer; R14 is reused below
113 MOVW R3>>8, R4 // limb 4 = top 24 bits of word 3
114 ORR R1<<6, g, g // ... limb 1 completed with low bits of word 1
115 ORR R2<<12, R11, R11 // ... limb 2 completed with low bits of word 2
116 ORR R3<<18, R12, R12 // ... limb 3 completed with low bits of word 3
117 BIC $0xfc000000, R0, R0 // limb 0 = low 26 bits of word 0
118 BIC $0xfc000000, g, g // truncate limb 1 to 26 bits
119 MOVW 84(R13), R3 // R3 = pad bit (word 3 has been fully consumed)
120 BIC $0xfc000000, R11, R11 // truncate limb 2 to 26 bits
121 BIC $0xfc000000, R12, R12 // truncate limb 3 to 26 bits
122 ADD R0, R5, R5 // h0 += limb 0
123 ADD g, R6, R6 // h1 += limb 1
124 ORR R3, R4, R4 // limb 4 |= pad bit
125 ADD R11, R7, R7 // h2 += limb 2
126 ADD $116, R13, R14 // R14 = address of the r0..r4 scratch copy
127 ADD R12, R8, R8 // h3 += limb 3
128 ADD R4, R9, R9 // h4 += limb 4
129 MOVM.IA (R14), [R0-R4] // R0-R4 = r0..r4; h0..h4 are in R5-R9
130 MULLU R4, R5, (R11, g) // d4 (R11:g) = h0*r4
131 MULLU R3, R5, (R14, R12) // d3 (R14:R12) = h0*r3
132 MULALU R3, R6, (R11, g) // d4 += h1*r3
133 MULALU R2, R6, (R14, R12) // d3 += h1*r2
134 MULALU R2, R7, (R11, g) // d4 += h2*r2
135 MULALU R1, R7, (R14, R12) // d3 += h2*r1
136 ADD R4<<2, R4, R4 // R4 = 5*r4
137 ADD R3<<2, R3, R3 // R3 = 5*r3
138 MULALU R1, R8, (R11, g) // d4 += h3*r1
139 MULALU R0, R8, (R14, R12) // d3 += h3*r0
140 MULALU R0, R9, (R11, g) // d4 += h4*r0
141 MULALU R4, R9, (R14, R12) // d3 += h4*(5*r4)
142 MOVW g, 76(R13) // spill d4 low
143 MOVW R11, 80(R13) // spill d4 high
144 MOVW R12, 68(R13) // spill d3 low
145 MOVW R14, 72(R13) // spill d3 high
146 MULLU R2, R5, (R11, g) // d2 (R11:g) = h0*r2
147 MULLU R1, R5, (R14, R12) // d1 (R14:R12) = h0*r1
148 MULALU R1, R6, (R11, g) // d2 += h1*r1
149 MULALU R0, R6, (R14, R12) // d1 += h1*r0
150 MULALU R0, R7, (R11, g) // d2 += h2*r0
151 MULALU R4, R7, (R14, R12) // d1 += h2*(5*r4)
152 ADD R2<<2, R2, R2 // R2 = 5*r2
153 ADD R1<<2, R1, R1 // R1 = 5*r1
154 MULALU R4, R8, (R11, g) // d2 += h3*(5*r4)
155 MULALU R3, R8, (R14, R12) // d1 += h3*(5*r3)
156 MULALU R3, R9, (R11, g) // d2 += h4*(5*r3)
157 MULALU R2, R9, (R14, R12) // d1 += h4*(5*r2)
158 MOVW g, 60(R13) // spill d2 low
159 MOVW R11, 64(R13) // spill d2 high
160 MOVW R12, 52(R13) // spill d1 low
161 MOVW R14, 56(R13) // spill d1 high
162 MULLU R0, R5, (R11, g) // d0 (R11:g) = h0*r0
163 MULALU R4, R6, (R11, g) // d0 += h1*(5*r4)
164 MULALU R3, R7, (R11, g) // d0 += h2*(5*r3)
165 MULALU R2, R8, (R11, g) // d0 += h3*(5*r2)
166 MULALU R1, R9, (R11, g) // d0 += h4*(5*r1)
167 ADD $52, R13, R0
168 MOVM.IA (R0), [R0-R7] // reload: d1 = R1:R0, d2 = R3:R2, d3 = R5:R4, d4 = R7:R6 (d0 stays in R11:g)
169 MOVW g>>26, R12 // carry out of d0 ...
170 MOVW R4>>26, R14 // carry out of d3 ...
171 ORR R11<<6, R12, R12 // ... R12 = d0 >> 26 (low 32 bits)
172 ORR R5<<6, R14, R14 // ... R14 = d3 >> 26 (low 32 bits)
173 BIC $0xfc000000, g, g // g = low 26 bits of d0
174 BIC $0xfc000000, R4, R4 // R4 = low 26 bits of d3
175 ADD.S R12, R0, R0 // d1 += carry from d0 (64-bit add, low word)
176 ADC $0, R1, R1 // (high word)
177 ADD.S R14, R6, R6 // d4 += carry from d3 (64-bit add, low word)
178 ADC $0, R7, R7 // (high word)
179 MOVW R0>>26, R12 // carry out of d1 ...
180 MOVW R6>>26, R14 // carry out of d4 ...
181 ORR R1<<6, R12, R12 // ... R12 = d1 >> 26
182 ORR R7<<6, R14, R14 // ... R14 = d4 >> 26
183 BIC $0xfc000000, R0, R0 // R0 = low 26 bits of d1
184 BIC $0xfc000000, R6, R6 // R6 = low 26 bits of d4
185 ADD R14<<2, R14, R14 // R14 = 5 * (carry out of d4): top-limb overflow wraps to limb 0 times 5
186 ADD.S R12, R2, R2 // d2 += carry from d1 (64-bit add, low word)
187 ADC $0, R3, R3 // (high word)
188 ADD R14, g, g // limb 0 += 5 * carry from d4
189 MOVW R2>>26, R12 // carry out of d2 ...
190 MOVW g>>26, R14 // R14 = carry out of limb 0
191 ORR R3<<6, R12, R12 // ... R12 = d2 >> 26
192 BIC $0xfc000000, g, R5 // R5 = new h0
193 BIC $0xfc000000, R2, R7 // R7 = new h2
194 ADD R12, R4, R4 // limb 3 += carry from d2
195 ADD R14, R0, R0 // limb 1 += carry from limb 0
196 MOVW R4>>26, R12 // R12 = carry out of limb 3
197 BIC $0xfc000000, R4, R8 // R8 = new h3
198 ADD R12, R6, R9 // R9 = new h4 = limb 4 + carry from limb 3
199 MOVW 96(R13), R12 // R12 = remaining length, including the block just processed
200 MOVW 92(R13), R14 // R14 = message pointer for the next block
201 MOVW R0, R6 // R6 = new h1
202 CMP $32, R12 // flags for BHS below: is there another whole block after this one?
203 SUB $16, R12, R12 // no .S suffix, so the CMP flags are preserved
204 MOVW R12, 96(R13) // spill the updated remaining length
205 BHS poly1305_blocks_armv6_mainloop // loop while at least 16 bytes remain
206
207 poly1305_blocks_armv6_done:
208 MOVW 88(R13), R12 // R12 = context pointer
209 MOVW R5, 20(R12) // store h0..h4 back to context[20..36]
210 MOVW R6, 24(R12)
211 MOVW R7, 28(R12)
212 MOVW R8, 32(R12)
213 MOVW R9, 36(R12)
214 ADD $48, R13, R0 // MOVM.DA reads downwards starting at 48(R13)
215 MOVM.DA (R0), [R4-R8, R14] // restore R4-R8 and the link register from 28(R13)..48(R13)
216 RET
217
// MOVHUP_UNALIGNED copies two bytes from (Rsrc) to (Rdst) one byte at a time
// using post-indexed accesses, so both Rsrc and Rdst end up advanced by 2.
// Neither address needs to be aligned; Rtmp is clobbered.
218 #define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
219 MOVBU.P 1(Rsrc), Rtmp; \
220 MOVBU.P Rtmp, 1(Rdst); \
221 MOVBU.P 1(Rsrc), Rtmp; \
222 MOVBU.P Rtmp, 1(Rdst)
223
// MOVWP_UNALIGNED copies four bytes from (Rsrc) to (Rdst) by expanding
// MOVHUP_UNALIGNED twice, so both Rsrc and Rdst end up advanced by 4.
// Rtmp is clobbered.
224 #define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
225 MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
226 MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
227
228 // func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
229 TEXT ·poly1305_auth_armv6(SB), $196-16
230 // The value 196, just above, is the sum of 64 (the size of the context
231 // structure) and 132 (the amount of stack needed).
232 //
233 // At this point, the stack pointer (R13) has been moved down. It
234 // points to the saved link register and there's 196 bytes of free
235 // space above it.
236 //
237 // The stack for this function looks like:
238 //
239 // +---------------------
240 // |
241 // | 64 bytes of context structure
242 // |
243 // +---------------------
244 // |
245 // | 112 bytes for poly1305_blocks_armv6
246 // |
247 // +---------------------
248 // | 16 bytes of final block, constructed at
249 // | poly1305_finish_ext_armv6_skip8
250 // +---------------------
251 // | four bytes of saved 'g'
252 // +---------------------
253 // | lr, saved by prelude <- R13 points here
254 // +---------------------
255 MOVW g, 4(R13) // g is used as a scratch register below and restored before RET
256
257 MOVW out+0(FP), R4 // R4 = out
258 MOVW m+4(FP), R5 // R5 = m
259 MOVW mlen+8(FP), R6 // R6 = mlen
260 MOVW key+12(FP), R7 // R7 = key
261
262 ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112
263 MOVW R7, R1 // R1 = key
264
265 // poly1305_init_ext_armv6 saves R4-R7 at 8(R13)..20(R13), i.e. in the
266 // 16 byte final-block area just above the saved 'g'. That's ok because
// nothing has been stored in that area yet.
267 BL poly1305_init_ext_armv6<>(SB)
268 BIC.S $15, R6, R2 // R2 = mlen rounded down to a multiple of 16; sets Z if there is no whole block
269 BEQ poly1305_auth_armv6_noblocks
270 ADD $136, R13, R0 // R0 = context
271 MOVW R5, R1 // R1 = start of the message
272 ADD R2, R5, R5 // R5 = first byte after the whole blocks
273 SUB R2, R6, R6 // R6 = number of trailing bytes (mlen & 15)
274 BL poly1305_blocks_armv6<>(SB)
275
276 poly1305_auth_armv6_noblocks:
277 ADD $136, R13, R0 // R0 = context
278 MOVW R5, R1 // R1 = pointer to the trailing bytes
279 MOVW R6, R2 // R2 = number of trailing bytes
280 MOVW R4, R3 // R3 = out
281
282 MOVW R0, R5 // keep copies in R5-R8: R5 = context,
283 MOVW R1, R6 // R6 = trailing-bytes pointer,
284 MOVW R2, R7 // R7 = trailing length,
285 MOVW R3, R8 // R8 = out
286 AND.S R2, R2, R2 // set Z if there are no trailing bytes
287 BEQ poly1305_finish_ext_armv6_noremaining
288 EOR R0, R0 // R0 = 0
289 ADD $8, R13, R9 // 8 = offset to 16 byte scratch space
290 MOVW R0, (R9) // zero the 16 byte final block
291 MOVW R0, 4(R9)
292 MOVW R0, 8(R9)
293 MOVW R0, 12(R9)
294 WORD $0xe3110003 // TST R1, #3 not working see issue 5921
295 BEQ poly1305_finish_ext_armv6_aligned
296 WORD $0xe3120008 // TST R2, #8 not working see issue 5921
297 BEQ poly1305_finish_ext_armv6_skip8
298 MOVWP_UNALIGNED(R1, R9, g) // unaligned source: copy 8 bytes bytewise; R1 and R9 advance
299 MOVWP_UNALIGNED(R1, R9, g)
300
301 poly1305_finish_ext_armv6_skip8:
302 WORD $0xe3120004 // TST $4, R2 not working see issue 5921
303 BEQ poly1305_finish_ext_armv6_skip4
304 MOVWP_UNALIGNED(R1, R9, g) // copy 4 bytes bytewise
305
306 poly1305_finish_ext_armv6_skip4:
307 WORD $0xe3120002 // TST $2, R2 not working see issue 5921
308 BEQ poly1305_finish_ext_armv6_skip2
309 MOVHUP_UNALIGNED(R1, R9, g) // copy 2 bytes bytewise
310 B poly1305_finish_ext_armv6_skip2
311
312 poly1305_finish_ext_armv6_aligned:
313 WORD $0xe3120008 // TST R2, #8 not working see issue 5921
314 BEQ poly1305_finish_ext_armv6_skip8_aligned
315 MOVM.IA.W (R1), [g-R11] // aligned source: copy 8 bytes as two words; R1 += 8
316 MOVM.IA.W [g-R11], (R9) // R9 += 8
317
318 poly1305_finish_ext_armv6_skip8_aligned:
319 WORD $0xe3120004 // TST $4, R2 not working see issue 5921
320 BEQ poly1305_finish_ext_armv6_skip4_aligned
321 MOVW.P 4(R1), g // copy 4 bytes as one word; R1 += 4
322 MOVW.P g, 4(R9) // R9 += 4
323
324 poly1305_finish_ext_armv6_skip4_aligned:
325 WORD $0xe3120002 // TST $2, R2 not working see issue 5921
326 BEQ poly1305_finish_ext_armv6_skip2
327 MOVHU.P 2(R1), g // copy 2 bytes as one halfword; R1 += 2
328 MOVH.P g, 2(R9) // R9 += 2
329
330 poly1305_finish_ext_armv6_skip2:
331 WORD $0xe3120001 // TST $1, R2 not working see issue 5921
332 BEQ poly1305_finish_ext_armv6_skip1
333 MOVBU.P 1(R1), g // copy the last odd byte; R1 += 1
334 MOVBU.P g, 1(R9) // R9 += 1
335
336 poly1305_finish_ext_armv6_skip1:
337 MOVW $1, R11
338 MOVBU R11, 0(R9) // append a single 0x01 byte after the copied message bytes
339 MOVW R11, 56(R5) // context[56] = 1, so poly1305_blocks_armv6 adds no 1<<24 pad bit to this block
340 MOVW R5, R0 // R0 = context
341 ADD $8, R13, R1 // R1 = the padded final block
342 MOVW $16, R2 // R2 = 16: exactly one block
343 BL poly1305_blocks_armv6<>(SB)
344
345 poly1305_finish_ext_armv6_noremaining:
346 MOVW 20(R5), R0 // R0-R4 = h0..h4 from context[20..36]
347 MOVW 24(R5), R1
348 MOVW 28(R5), R2
349 MOVW 32(R5), R3
350 MOVW 36(R5), R4
351 MOVW R4>>26, R12 // full carry pass: R12 = overflow of h4 above 26 bits
352 BIC $0xfc000000, R4, R4
353 ADD R12<<2, R12, R12 // R12 *= 5 (top-limb overflow wraps to limb 0 times 5)
354 ADD R12, R0, R0
355 MOVW R0>>26, R12 // carry h0 -> h1
356 BIC $0xfc000000, R0, R0
357 ADD R12, R1, R1
358 MOVW R1>>26, R12 // carry h1 -> h2
359 BIC $0xfc000000, R1, R1
360 ADD R12, R2, R2
361 MOVW R2>>26, R12 // carry h2 -> h3
362 BIC $0xfc000000, R2, R2
363 ADD R12, R3, R3
364 MOVW R3>>26, R12 // carry h3 -> h4
365 BIC $0xfc000000, R3, R3
366 ADD R12, R4, R4
367 ADD $5, R0, R6 // compute t = h + 5 - 2^130 in (R6, R7, g, R11, R9): limb 0 + 5
368 MOVW R6>>26, R12
369 BIC $0xfc000000, R6, R6
370 ADD R12, R1, R7 // t1 = h1 + carry
371 MOVW R7>>26, R12
372 BIC $0xfc000000, R7, R7
373 ADD R12, R2, g // t2 = h2 + carry
374 MOVW g>>26, R12
375 BIC $0xfc000000, g, g
376 ADD R12, R3, R11 // t3 = h3 + carry
377 MOVW $-(1<<26), R12 // subtract 2^130, which is 1<<26 in limb 4
378 ADD R11>>26, R12, R12 // R12 = carry out of t3 - (1<<26)
379 BIC $0xfc000000, R11, R11
380 ADD R12, R4, R9 // t4 = h4 + carry - (1<<26); bit 31 is set iff the subtraction borrowed
381 MOVW R9>>31, R12 // R12 = 1 if it borrowed, else 0
382 SUB $1, R12 // R12 = 0 if it borrowed (keep h), all ones otherwise (keep t)
383 AND R12, R6, R6 // t &= mask
384 AND R12, R7, R7
385 AND R12, g, g
386 AND R12, R11, R11
387 AND R12, R9, R9
388 MVN R12, R12 // invert the mask
389 AND R12, R0, R0 // h &= ~mask
390 AND R12, R1, R1
391 AND R12, R2, R2
392 AND R12, R3, R3
393 AND R12, R4, R4
394 ORR R6, R0, R0 // branch-free select: h = h | t (one side is all zero)
395 ORR R7, R1, R1
396 ORR g, R2, R2
397 ORR R11, R3, R3
398 ORR R9, R4, R4
399 ORR R1<<26, R0, R0 // repack the five 26-bit limbs into four 32-bit words R0-R3
400 MOVW R1>>6, R1
401 ORR R2<<20, R1, R1
402 MOVW R2>>12, R2
403 ORR R3<<14, R2, R2
404 MOVW R3>>18, R3
405 ORR R4<<8, R3, R3
406 MOVW 40(R5), R6 // R6, R7, g, R11 = context[40..52] (key bytes 16-31, stored by poly1305_init_ext_armv6)
407 MOVW 44(R5), R7
408 MOVW 48(R5), g
409 MOVW 52(R5), R11
410 ADD.S R6, R0, R0 // 128-bit add with carry propagation between words
411 ADC.S R7, R1, R1
412 ADC.S g, R2, R2
413 ADC.S R11, R3, R3 // the carry out of the top word is not used
414 MOVM.IA [R0-R3], (R8) // write the 16-byte result to out
415 MOVW R5, R12 // R12 = context (R5 is zeroed below)
416 EOR R0, R0, R0 // zero R0-R7
417 EOR R1, R1, R1
418 EOR R2, R2, R2
419 EOR R3, R3, R3
420 EOR R4, R4, R4
421 EOR R5, R5, R5
422 EOR R6, R6, R6
423 EOR R7, R7, R7
424 MOVM.IA.W [R0-R7], (R12) // overwrite context[0..31] with zeros; R12 += 32
425 MOVM.IA [R0-R7], (R12) // overwrite context[32..63] with zeros
426 MOVW 4(R13), g // restore g saved at entry
427 RET
View as plain text