// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91e5c39ca79126a4a876d5d8ff

// There are some differences between the CRYPTOGAMS code and this one. The
// round loop for "_int" is not the same as the original. Some adjustments were
// necessary because there are fewer vector registers available. For example,
// some X variables (r12, r13, r14, and r15) share the register used by the
// counter. The original code uses ctr to name the counter. Here we use CNT
// because the Go assembler reserves CTR as the counter register name.
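
// The 64-byte ChaCha20 state is a 4x4 matrix of uint32 words: one row of
// constants, two rows of key, and one row of counter and nonce. A minimal Go
// sketch of the layout that the registers below mirror (illustrative only,
// not code from this package):
//
//	func initialState(key [8]uint32, ctr [4]uint32) (s [16]uint32) {
//		s[0], s[1], s[2], s[3] = 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
//		copy(s[4:12], key[:])  // eight little-endian key words
//		copy(s[12:16], ctr[:]) // 32-bit block counter, then the 96-bit nonce
//		return
//	}
//
// The scalar GPRs X0-X15 hold one such state, and each vector row set
// (A/B/C/D) holds one state per keystream block.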

// +build ppc64le,!gccgo,!appengine

#include "textflag.h"

#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7

#define TEMP R8

#define X0 R11
#define X1 R12
#define X2 R14
#define X3 R15
#define X4 R16
#define X5 R17
#define X6 R18
#define X7 R19
#define X8 R20
#define X9 R21
#define X10 R22
#define X11 R23
#define X12 R24
#define X13 R25
#define X14 R26
#define X15 R27

#define CON0 X0
#define CON1 X1
#define CON2 X2
#define CON3 X3

#define KEY0 X4
#define KEY1 X5
#define KEY2 X6
#define KEY3 X7
#define KEY4 X8
#define KEY5 X9
#define KEY6 X10
#define KEY7 X11

#define CNT0 X12
#define CNT1 X13
#define CNT2 X14
#define CNT3 X15

#define TMP0 R9
#define TMP1 R10
#define TMP2 R28
#define TMP3 R29

#define CONSTS R8

#define A0 V0
#define B0 V1
#define C0 V2
#define D0 V3
#define A1 V4
#define B1 V5
#define C1 V6
#define D1 V7
#define A2 V8
#define B2 V9
#define C2 V10
#define D2 V11
#define T0 V12
#define T1 V13
#define T2 V14

#define K0 V15
#define K1 V16
#define K2 V17
#define K3 V18
#define K4 V19
#define K5 V20

#define FOUR V21
#define SIXTEEN V22
#define TWENTY4 V23
#define TWENTY V24
#define TWELVE V25
#define TWENTY5 V26
#define SEVEN V27

#define INPPERM V28
#define OUTPERM V29
#define OUTMASK V30

#define DD0 V31
#define DD1 SEVEN
#define DD2 T0
#define DD3 T1
#define DD4 T2

DATA ·consts+0x00(SB)/8, $0x3320646e61707865
DATA ·consts+0x08(SB)/8, $0x6b20657479622d32
DATA ·consts+0x10(SB)/8, $0x0000000000000001
DATA ·consts+0x18(SB)/8, $0x0000000000000000
DATA ·consts+0x20(SB)/8, $0x0000000000000004
DATA ·consts+0x28(SB)/8, $0x0000000000000000
DATA ·consts+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA ·consts+0x38(SB)/8, $0x0203000106070405
DATA ·consts+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA ·consts+0x48(SB)/8, $0x0102030005060704
GLOBL ·consts(SB), RODATA, $80
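
// The first 16 bytes of ·consts are the ChaCha20 constant "expand 32-byte k"
// in little-endian order. A quick way to check this (a sketch using only the
// standard library, not code from this package):
//
//	func dumpSigma() {
//		b := make([]byte, 16)
//		binary.LittleEndian.PutUint64(b[0:], 0x3320646e61707865)
//		binary.LittleEndian.PutUint64(b[8:], 0x6b20657479622d32)
//		fmt.Printf("%s\n", b) // prints "expand 32-byte k"
//	}
//
// The next two quadwords hold the per-block counter increments {1,0,0,0} and
// {4,0,0,0}; the final two quadwords are VPERM byte-shuffle tables used below
// (as SIXTEEN and TWENTY4) for the 16-bit and 24-bit rotations.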

//func chaCha20_ctr32_vmx(out, inp *byte, len int, key *[32]byte, counter *[16]byte)
TEXT ·chaCha20_ctr32_vmx(SB),NOSPLIT|NOFRAME,$0
	// Load the arguments into registers.
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT
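
	// A hedged sketch of the Go side of this routine, derived from the
	// signature comment above (the wrapper name is illustrative; the real
	// declaration lives in this package's ppc64le Go file):
	//
	//	//go:noescape
	//	func chaCha20_ctr32_vmx(out, inp *byte, len int, key *[32]byte, counter *[16]byte)
	//
	//	// xorKeyStreamVMX is a hypothetical wrapper showing a call.
	//	func xorKeyStreamVMX(dst, src []byte, key *[32]byte, counter *[16]byte) {
	//		chaCha20_ctr32_vmx(&dst[0], &src[0], len(src), key, counter)
	//	}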

	MOVD $·consts(SB), CONSTS // point to consts addr

	MOVD $16, X0
	MOVD $32, X1
	MOVD $48, X2
	MOVD $64, X3
	MOVD $31, X4
	MOVD $15, X5

	// Load key
	LVX (KEY)(R0), K1
	LVSR (KEY)(R0), T0
	LVX (KEY)(X0), K2
	LVX (KEY)(X4), DD0

	// Load counter
	LVX (CNT)(R0), K3
	LVSR (CNT)(R0), T1
	LVX (CNT)(X5), DD1

	// Load constants
	LVX (CONSTS)(R0), K0
	LVX (CONSTS)(X0), K5
	LVX (CONSTS)(X1), FOUR
	LVX (CONSTS)(X2), SIXTEEN
	LVX (CONSTS)(X3), TWENTY4

	// Align key and counter
	VPERM K2, K1, T0, K1
	VPERM DD0, K2, T0, K2
	VPERM DD1, K3, T1, K3
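
	// LVX ignores the low four address bits, so the loads above fetch the
	// aligned quadwords covering the (possibly unaligned) key and counter,
	// and LVSR yields the permutation that shifts them back into place. A
	// byte-level Go sketch of what each VPERM merge achieves (ignoring the
	// endian details encoded in the permute vector; illustrative only):
	//
	//	func loadUnaligned16(mem []byte, p int) (out [16]byte) {
	//		var lo, hi [16]byte
	//		copy(lo[:], mem[p&^15:p&^15+16])    // LVX: aligned quadword at or below p
	//		copy(hi[:], mem[p&^15+16:p&^15+32]) // LVX: next aligned quadword
	//		sh := p & 15                        // LVSR encodes this shift
	//		for i := range out {
	//			if sh+i < 16 {
	//				out[i] = lo[sh+i]
	//			} else {
	//				out[i] = hi[sh+i-16]
	//			}
	//		}
	//		return // out[i] == mem[p+i]
	//	}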

	// Load counter to GPR
	MOVWZ 0(CNT), CNT0
	MOVWZ 4(CNT), CNT1
	MOVWZ 8(CNT), CNT2
	MOVWZ 12(CNT), CNT3

	// Adjust vectors for the initial state
	VADDUWM K3, K5, K3
	VADDUWM K3, K5, K4
	VADDUWM K4, K5, K5

	// Synthesized constants
	VSPLTISW $-12, TWENTY
	VSPLTISW $12, TWELVE
	VSPLTISW $-7, TWENTY5
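
	// VSPLTISW sign-extends its 5-bit immediate, and VRLW only consumes the
	// low five bits of each rotate amount, so splat(-12) rotates by
	// -12 mod 32 = 20 and splat(-7) by -7 mod 32 = 25, matching the register
	// names. In Go terms: uint32(int32(-12))&31 == 20 and uint32(int32(-7))&31 == 25.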

	VXOR T0, T0, T0
	VSPLTISW $-1, OUTMASK
	LVSR (INP)(R0), INPPERM
	LVSL (OUT)(R0), OUTPERM
	VPERM OUTMASK, T0, OUTPERM, OUTMASK
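
	// OUTPERM/OUTMASK set up the misaligned-store path used after the rounds:
	// permuting the all-ones vector against zero (T0) by OUTPERM leaves 0xff
	// bytes exactly where each output vector's bytes belong within an aligned
	// quadword, so VSEL can blend adjacent "pre-misaligned" vectors before
	// the aligned STVX stores near the end of the function.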

loop_outer_vmx:
	// Load constant
	MOVD $0x61707865, CON0
	MOVD $0x3320646e, CON1
	MOVD $0x79622d32, CON2
	MOVD $0x6b206574, CON3

	VOR K0, K0, A0
	VOR K0, K0, A1
	VOR K0, K0, A2
	VOR K1, K1, B0

	MOVD $10, TEMP

	// Load key to GPR
	MOVWZ 0(KEY), X4
	MOVWZ 4(KEY), X5
	MOVWZ 8(KEY), X6
	MOVWZ 12(KEY), X7
	VOR K1, K1, B1
	VOR K1, K1, B2
	MOVWZ 16(KEY), X8
	MOVWZ 0(CNT), X12
	MOVWZ 20(KEY), X9
	MOVWZ 4(CNT), X13
	VOR K2, K2, C0
	VOR K2, K2, C1
	MOVWZ 24(KEY), X10
	MOVWZ 8(CNT), X14
	VOR K2, K2, C2
	VOR K3, K3, D0
	MOVWZ 28(KEY), X11
	MOVWZ 12(CNT), X15
	VOR K4, K4, D1
	VOR K5, K5, D2

	MOVD X4, TMP0
	MOVD X5, TMP1
	MOVD X6, TMP2
	MOVD X7, TMP3
	VSPLTISW $7, SEVEN

	MOVD TEMP, CTR

loop_vmx:
	// CRYPTOGAMS generates this loop with a Perl macro. The Go assembler has
	// no comparable macro facility, so the expanded form of the macro is used
	// here to preserve the efficiency of the original.
	// Each pass of this loop advances three keystream blocks with VMX
	// instructions and, interleaved with them, a fourth block with scalar
	// instructions.
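	// One pass of this loop is a ChaCha20 double round over all four blocks.
	// A scalar Go sketch of the double round on one 16-word state (illustrative
	// only; the vector code uses VSLDOI lane rotations instead of diagonal
	// indexing):
	//
	//	func doubleRound(x *[16]uint32) {
	//		qr := func(a, b, c, d *uint32) {
	//			*a += *b; *d ^= *a; *d = *d<<16 | *d>>16
	//			*c += *d; *b ^= *c; *b = *b<<12 | *b>>20
	//			*a += *b; *d ^= *a; *d = *d<<8 | *d>>24
	//			*c += *d; *b ^= *c; *b = *b<<7 | *b>>25
	//		}
	//		qr(&x[0], &x[4], &x[8], &x[12]) // column rounds
	//		qr(&x[1], &x[5], &x[9], &x[13])
	//		qr(&x[2], &x[6], &x[10], &x[14])
	//		qr(&x[3], &x[7], &x[11], &x[15])
	//		qr(&x[0], &x[5], &x[10], &x[15]) // diagonal rounds
	//		qr(&x[1], &x[6], &x[11], &x[12])
	//		qr(&x[2], &x[7], &x[8], &x[13])
	//		qr(&x[3], &x[4], &x[9], &x[14])
	//	}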
	ADD X4, X0, X0
	ADD X5, X1, X1
	VADDUWM A0, B0, A0
	VADDUWM A1, B1, A1
	ADD X6, X2, X2
	ADD X7, X3, X3
	VADDUWM A2, B2, A2
	VXOR D0, A0, D0
	XOR X0, X12, X12
	XOR X1, X13, X13
	VXOR D1, A1, D1
	VXOR D2, A2, D2
	XOR X2, X14, X14
	XOR X3, X15, X15
	VPERM D0, D0, SIXTEEN, D0
	VPERM D1, D1, SIXTEEN, D1
	ROTLW $16, X12, X12
	ROTLW $16, X13, X13
	VPERM D2, D2, SIXTEEN, D2
	VADDUWM C0, D0, C0
	ROTLW $16, X14, X14
	ROTLW $16, X15, X15
	VADDUWM C1, D1, C1
	VADDUWM C2, D2, C2
	ADD X12, X8, X8
	ADD X13, X9, X9
	VXOR B0, C0, T0
	VXOR B1, C1, T1
	ADD X14, X10, X10
	ADD X15, X11, X11
	VXOR B2, C2, T2
	VRLW T0, TWELVE, B0
	XOR X8, X4, X4
	XOR X9, X5, X5
	VRLW T1, TWELVE, B1
	VRLW T2, TWELVE, B2
	XOR X10, X6, X6
	XOR X11, X7, X7
	VADDUWM A0, B0, A0
	VADDUWM A1, B1, A1
	ROTLW $12, X4, X4
	ROTLW $12, X5, X5
	VADDUWM A2, B2, A2
	VXOR D0, A0, D0
	ROTLW $12, X6, X6
	ROTLW $12, X7, X7
	VXOR D1, A1, D1
	VXOR D2, A2, D2
	ADD X4, X0, X0
	ADD X5, X1, X1
	VPERM D0, D0, TWENTY4, D0
	VPERM D1, D1, TWENTY4, D1
	ADD X6, X2, X2
	ADD X7, X3, X3
	VPERM D2, D2, TWENTY4, D2
	VADDUWM C0, D0, C0
	XOR X0, X12, X12
	XOR X1, X13, X13
	VADDUWM C1, D1, C1
	VADDUWM C2, D2, C2
	XOR X2, X14, X14
	XOR X3, X15, X15
	VXOR B0, C0, T0
	VXOR B1, C1, T1
	ROTLW $8, X12, X12
	ROTLW $8, X13, X13
	VXOR B2, C2, T2
	VRLW T0, SEVEN, B0
	ROTLW $8, X14, X14
	ROTLW $8, X15, X15
	VRLW T1, SEVEN, B1
	VRLW T2, SEVEN, B2
	ADD X12, X8, X8
	ADD X13, X9, X9
	VSLDOI $8, C0, C0, C0
	VSLDOI $8, C1, C1, C1
	ADD X14, X10, X10
	ADD X15, X11, X11
	VSLDOI $8, C2, C2, C2
	VSLDOI $12, B0, B0, B0
	XOR X8, X4, X4
	XOR X9, X5, X5
	VSLDOI $12, B1, B1, B1
	VSLDOI $12, B2, B2, B2
	XOR X10, X6, X6
	XOR X11, X7, X7
	VSLDOI $4, D0, D0, D0
	VSLDOI $4, D1, D1, D1
	ROTLW $7, X4, X4
	ROTLW $7, X5, X5
	VSLDOI $4, D2, D2, D2
	VADDUWM A0, B0, A0
	ROTLW $7, X6, X6
	ROTLW $7, X7, X7
	VADDUWM A1, B1, A1
	VADDUWM A2, B2, A2
	ADD X5, X0, X0
	ADD X6, X1, X1
	VXOR D0, A0, D0
	VXOR D1, A1, D1
	ADD X7, X2, X2
	ADD X4, X3, X3
	VXOR D2, A2, D2
	VPERM D0, D0, SIXTEEN, D0
	XOR X0, X15, X15
	XOR X1, X12, X12
	VPERM D1, D1, SIXTEEN, D1
	VPERM D2, D2, SIXTEEN, D2
	XOR X2, X13, X13
	XOR X3, X14, X14
	VADDUWM C0, D0, C0
	VADDUWM C1, D1, C1
	ROTLW $16, X15, X15
	ROTLW $16, X12, X12
	VADDUWM C2, D2, C2
	VXOR B0, C0, T0
	ROTLW $16, X13, X13
	ROTLW $16, X14, X14
	VXOR B1, C1, T1
	VXOR B2, C2, T2
	ADD X15, X10, X10
	ADD X12, X11, X11
	VRLW T0, TWELVE, B0
	VRLW T1, TWELVE, B1
	ADD X13, X8, X8
	ADD X14, X9, X9
	VRLW T2, TWELVE, B2
	VADDUWM A0, B0, A0
	XOR X10, X5, X5
	XOR X11, X6, X6
	VADDUWM A1, B1, A1
	VADDUWM A2, B2, A2
	XOR X8, X7, X7
	XOR X9, X4, X4
	VXOR D0, A0, D0
	VXOR D1, A1, D1
	ROTLW $12, X5, X5
	ROTLW $12, X6, X6
	VXOR D2, A2, D2
	VPERM D0, D0, TWENTY4, D0
	ROTLW $12, X7, X7
	ROTLW $12, X4, X4
	VPERM D1, D1, TWENTY4, D1
	VPERM D2, D2, TWENTY4, D2
	ADD X5, X0, X0
	ADD X6, X1, X1
	VADDUWM C0, D0, C0
	VADDUWM C1, D1, C1
	ADD X7, X2, X2
	ADD X4, X3, X3
	VADDUWM C2, D2, C2
	VXOR B0, C0, T0
	XOR X0, X15, X15
	XOR X1, X12, X12
	VXOR B1, C1, T1
	VXOR B2, C2, T2
	XOR X2, X13, X13
	XOR X3, X14, X14
	VRLW T0, SEVEN, B0
	VRLW T1, SEVEN, B1
	ROTLW $8, X15, X15
	ROTLW $8, X12, X12
	VRLW T2, SEVEN, B2
	VSLDOI $8, C0, C0, C0
	ROTLW $8, X13, X13
	ROTLW $8, X14, X14
	VSLDOI $8, C1, C1, C1
	VSLDOI $8, C2, C2, C2
	ADD X15, X10, X10
	ADD X12, X11, X11
	VSLDOI $4, B0, B0, B0
	VSLDOI $4, B1, B1, B1
	ADD X13, X8, X8
	ADD X14, X9, X9
	VSLDOI $4, B2, B2, B2
	VSLDOI $12, D0, D0, D0
	XOR X10, X5, X5
	XOR X11, X6, X6
	VSLDOI $12, D1, D1, D1
	VSLDOI $12, D2, D2, D2
	XOR X8, X7, X7
	XOR X9, X4, X4
	ROTLW $7, X5, X5
	ROTLW $7, X6, X6
	ROTLW $7, X7, X7
	ROTLW $7, X4, X4
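	// BC 0x10, 0, target is bdnz: decrement CTR and branch while it is
	// nonzero, so the double round above runs 10 times (20 rounds total).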
	BC 0x10, 0, loop_vmx

	SUB $256, LEN, LEN

	// Accumulate key block
	ADD $0x61707865, X0, X0
	ADD $0x3320646e, X1, X1
	ADD $0x79622d32, X2, X2
	ADD $0x6b206574, X3, X3
	ADD TMP0, X4, X4
	ADD TMP1, X5, X5
	ADD TMP2, X6, X6
	ADD TMP3, X7, X7
	MOVWZ 16(KEY), TMP0
	MOVWZ 20(KEY), TMP1
	MOVWZ 24(KEY), TMP2
	MOVWZ 28(KEY), TMP3
	ADD TMP0, X8, X8
	ADD TMP1, X9, X9
	ADD TMP2, X10, X10
	ADD TMP3, X11, X11

	MOVWZ 12(CNT), TMP0
	MOVWZ 8(CNT), TMP1
	MOVWZ 4(CNT), TMP2
	MOVWZ 0(CNT), TEMP
	ADD TMP0, X15, X15
	ADD TMP1, X14, X14
	ADD TMP2, X13, X13
	ADD TEMP, X12, X12

	// Accumulate key block
	VADDUWM A0, K0, A0
	VADDUWM A1, K0, A1
	VADDUWM A2, K0, A2
	VADDUWM B0, K1, B0
	VADDUWM B1, K1, B1
	VADDUWM B2, K1, B2
	VADDUWM C0, K2, C0
	VADDUWM C1, K2, C1
	VADDUWM C2, K2, C2
	VADDUWM D0, K3, D0
	VADDUWM D1, K4, D1
	VADDUWM D2, K5, D2

	// Increment counter
	ADD $4, TEMP, TEMP
	MOVW TEMP, 0(CNT)
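	// Each pass of loop_outer_vmx consumes four 64-byte blocks (one scalar,
	// three vector), so the 32-bit block counter advances by four, both in
	// memory above and in the vector counters K3/K4/K5 below.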

	VADDUWM K3, FOUR, K3
	VADDUWM K4, FOUR, K4
	VADDUWM K5, FOUR, K5

	// XOR the input slice (INP) with the keystream, which is stored in GPRs (X0-X15).

	// Load input (aligned or not)
	MOVWZ 0(INP), TMP0
	MOVWZ 4(INP), TMP1
	MOVWZ 8(INP), TMP2
	MOVWZ 12(INP), TMP3

	// XOR with input
	XOR TMP0, X0, X0
	XOR TMP1, X1, X1
	XOR TMP2, X2, X2
	XOR TMP3, X3, X3
	MOVWZ 16(INP), TMP0
	MOVWZ 20(INP), TMP1
	MOVWZ 24(INP), TMP2
	MOVWZ 28(INP), TMP3
	XOR TMP0, X4, X4
	XOR TMP1, X5, X5
	XOR TMP2, X6, X6
	XOR TMP3, X7, X7
	MOVWZ 32(INP), TMP0
	MOVWZ 36(INP), TMP1
	MOVWZ 40(INP), TMP2
	MOVWZ 44(INP), TMP3
	XOR TMP0, X8, X8
	XOR TMP1, X9, X9
	XOR TMP2, X10, X10
	XOR TMP3, X11, X11
	MOVWZ 48(INP), TMP0
	MOVWZ 52(INP), TMP1
	MOVWZ 56(INP), TMP2
	MOVWZ 60(INP), TMP3
	XOR TMP0, X12, X12
	XOR TMP1, X13, X13
	XOR TMP2, X14, X14
	XOR TMP3, X15, X15

	// Store output (aligned or not)
	MOVW X0, 0(OUT)
	MOVW X1, 4(OUT)
	MOVW X2, 8(OUT)
	MOVW X3, 12(OUT)

	ADD $64, INP, INP // INP points to the end of the slice for the alignment code below

	MOVW X4, 16(OUT)
	MOVD $16, TMP0
	MOVW X5, 20(OUT)
	MOVD $32, TMP1
	MOVW X6, 24(OUT)
	MOVD $48, TMP2
	MOVW X7, 28(OUT)
	MOVD $64, TMP3
	MOVW X8, 32(OUT)
	MOVW X9, 36(OUT)
	MOVW X10, 40(OUT)
	MOVW X11, 44(OUT)
	MOVW X12, 48(OUT)
	MOVW X13, 52(OUT)
	MOVW X14, 56(OUT)
	MOVW X15, 60(OUT)
	ADD $64, OUT, OUT

	// Load input
	LVX (INP)(R0), DD0
	LVX (INP)(TMP0), DD1
	LVX (INP)(TMP1), DD2
	LVX (INP)(TMP2), DD3
	LVX (INP)(TMP3), DD4
	ADD $64, INP, INP

	VPERM DD1, DD0, INPPERM, DD0 // Align input
	VPERM DD2, DD1, INPPERM, DD1
	VPERM DD3, DD2, INPPERM, DD2
	VPERM DD4, DD3, INPPERM, DD3
	VXOR A0, DD0, A0 // XOR with input
	VXOR B0, DD1, B0
	LVX (INP)(TMP0), DD1 // Keep loading input
	VXOR C0, DD2, C0
	LVX (INP)(TMP1), DD2
	VXOR D0, DD3, D0
	LVX (INP)(TMP2), DD3
	LVX (INP)(TMP3), DD0
	ADD $64, INP, INP
	MOVD $63, TMP3 // 63 is not a typo
	VPERM A0, A0, OUTPERM, A0
	VPERM B0, B0, OUTPERM, B0
	VPERM C0, C0, OUTPERM, C0
	VPERM D0, D0, OUTPERM, D0

	VPERM DD1, DD4, INPPERM, DD4 // Align input
	VPERM DD2, DD1, INPPERM, DD1
	VPERM DD3, DD2, INPPERM, DD2
	VPERM DD0, DD3, INPPERM, DD3
	VXOR A1, DD4, A1
	VXOR B1, DD1, B1
	LVX (INP)(TMP0), DD1 // Keep loading
	VXOR C1, DD2, C1
	LVX (INP)(TMP1), DD2
	VXOR D1, DD3, D1
	LVX (INP)(TMP2), DD3

	// Note that the LVX address is always rounded down to the nearest 16-byte
	// boundary, and that it always points to at most 15 bytes beyond the end of
	// the slice, so we cannot cross a page boundary.
	LVX (INP)(TMP3), DD4 // Redundant in aligned case.
	ADD $64, INP, INP
	VPERM A1, A1, OUTPERM, A1 // Pre-misalign output
	VPERM B1, B1, OUTPERM, B1
	VPERM C1, C1, OUTPERM, C1
	VPERM D1, D1, OUTPERM, D1

	VPERM DD1, DD0, INPPERM, DD0 // Align input
	VPERM DD2, DD1, INPPERM, DD1
	VPERM DD3, DD2, INPPERM, DD2
	VPERM DD4, DD3, INPPERM, DD3
	VXOR A2, DD0, A2
	VXOR B2, DD1, B2
	VXOR C2, DD2, C2
	VXOR D2, DD3, D2
	VPERM A2, A2, OUTPERM, A2
	VPERM B2, B2, OUTPERM, B2
	VPERM C2, C2, OUTPERM, C2
	VPERM D2, D2, OUTPERM, D2
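
	// Store strategy for a possibly misaligned OUT: each result vector was
	// rotated to the output alignment above ("pre-misaligned"), adjacent
	// vectors are blended with VSEL through OUTMASK, and the interior is
	// written with aligned STVX stores; the ragged head and tail bytes are
	// stored one at a time with STVEBX below.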

	ANDCC $15, OUT, X1 // Is out aligned?
	MOVD OUT, X0

	VSEL A0, B0, OUTMASK, DD0 // Collect pre-misaligned output
	VSEL B0, C0, OUTMASK, DD1
	VSEL C0, D0, OUTMASK, DD2
	VSEL D0, A1, OUTMASK, DD3
	VSEL A1, B1, OUTMASK, B0
	VSEL B1, C1, OUTMASK, C0
	VSEL C1, D1, OUTMASK, D0
	VSEL D1, A2, OUTMASK, A1
	VSEL A2, B2, OUTMASK, B1
	VSEL B2, C2, OUTMASK, C1
	VSEL C2, D2, OUTMASK, D1

	STVX DD0, (OUT+TMP0)
	STVX DD1, (OUT+TMP1)
	STVX DD2, (OUT+TMP2)
	ADD $64, OUT, OUT
	STVX DD3, (OUT+R0)
	STVX B0, (OUT+TMP0)
	STVX C0, (OUT+TMP1)
	STVX D0, (OUT+TMP2)
	ADD $64, OUT, OUT
	STVX A1, (OUT+R0)
	STVX B1, (OUT+TMP0)
	STVX C1, (OUT+TMP1)
	STVX D1, (OUT+TMP2)
	ADD $64, OUT, OUT

	BEQ aligned_vmx

	SUB X1, OUT, X2 // in misaligned case edges
	MOVD $0, X3 // are written byte-by-byte

unaligned_tail_vmx:
	STVEBX D2, (X2+X3)
	ADD $1, X3, X3
	CMPW X3, X1
	BNE unaligned_tail_vmx
	SUB X1, X0, X2

unaligned_head_vmx:
	STVEBX A0, (X2+X1)
	CMPW X1, $15
	ADD $1, X1, X1
	BNE unaligned_head_vmx

	CMPU LEN, $255 // done with 256-byte block yet?
	BGT loop_outer_vmx

	JMP done_vmx

aligned_vmx:
	STVX A0, (X0+R0)
	CMPU LEN, $255 // done with 256-byte block yet?
	BGT loop_outer_vmx

done_vmx:
	RET