src/pkg/hash/crc32/crc32_ppc64le.s
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.
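//
// The file provides two entry points: ppc64SlicingUpdateBy8, a
// table-driven slicing-by-8 fallback, and vectorCrc32, which folds
// 128 bytes per iteration with VPMSUMD (carry-less multiplication)
// and finishes with a Barrett reduction.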

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22

#define const1 V24
#define const2 V25

#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29

#define MAX_SIZE 32*1024
#define REFLECT

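// ppc64SlicingUpdateBy8 implements slicing-by-8: each iteration folds
// eight bytes of input into the CRC using one lookup in each of eight
// 256-entry tables spaced 1024 bytes apart. A rough Go sketch of the
// inner loop below (le32 and the tab layout are illustrative names,
// not the package's actual identifiers):
//
//	crc ^= le32(p[0:4]) // XOR in the next four input bytes
//	crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
//		tab[4][crc>>24] ^ tab[5][crc>>16&0xff] ^
//		tab[6][crc>>8&0xff] ^ tab[7][crc&0xff]
//	p = p[8:]
//
// Any remaining bytes (and inputs shorter than 16 bytes) are handled
// one at a time at the short label.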
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ crc+0(FP), R3   // incoming crc
	MOVD table8+8(FP), R4 // *Table
	MOVD p+16(FP), R5
	MOVD p_len+24(FP), R6 // p len

	CMP $0,R6 // len == 0?
	BNE start
	MOVW R3,ret+40(FP) // return crc
	RET

start:
	NOR R3,R3,R7  // ^crc
	MOVWZ R7,R7   // 32 bits
	CMP R6,$16
	MOVD R6,CTR
	BLT short
	SRAD $3,R6,R8 // 8 byte chunks
	MOVD R8,CTR

loop:
	MOVWZ 0(R5),R8 // bytes 0-3 of p (little endian)
	MOVWZ 4(R5),R9 // bytes 4-7 of p
	MOVD R4,R10    // &tab[0]
	XOR R7,R8,R7   // crc ^= byte[0:3]
	RLDICL $40,R9,$56,R17 // p[7]
	SLD $2,R17,R17        // p[7]*4
	RLDICL $40,R7,$56,R8  // crc>>24
	ADD R17,R10,R17       // &tab[0][p[7]]
	SLD $2,R8,R8          // crc>>24*4
	RLDICL $48,R9,$56,R18 // p[6]
	SLD $2,R18,R18        // p[6]*4
	ADD $1024,R10,R10     // tab[1]
	MOVWZ 0(R17),R21      // tab[0][p[7]]
	RLDICL $56,R9,$56,R19 // p[5]
	ADD R10,R18,R18       // &tab[1][p[6]]
	SLD $2,R19,R19        // p[5]*4
	MOVWZ 0(R18),R22      // tab[1][p[6]]
	ADD $1024,R10,R10     // tab[2]
	XOR R21,R22,R21       // xor done R22
	ADD R19,R10,R19       // &tab[2][p[5]]
	ANDCC $255,R9,R20     // p[4]
	SLD $2,R20,R20        // p[4]*4
	MOVWZ 0(R19),R23      // tab[2][p[5]]
	ADD $1024,R10,R10     // &tab[3]
	ADD R20,R10,R20       // &tab[3][p[4]]
	XOR R21,R23,R21       // xor done R23
	ADD $1024,R10,R10     // &tab[4]
	MOVWZ 0(R20),R24      // tab[3][p[4]]
	ADD R10,R8,R23        // &tab[4][crc>>24]
	XOR R21,R24,R21       // xor done R24
	MOVWZ 0(R23),R25      // tab[4][crc>>24]
	RLDICL $48,R7,$56,R24 // crc>>16&0xFF
	XOR R21,R25,R21       // xor done R25
	ADD $1024,R10,R10     // &tab[5]
	SLD $2,R24,R24        // crc>>16&0xFF*4
	ADD R24,R10,R24       // &tab[5][crc>>16&0xFF]
	MOVWZ 0(R24),R26      // tab[5][crc>>16&0xFF]
	XOR R21,R26,R21       // xor done R26
	RLDICL $56,R7,$56,R25 // crc>>8&0xFF
	ADD $1024,R10,R10     // &tab[6]
	SLD $2,R25,R25        // crc>>8&0xFF*4
	ADD R25,R10,R25       // &tab[6][crc>>8&0xFF]
	MOVBZ R7,R26          // crc&0xFF
	ADD $1024,R10,R10     // &tab[7]
	MOVWZ 0(R25),R27      // tab[6][crc>>8&0xFF]
	SLD $2,R26,R26        // crc&0xFF*4
	XOR R21,R27,R21       // xor done R27
	ADD R26,R10,R26       // &tab[7][crc&0xFF]
	ADD $8,R5             // p = p[8:]
	MOVWZ 0(R26),R28      // tab[7][crc&0xFF]
	XOR R21,R28,R21       // xor done R28
	MOVWZ R21,R7          // crc for next round
	BC 16,0,loop          // bdnz: dec CTR, loop for next 8 bytes
	ANDCC $7,R6,R8        // any leftover bytes
	BEQ done              // none --> done
	MOVD R8,CTR           // byte count

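// short processes the 0-7 leftover bytes (or an input shorter than 16
// bytes) one at a time using the first table, the classic byte-wise
// step: crc = tab[byte(crc)^v] ^ (crc >> 8).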
short:
	MOVBZ 0(R5),R8  // get v
	MOVBZ R7,R9     // byte(crc) -> R9
	MOVWZ R7,R14
	SRD $8,R14,R14  // crc>>8
	XOR R8,R9,R8    // byte(crc)^v -> R8
	ADD $1,R5       // ptr to next v
	SLD $2,R8       // convert index to byte offset
	ADD R8,R4,R9    // &tab[byte(crc)^v]
	MOVWZ 0(R9),R10 // tab[byte(crc)^v]
	XOR R10,R14,R7  // loop crc in R7
	MOVWZ R7,R7     // 32 bits
	BC 16,0,short   // bdnz: next byte
done:
	NOR R7,R7,R7 // ^crc
	MOVW R7,ret+40(FP) // return crc
	RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

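// vectorCrc32 is the vpmsum-based implementation, declared on the Go
// side as (assumed from the argument offsets below):
//
//	func vectorCrc32(crc uint32, ctab uint32, p []byte) uint32
//
// It works on the input in chunks of up to MAX_SIZE bytes, folding
// 128 bytes per main-loop iteration with VPMSUMD against per-distance
// constants, then reduces the eight accumulators and finishes with a
// Barrett reduction. Inputs under 256 bytes take the short path near
// the end of the function.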
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ crc+0(FP), R3   // incoming crc
	MOVWZ ctab+4(FP), R14 // crc poly id
	MOVD p+8(FP), R4
	MOVD p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD $16,off16
	MOVD $32,off32
	MOVD $48,off48
	MOVD $64,off64
	MOVD $80,off80
	MOVD $96,off96
	MOVD $112,off112
	MOVD $0,R15 // R15 = 0 marks the first (warm up) pass

	MOVD R3,R10 // save initial crc

	NOR R3,R3,R3 // ^crc
	MOVWZ R3,R3  // 32 bits
	VXOR zeroes,zeroes,zeroes   // clear the V reg
	VSPLTISW $-1,V0             // all ones
	VSLDOI $4,V29,V0,mask_32bit // ones in the low 32 bits
	VSLDOI $8,V29,V0,mask_64bit // ones in the low 64 bits

	VXOR V8,V8,V8
	MTVSRD R3,VS40 // crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI $8,zeroes,V8,V8 // shift the initial crc into the low doubleword
#else
	VSLDOI $4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD $·byteswapcons(SB),R3
	LVX (R3),byteswap
#endif

	CMPU R5,$256 // length of bytes
	BLT short

	RLDICR $0,R5,$56,R6 // chunk to process: length rounded down to a multiple of 128

	// First step for larger sizes: cap each pass at MAX_SIZE bytes
l1:	MOVD $32768,R7
	MOVD R7,R9
	CMP R6,R7  // compare R6, R7 (MAX SIZE)
	BGT top    // R6 > MAX, process a MAX_SIZE chunk
	MOVD R6,R7 // R6 <= MAX, process all of it
top:
	SUB R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD $7,R7 // number of 128 byte iterations

	// determine the offset into the constants table to start with.
	// Each constant is 16 bytes, used against 128 bytes of data.
	SLD $4,R7,R8
	SRD $3,R9,R9
	SUB R8,R9,R8

	// The last iteration is reduced in a separate step
	ADD $-1,R7
	MOVD R7,CTR

	// Determine which constant table (depends on poly)
	CMP R14,$1
	BNE castTable
	MOVD $·IEEEConst(SB),R3
	BR startConst
castTable:
	MOVD $·CastConst(SB),R3

startConst:
	ADD R3,R8,R3 // starting point in constants table

	VXOR V0,V0,V0 // clear the V regs
	VXOR V1,V1,V1
	VXOR V2,V2,V2
	VXOR V3,V3,V3
	VXOR V4,V4,V4
	VXOR V5,V5,V5
	VXOR V6,V6,V6
	VXOR V7,V7,V7

	LVX (R3),const1 // loading constant values

	CMP R15,$1 // after the first pass the data is already in V16-V23
	BEQ next

	// First warm up pass: load the bytes to process
	LVX (R4),V16
	LVX (R4+off16),V17
	LVX (R4+off32),V18
	LVX (R4+off48),V19
	LVX (R4+off64),V20
	LVX (R4+off80),V21
	LVX (R4+off96),V22
	LVX (R4+off112),V23
	ADD $128,R4 // bump up to next 128 bytes in buffer

	VXOR V16,V8,V16 // xor in initial CRC in V8

next:
	BC 18,0,first_warm_up_done // bdz: no main loop iterations needed

	ADD $16,R3      // bump up to next constants
	LVX (R3),const2 // table values

	VPMSUMD V16,const1,V8 // second warm up pass
	LVX (R4),V16          // load from buffer
	OR $0,R2,R2 // nop, used as a scheduling hint (same for the ORs below)

	VPMSUMD V17,const1,V9 // vpmsumd with constants
	LVX (R4+off16),V17    // load next from buffer
	OR $0,R2,R2

	VPMSUMD V18,const1,V10 // vpmsumd with constants
	LVX (R4+off32),V18     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V19,const1,V11 // vpmsumd with constants
	LVX (R4+off48),V19     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V20,const1,V12 // vpmsumd with constants
	LVX (R4+off64),V20     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V21,const1,V13 // vpmsumd with constants
	LVX (R4+off80),V21     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V22,const1,V14 // vpmsumd with constants
	LVX (R4+off96),V22     // load next from buffer
	OR $0,R2,R2

	VPMSUMD V23,const1,V15 // vpmsumd with constants
	LVX (R4+off112),V23    // load next from buffer

	ADD $128,R4 // bump up to next 128 bytes in buffer

	BC 18,0,first_cool_down

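// cool_top is the software-pipelined main loop: each iteration xors
// the previous VPMSUMD results into the accumulators V0-V7, multiplies
// the 128 bytes loaded last time by the next constants (alternating
// between const1 and const2), and loads the following 128 bytes.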
cool_top:
	LVX (R3),const1 // constants
	ADD $16,R3      // inc to next constants
	OR $0,R2,R2

	VXOR V0,V8,V0         // xor in previous vpmsumd
	VPMSUMD V16,const2,V8 // vpmsumd with constants
	LVX (R4),V16          // buffer
	OR $0,R2,R2

	VXOR V1,V9,V1         // xor in previous
	VPMSUMD V17,const2,V9 // vpmsumd with constants
	LVX (R4+off16),V17    // next in buffer
	OR $0,R2,R2

	VXOR V2,V10,V2         // xor in previous
	VPMSUMD V18,const2,V10 // vpmsumd with constants
	LVX (R4+off32),V18     // next in buffer
	OR $0,R2,R2

	VXOR V3,V11,V3         // xor in previous
	VPMSUMD V19,const2,V11 // vpmsumd with constants
	LVX (R4+off48),V19     // next in buffer
	LVX (R3),const2        // get next constant
	OR $0,R2,R2

	VXOR V4,V12,V4         // xor in previous
	VPMSUMD V20,const1,V12 // vpmsumd with constants
	LVX (R4+off64),V20     // next in buffer
	OR $0,R2,R2

	VXOR V5,V13,V5         // xor in previous
	VPMSUMD V21,const1,V13 // vpmsumd with constants
	LVX (R4+off80),V21     // next in buffer
	OR $0,R2,R2

	VXOR V6,V14,V6         // xor in previous
	VPMSUMD V22,const1,V14 // vpmsumd with constants
	LVX (R4+off96),V22     // next in buffer
	OR $0,R2,R2

	VXOR V7,V15,V7         // xor in previous
	VPMSUMD V23,const1,V15 // vpmsumd with constants
	LVX (R4+off112),V23    // next in buffer

	ADD $128,R4      // bump up buffer pointer
	BC 16,0,cool_top // are we done?

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX (R3),const1
	ADD $16,R3

	VXOR V0,V8,V0
	VPMSUMD V16,const1,V8
	OR $0,R2,R2

	VXOR V1,V9,V1
	VPMSUMD V17,const1,V9
	OR $0,R2,R2

	VXOR V2,V10,V2
	VPMSUMD V18,const1,V10
	OR $0,R2,R2

	VXOR V3,V11,V3
	VPMSUMD V19,const1,V11
	OR $0,R2,R2

	VXOR V4,V12,V4
	VPMSUMD V20,const1,V12
	OR $0,R2,R2

	VXOR V5,V13,V5
	VPMSUMD V21,const1,V13
	OR $0,R2,R2

	VXOR V6,V14,V6
	VPMSUMD V22,const1,V14
	OR $0,R2,R2

	VXOR V7,V15,V7
	VPMSUMD V23,const1,V15
	OR $0,R2,R2

second_cool_down:

	VXOR V0,V8,V0
	VXOR V1,V9,V1
	VXOR V2,V10,V2
	VXOR V3,V11,V3
	VXOR V4,V12,V4
	VXOR V5,V13,V5
	VXOR V6,V14,V6
	VXOR V7,V15,V7

#ifdef REFLECT
	VSLDOI $4,V0,zeroes,V0
	VSLDOI $4,V1,zeroes,V1
	VSLDOI $4,V2,zeroes,V2
	VSLDOI $4,V3,zeroes,V3
	VSLDOI $4,V4,zeroes,V4
	VSLDOI $4,V5,zeroes,V5
	VSLDOI $4,V6,zeroes,V6
	VSLDOI $4,V7,zeroes,V7
#endif

	LVX (R4),V8
	LVX (R4+off16),V9
	LVX (R4+off32),V10
	LVX (R4+off48),V11
	LVX (R4+off64),V12
	LVX (R4+off80),V13
	LVX (R4+off96),V14
	LVX (R4+off112),V15

	ADD $128,R4

	VXOR V0,V8,V16
	VXOR V1,V9,V17
	VXOR V2,V10,V18
	VXOR V3,V11,V19
	VXOR V4,V12,V20
	VXOR V5,V13,V21
	VXOR V6,V14,V22
	VXOR V7,V15,V23

	MOVD $1,R15 // warm up done; the next pass skips the first load
	CMP $0,R6
	ADD $128,R6

	BNE l1
	ANDCC $127,R5   // tail bytes: len % 128
	SUBC R5,$128,R6 // 128 - tail
	ADD R3,R6,R3    // skip constants for the chunks not present

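// Reduce the final 128 bytes: multiply each accumulator by a constant
// specific to its lane so the eight products can simply be xored
// together, then fold in any remaining 16 byte blocks of input the
// same way.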
	SRD $4,R5,R7
	MOVD R7,CTR
	LVX (R3),V0
	LVX (R3+off16),V1
	LVX (R3+off32),V2
	LVX (R3+off48),V3
	LVX (R3+off64),V4
	LVX (R3+off80),V5
	LVX (R3+off96),V6
	LVX (R3+off112),V7

	ADD $128,R3

	VPMSUMW V16,V0,V0
	VPMSUMW V17,V1,V1
	VPMSUMW V18,V2,V2
	VPMSUMW V19,V3,V3
	VPMSUMW V20,V4,V4
	VPMSUMW V21,V5,V5
	VPMSUMW V22,V6,V6
	VPMSUMW V23,V7,V7

	// now reduce the tail

	CMP $0,R7
	BEQ next1

	LVX (R4),V16
	LVX (R3),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off16),V16
	LVX (R3+off16),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off32),V16
	LVX (R3+off32),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off48),V16
	LVX (R3+off48),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off64),V16
	LVX (R3+off64),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off80),V16
	LVX (R3+off80),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0
	BC 18,0,next1

	LVX (R4+off96),V16
	LVX (R3+off96),V17
	VPMSUMW V16,V17,V16
	VXOR V0,V16,V0

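// Fold the eight accumulators down to a single 128-bit value in V0.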
next1:
	VXOR V0,V1,V0
	VXOR V2,V3,V2
	VXOR V4,V5,V4
	VXOR V6,V7,V6
	VXOR V0,V2,V0
	VXOR V4,V6,V4
	VXOR V0,V4,V0

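// barrett_reduction reduces the 64-bit remainder in V0 to the final
// 32-bit CRC. Roughly, with m the remainder, mu = floor(x^64 / p(x))
// and p(x) the CRC polynomial (both loaded from the *BarConst tables):
//
//	t1  = floor(m / x^32) * mu
//	t2  = floor(t1 / x^32) * p(x)
//	crc = (m + t2) mod x^32
//
// In the reflected case the value is first shifted left one bit and
// the low 32 bits are masked instead, as the REFLECT branch shows.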
barrett_reduction:

	CMP R14,$1
	BNE barcstTable
	MOVD $·IEEEBarConst(SB),R3
	BR startbarConst
barcstTable:
	MOVD $·CastBarConst(SB),R3

startbarConst:
	LVX (R3),const1
	LVX (R3+off16),const2

	VSLDOI $8,V0,V0,V1
	VXOR V0,V1,V0 // xor the two 64 bit halves together

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL V0,V1,V0 // shift left one bit
#endif

	VAND V0,mask_64bit,V0

#ifndef REFLECT

	VPMSUMD V0,const1,V1
	VSLDOI $8,zeroes,V1,V1
	VPMSUMD V1,const2,V1
	VXOR V0,V1,V0
	VSLDOI $8,V0,zeroes,V0

#else

	VAND V0,mask_32bit,V1
	VPMSUMD V1,const1,V1
	VAND V1,mask_32bit,V1
	VPMSUMD V1,const2,V1
	VXOR V0,V1,V0
	VSLDOI $4,V0,zeroes,V0

#endif

	MFVSRD VS32,R3 // VS32 = V0

	NOR R3,R3,R3 // return ^crc
	MOVW R3,ret+32(FP)
	RET

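// first_warm_up_done: the chunk is small enough that no cool_top
// iterations are needed; multiply the one loaded block of input and
// go straight to the final fold in second_cool_down.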
first_warm_up_done:

	LVX (R3),const1
	ADD $16,R3

	VPMSUMD V16,const1,V8
	VPMSUMD V17,const1,V9
	VPMSUMD V18,const1,V10
	VPMSUMD V19,const1,V11
	VPMSUMD V20,const1,V12
	VPMSUMD V21,const1,V13
	VPMSUMD V22,const1,V14
	VPMSUMD V23,const1,V15

	BR second_cool_down

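// short handles inputs of fewer than 256 bytes: each 16 byte chunk is
// multiplied (VPMSUMW) by a constant chosen by its distance from the
// end of the input, taken from the short-constant region at the end
// of the table, and the products are accumulated in V19/V20 before
// the Barrett reduction.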
short:
	CMP $0,R5
	BEQ zero

	// compute short constants

	CMP R14,$1
	BNE castshTable
	MOVD $·IEEEConst(SB),R3
	ADD $4080,R3 // short constants live at the end of the table
	BR startshConst
castshTable:
	MOVD $·CastConst(SB),R3
	ADD $4080,R3

startshConst:
	SUBC R5,$256,R6 // sub from 256
	ADD R3,R6,R3

	// calculate where to start

	SRD $4,R5,R7 // number of 16 byte chunks
	MOVD R7,CTR

	VXOR V19,V19,V19
	VXOR V20,V20,V20

	LVX (R4),V0
	LVX (R3),V16
	VXOR V0,V8,V0 // xor in the initial crc
	VPMSUMW V0,V16,V0
	BC 18,0,v0

	LVX (R4+off16),V1
	LVX (R3+off16),V17
	VPMSUMW V1,V17,V1
	BC 18,0,v1

	LVX (R4+off32),V2
	LVX (R3+off32),V16
	VPMSUMW V2,V16,V2
	BC 18,0,v2

	LVX (R4+off48),V3
	LVX (R3+off48),V17
	VPMSUMW V3,V17,V3
	BC 18,0,v3

	LVX (R4+off64),V4
	LVX (R3+off64),V16
	VPMSUMW V4,V16,V4
	BC 18,0,v4

	LVX (R4+off80),V5
	LVX (R3+off80),V17
	VPMSUMW V5,V17,V5
	BC 18,0,v5

	LVX (R4+off96),V6
	LVX (R3+off96),V16
	VPMSUMW V6,V16,V6
	BC 18,0,v6

	LVX (R4+off112),V7
	LVX (R3+off112),V17
	VPMSUMW V7,V17,V7
	BC 18,0,v7

	ADD $128,R3
	ADD $128,R4

	LVX (R4),V8
	LVX (R3),V16
	VPMSUMW V8,V16,V8
	BC 18,0,v8

	LVX (R4+off16),V9
	LVX (R3+off16),V17
	VPMSUMW V9,V17,V9
	BC 18,0,v9

	LVX (R4+off32),V10
	LVX (R3+off32),V16
	VPMSUMW V10,V16,V10
	BC 18,0,v10

	LVX (R4+off48),V11
	LVX (R3+off48),V17
	VPMSUMW V11,V17,V11
	BC 18,0,v11

	LVX (R4+off64),V12
	LVX (R3+off64),V16
	VPMSUMW V12,V16,V12
	BC 18,0,v12

	LVX (R4+off80),V13
	LVX (R3+off80),V17
	VPMSUMW V13,V17,V13
	BC 18,0,v13

	LVX (R4+off96),V14
	LVX (R3+off96),V16
	VPMSUMW V14,V16,V14
	BC 18,0,v14

	LVX (R4+off112),V15
	LVX (R3+off112),V17
	VPMSUMW V15,V17,V15

	VXOR V19,V15,V19
v14:	VXOR V20,V14,V20
v13:	VXOR V19,V13,V19
v12:	VXOR V20,V12,V20
v11:	VXOR V19,V11,V19
v10:	VXOR V20,V10,V20
v9:	VXOR V19,V9,V19
v8:	VXOR V20,V8,V20
v7:	VXOR V19,V7,V19
v6:	VXOR V20,V6,V20
v5:	VXOR V19,V5,V19
v4:	VXOR V20,V4,V20
v3:	VXOR V19,V3,V19
v2:	VXOR V20,V2,V20
v1:	VXOR V19,V1,V19
v0:	VXOR V20,V0,V20

	VXOR V19,V20,V0

	BR barrett_reduction

zero:
	// Zero-length input: return the original crc unchanged.
	MOVW R10,ret+32(FP)
	RET