Text file src/hash/crc32/crc32_s390x.s
1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6
7 // Vector register range containing CRC-32 constants
8
9 #define CONST_PERM_LE2BE V9
10 #define CONST_R2R1 V10
11 #define CONST_R4R3 V11
12 #define CONST_R5 V12
13 #define CONST_RU_POLY V13
14 #define CONST_CRC_POLY V14
15
16
17 // The CRC-32 constant block contains reduction constants to fold and
18 // process particular chunks of the input data stream in parallel.
19 //
20 // Note that the constant definitions below are extended in order to compute
21 // intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
22 // The rightmost doubleword can be 0 to prevent contribution to the result or
23 // can be multiplied by 1 to perform an XOR without the need for a separate
24 // VECTOR EXCLUSIVE OR instruction.
25 //
26 // The polynomials used are bit-reflected:
27 //
28 // IEEE: P'(x) = 0x0edb88320
29 // Castagnoli: P'(x) = 0x082f63b78
30
31
32 // IEEE polynomial constants
33 DATA ·crclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
34 DATA ·crclecons+8(SB)/8, $0x0706050403020100
35 DATA ·crclecons+16(SB)/8, $0x00000001c6e41596 // R2
36 DATA ·crclecons+24(SB)/8, $0x0000000154442bd4 // R1
37 DATA ·crclecons+32(SB)/8, $0x00000000ccaa009e // R4
38 DATA ·crclecons+40(SB)/8, $0x00000001751997d0 // R3
39 DATA ·crclecons+48(SB)/8, $0x0000000000000000
40 DATA ·crclecons+56(SB)/8, $0x0000000163cd6124 // R5
41 DATA ·crclecons+64(SB)/8, $0x0000000000000000
42 DATA ·crclecons+72(SB)/8, $0x00000001F7011641 // u'
43 DATA ·crclecons+80(SB)/8, $0x0000000000000000
44 DATA ·crclecons+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
45
46 GLOBL ·crclecons(SB),RODATA, $144
47
48 // Castagonli Polynomial constants
49 DATA ·crcclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
50 DATA ·crcclecons+8(SB)/8, $0x0706050403020100
51 DATA ·crcclecons+16(SB)/8, $0x000000009e4addf8 // R2
52 DATA ·crcclecons+24(SB)/8, $0x00000000740eef02 // R1
53 DATA ·crcclecons+32(SB)/8, $0x000000014cd00bd6 // R4
54 DATA ·crcclecons+40(SB)/8, $0x00000000f20c0dfe // R3
55 DATA ·crcclecons+48(SB)/8, $0x0000000000000000
56 DATA ·crcclecons+56(SB)/8, $0x00000000dd45aab8 // R5
57 DATA ·crcclecons+64(SB)/8, $0x0000000000000000
58 DATA ·crcclecons+72(SB)/8, $0x00000000dea713f1 // u'
59 DATA ·crcclecons+80(SB)/8, $0x0000000000000000
60 DATA ·crcclecons+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
61
62 GLOBL ·crcclecons(SB),RODATA, $144
63
64 // func hasVectorFacility() bool
65 TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
66 MOVD $x-24(SP), R1
67 XC $24, 0(R1), 0(R1) // clear the storage
68 MOVD $2, R0 // R0 is the number of double words stored -1
69 WORD $0xB2B01000 // STFLE 0(R1)
70 XOR R0, R0 // reset the value of R0
71 MOVBZ z-8(SP), R1
72 AND $0x40, R1
73 BEQ novector
74 vectorinstalled:
75 // check if the vector instruction has been enabled
76 VLEIB $0, $0xF, V16
77 VLGVB $0, V16, R1
78 CMPBNE R1, $0xF, novector
79 MOVB $1, ret+0(FP) // have vx
80 RET
81 novector:
82 MOVB $0, ret+0(FP) // no vx
83 RET
84
85
86 // The CRC-32 function(s) use these calling conventions:
87 //
88 // Parameters:
89 //
90 // R2: Initial CRC value, typically ~0; and final CRC (return) value.
91 // R3: Input buffer pointer, performance might be improved if the
92 // buffer is on a doubleword boundary.
93 // R4: Length of the buffer, must be 64 bytes or greater.
94 //
95 // Register usage:
96 //
97 // R5: CRC-32 constant pool base pointer.
98 // V0: Initial CRC value and intermediate constants and results.
99 // V1..V4: Data for CRC computation.
100 // V5..V8: Next data chunks that are fetched from the input buffer.
101 //
102 // V9..V14: CRC-32 constants.
103
104 // func vectorizedIEEE(crc uint32, p []byte) uint32
105 TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
106 MOVWZ crc+0(FP), R2 // R2 stores the CRC value
107 MOVD p+8(FP), R3 // data pointer
108 MOVD p_len+16(FP), R4 // len(p)
109
110 MOVD $·crclecons(SB), R5
111 BR vectorizedBody<>(SB)
112
113 // func vectorizedCastagnoli(crc uint32, p []byte) uint32
114 TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
115 MOVWZ crc+0(FP), R2 // R2 stores the CRC value
116 MOVD p+8(FP), R3 // data pointer
117 MOVD p_len+16(FP), R4 // len(p)
118
119 // R5: crc-32 constant pool base pointer, constant is used to reduce crc
120 MOVD $·crcclecons(SB), R5
121 BR vectorizedBody<>(SB)
122
123 TEXT vectorizedBody<>(SB),NOSPLIT,$0
124 XOR $0xffffffff, R2 // NOTW R2
125 VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
126
127 // Load the initial CRC value into the rightmost word of V0
128 VZERO V0
129 VLVGF $3, R2, V0
130
131 // Crash if the input size is less than 64-bytes.
132 CMP R4, $64
133 BLT crash
134
135 // Load a 64-byte data chunk and XOR with CRC
136 VLM 0(R3), V1, V4 // 64-bytes into V1..V4
137
138 // Reflect the data if the CRC operation is in the bit-reflected domain
139 VPERM V1, V1, CONST_PERM_LE2BE, V1
140 VPERM V2, V2, CONST_PERM_LE2BE, V2
141 VPERM V3, V3, CONST_PERM_LE2BE, V3
142 VPERM V4, V4, CONST_PERM_LE2BE, V4
143
144 VX V0, V1, V1 // V1 ^= CRC
145 ADD $64, R3 // BUF = BUF + 64
146 ADD $(-64), R4
147
148 // Check remaining buffer size and jump to proper folding method
149 CMP R4, $64
150 BLT less_than_64bytes
151
152 fold_64bytes_loop:
153 // Load the next 64-byte data chunk into V5 to V8
154 VLM 0(R3), V5, V8
155 VPERM V5, V5, CONST_PERM_LE2BE, V5
156 VPERM V6, V6, CONST_PERM_LE2BE, V6
157 VPERM V7, V7, CONST_PERM_LE2BE, V7
158 VPERM V8, V8, CONST_PERM_LE2BE, V8
159
160
161 // Perform a GF(2) multiplication of the doublewords in V1 with
162 // the reduction constants in V0. The intermediate result is
163 // then folded (accumulated) with the next data chunk in V5 and
164 // stored in V1. Repeat this step for the register contents
165 // in V2, V3, and V4 respectively.
166
167 VGFMAG CONST_R2R1, V1, V5, V1
168 VGFMAG CONST_R2R1, V2, V6, V2
169 VGFMAG CONST_R2R1, V3, V7, V3
170 VGFMAG CONST_R2R1, V4, V8 ,V4
171
172 // Adjust buffer pointer and length for next loop
173 ADD $64, R3 // BUF = BUF + 64
174 ADD $(-64), R4 // LEN = LEN - 64
175
176 CMP R4, $64
177 BGE fold_64bytes_loop
178
179 less_than_64bytes:
180 // Fold V1 to V4 into a single 128-bit value in V1
181 VGFMAG CONST_R4R3, V1, V2, V1
182 VGFMAG CONST_R4R3, V1, V3, V1
183 VGFMAG CONST_R4R3, V1, V4, V1
184
185 // Check whether to continue with 64-bit folding
186 CMP R4, $16
187 BLT final_fold
188
189 fold_16bytes_loop:
190 VL 0(R3), V2 // Load next data chunk
191 VPERM V2, V2, CONST_PERM_LE2BE, V2
192
193 VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
194
195 // Adjust buffer pointer and size for folding next data chunk
196 ADD $16, R3
197 ADD $-16, R4
198
199 // Process remaining data chunks
200 CMP R4 ,$16
201 BGE fold_16bytes_loop
202
203 final_fold:
204 VLEIB $7, $0x40, V9
205 VSRLB V9, CONST_R4R3, V0
206 VLEIG $0, $1, V0
207
208 VGFMG V0, V1, V1
209
210 VLEIB $7, $0x20, V9 // Shift by words
211 VSRLB V9, V1, V2 // Store remaining bits in V2
212 VUPLLF V1, V1 // Split rightmost doubleword
213 VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
214
215
216 // The input values to the Barret reduction are the degree-63 polynomial
217 // in V1 (R(x)), degree-32 generator polynomial, and the reduction
218 // constant u. The Barret reduction result is the CRC value of R(x) mod
219 // P(x).
220 //
221 // The Barret reduction algorithm is defined as:
222 //
223 // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
224 // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
225 // 3. C(x) = R(x) XOR T2(x) mod x^32
226 //
227 // Note: To compensate the division by x^32, use the vector unpack
228 // instruction to move the leftmost word into the leftmost doubleword
229 // of the vector register. The rightmost doubleword is multiplied
230 // with zero to not contribute to the intermediate results.
231
232
233 // T1(x) = floor( R(x) / x^32 ) GF2MUL u
234 VUPLLF V1, V2
235 VGFMG CONST_RU_POLY, V2, V2
236
237
238 // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
239 // V2 and XOR the intermediate result, T2(x), with the value in V1.
240 // The final result is in the rightmost word of V2.
241
242 VUPLLF V2 , V2
243 VGFMAG CONST_CRC_POLY, V2, V1, V2
244
245 done:
246 VLGVF $2, V2, R2
247 XOR $0xffffffff, R2 // NOTW R2
248 MOVWZ R2, ret + 32(FP)
249 RET
250
251 crash:
252 MOVD $0, (R0) // input size is less than 64-bytes
View as plain text