// Text file src/internal/bytealg/indexbyte_ppc64x.s
1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build ppc64 ppc64le
6
7 #include "go_asm.h"
8 #include "textflag.h"
9
10 TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
11 MOVD b_base+0(FP), R3 // R3 = byte array pointer
12 MOVD b_len+8(FP), R4 // R4 = length
13 MOVBZ c+24(FP), R5 // R5 = byte
14 MOVD $ret+32(FP), R14 // R14 = &ret
15 BR indexbytebody<>(SB)
16
17 TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
18 MOVD s_base+0(FP), R3 // R3 = string
19 MOVD s_len+8(FP), R4 // R4 = length
20 MOVBZ c+16(FP), R5 // R5 = byte
21 MOVD $ret+24(FP), R14 // R14 = &ret
22 BR indexbytebody<>(SB)
23
24 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
25 MOVD R3,R17 // Save base address for calculating the index later.
26 RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
27 RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
28 ADD R4,R3,R7 // Last acceptable address in R7.
29 DCBT (R8) // Prepare cache line.
30
31 RLDIMI $16,R5,$32,R5
32 CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently.
33 MOVD $-1,R9
34 WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
35 RLDIMI $32,R5,$0,R5
36 MOVD R7,R10 // Save last acceptable address in R10 for later.
37 ADD $-1,R7,R7
38 #ifdef GOARCH_ppc64le
39 SLD R6,R9,R9 // Prepare mask for Little Endian
40 #else
41 SRD R6,R9,R9 // Same for Big Endian
42 #endif
43 BLE small_string // Jump to the small string case if it's ≤32 bytes.
44
45 // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
46 // in V0, V1 and V10, then branch to the preloop.
47 ANDCC $63,R3,R11
48 BEQ CR0,qw_align
49 RLDICL $0,R3,$61,R11
50
51 MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
52 CMPB R12,R5,R3 // Check for a match.
53 AND R9,R3,R3 // Mask bytes below s_base
54 RLDICL $0,R7,$61,R6 // length-1
55 RLDICR $0,R7,$60,R7 // Last doubleword in R7
56 CMPU R3,$0,CR7 // If we have a match, jump to the final computation
57 BNE CR7,done
58 ADD $8,R8,R8
59 ADD $-8,R4,R4
60 ADD R4,R11,R4
61
62 // Check for quadword alignment
63 ANDCC $15,R8,R11
64 BEQ CR0,qw_align
65
66 // Not aligned, so handle the next doubleword
67 MOVD 0(R8),R12
68 CMPB R12,R5,R3
69 CMPU R3,$0,CR7
70 BNE CR7,done
71 ADD $8,R8,R8
72 ADD $-8,R4,R4
73
74 // Either quadword aligned or 64-byte at this point. We can use LVX.
75 qw_align:
76
77 // Set up auxiliary data for the vectorized algorithm.
78 VSPLTISB $0,V0 // Replicate 0 across V0
79 VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
80 MTVRD R5,V1
81 LVSL (R0+R0),V11
82 VSLB V11,V10,V10
83 VSPLTB $7,V1,V1 // Replicate byte across V1
84 CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
85 BLE tail
86
87 // We will load 4 quardwords per iteration in the loop, so check for
88 // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
89 ANDCC $63,R8,R11
90 BEQ CR0,preloop
91
92 // Not 64-byte aligned. Load one quadword at a time until aligned.
93 LVX (R8+R0),V4
94 VCMPEQUBCC V1,V4,V6 // Check for byte in V4
95 BNE CR6,found_qw_align
96 ADD $16,R8,R8
97 ADD $-16,R4,R4
98
99 ANDCC $63,R8,R11
100 BEQ CR0,preloop
101 LVX (R8+R0),V4
102 VCMPEQUBCC V1,V4,V6 // Check for byte in V4
103 BNE CR6,found_qw_align
104 ADD $16,R8,R8
105 ADD $-16,R4,R4
106
107 ANDCC $63,R8,R11
108 BEQ CR0,preloop
109 LVX (R8+R0),V4
110 VCMPEQUBCC V1,V4,V6 // Check for byte in V4
111 BNE CR6,found_qw_align
112 ADD $-16,R4,R4
113 ADD $16,R8,R8
114
115 // 64-byte aligned. Prepare for the main loop.
116 preloop:
117 CMPU R4,$64
118 BLE tail // If len ≤ 64, don't use the vectorized loop
119
120 // We are now aligned to a 64-byte boundary. We will load 4 quadwords
121 // per loop iteration. The last doubleword is in R10, so our loop counter
122 // starts at (R10-R8)/64.
123 SUB R8,R10,R6
124 SRD $6,R6,R9 // Loop counter in R9
125 MOVD R9,CTR
126
127 ADD $-64,R8,R8 // Adjust index for loop entry
128 MOVD $16,R11 // Load offsets for the vector loads
129 MOVD $32,R9
130 MOVD $48,R7
131
132 // Main loop we will load 64 bytes per iteration
133 loop:
134 ADD $64,R8,R8 // Fuse addi+lvx for performance
135 LVX (R8+R0),V2 // Load 4 16-byte vectors
136 LVX (R8+R11),V3
137 VCMPEQUB V1,V2,V6 // Look for byte in each vector
138 VCMPEQUB V1,V3,V7
139
140 LVX (R8+R9),V4
141 LVX (R8+R7),V5
142 VCMPEQUB V1,V4,V8
143 VCMPEQUB V1,V5,V9
144
145 VOR V6,V7,V11 // Compress the result in a single vector
146 VOR V8,V9,V12
147 VOR V11,V12,V13
148 VCMPEQUBCC V0,V13,V14 // Check for byte
149 BGE CR6,found
150 BC 16,0,loop // bdnz loop
151
152 // Handle the tailing bytes or R4 ≤ 64
153 RLDICL $0,R6,$58,R4
154 ADD $64,R8,R8
155 tail:
156 CMPU R4,$0
157 BEQ notfound
158 LVX (R8+R0),V4
159 VCMPEQUBCC V1,V4,V6
160 BNE CR6,found_qw_align
161 ADD $16,R8,R8
162 CMPU R4,$16,CR6
163 BLE CR6,notfound
164 ADD $-16,R4,R4
165
166 LVX (R8+R0),V4
167 VCMPEQUBCC V1,V4,V6
168 BNE CR6,found_qw_align
169 ADD $16,R8,R8
170 CMPU R4,$16,CR6
171 BLE CR6,notfound
172 ADD $-16,R4,R4
173
174 LVX (R8+R0),V4
175 VCMPEQUBCC V1,V4,V6
176 BNE CR6,found_qw_align
177 ADD $16,R8,R8
178 CMPU R4,$16,CR6
179 BLE CR6,notfound
180 ADD $-16,R4,R4
181
182 LVX (R8+R0),V4
183 VCMPEQUBCC V1,V4,V6
184 BNE CR6,found_qw_align
185
186 notfound:
187 MOVD $-1,R3
188 MOVD R3,(R14)
189 RET
190
191 found:
192 // We will now compress the results into a single doubleword,
193 // so it can be moved to a GPR for the final index calculation.
194
195 // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
196 // first bit of each byte into bits 48-63.
197 VBPERMQ V6,V10,V6
198 VBPERMQ V7,V10,V7
199 VBPERMQ V8,V10,V8
200 VBPERMQ V9,V10,V9
201
202 // Shift each 16-bit component into its correct position for
203 // merging into a single doubleword.
204 #ifdef GOARCH_ppc64le
205 VSLDOI $2,V7,V7,V7
206 VSLDOI $4,V8,V8,V8
207 VSLDOI $6,V9,V9,V9
208 #else
209 VSLDOI $6,V6,V6,V6
210 VSLDOI $4,V7,V7,V7
211 VSLDOI $2,V8,V8,V8
212 #endif
213
214 // Merge V6-V9 into a single doubleword and move to a GPR.
215 VOR V6,V7,V11
216 VOR V8,V9,V4
217 VOR V4,V11,V4
218 MFVRD V4,R3
219
220 #ifdef GOARCH_ppc64le
221 ADD $-1,R3,R11
222 ANDN R3,R11,R11
223 POPCNTD R11,R11 // Count trailing zeros (Little Endian).
224 #else
225 CNTLZD R3,R11 // Count leading zeros (Big Endian).
226 #endif
227 ADD R8,R11,R3 // Calculate byte address
228
229 return:
230 SUB R17,R3
231 MOVD R3,(R14)
232 RET
233
234 found_qw_align:
235 // Use the same algorithm as above. Compress the result into
236 // a single doubleword and move it to a GPR for the final
237 // calculation.
238 VBPERMQ V6,V10,V6
239
240 #ifdef GOARCH_ppc64le
241 MFVRD V6,R3
242 ADD $-1,R3,R11
243 ANDN R3,R11,R11
244 POPCNTD R11,R11
245 #else
246 VSLDOI $6,V6,V6,V6
247 MFVRD V6,R3
248 CNTLZD R3,R11
249 #endif
250 ADD R8,R11,R3
251 CMPU R11,R4
252 BLT return
253 BR notfound
254
255 done:
256 // At this point, R3 has 0xFF in the same position as the byte we are
257 // looking for in the doubleword. Use that to calculate the exact index
258 // of the byte.
259 #ifdef GOARCH_ppc64le
260 ADD $-1,R3,R11
261 ANDN R3,R11,R11
262 POPCNTD R11,R11 // Count trailing zeros (Little Endian).
263 #else
264 CNTLZD R3,R11 // Count leading zeros (Big Endian).
265 #endif
266 CMPU R8,R7 // Check if we are at the last doubleword.
267 SRD $3,R11 // Convert trailing zeros to bytes.
268 ADD R11,R8,R3
269 CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
270 BNE return
271 BLE CR7,return
272 BR notfound
273
274 small_string:
275 // We unroll this loop for better performance.
276 CMPU R4,$0 // Check for length=0
277 BEQ notfound
278
279 MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
280 CMPB R12,R5,R3 // Check for a match.
281 AND R9,R3,R3 // Mask bytes below s_base.
282 CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
283 RLDICL $0,R7,$61,R6 // length-1
284 RLDICR $0,R7,$60,R7 // Last doubleword in R7.
285 CMPU R8,R7
286 BNE CR7,done
287 BEQ notfound // Hit length.
288
289 MOVDU 8(R8),R12
290 CMPB R12,R5,R3
291 CMPU R3,$0,CR6
292 CMPU R8,R7
293 BNE CR6,done
294 BEQ notfound
295
296 MOVDU 8(R8),R12
297 CMPB R12,R5,R3
298 CMPU R3,$0,CR6
299 CMPU R8,R7
300 BNE CR6,done
301 BEQ notfound
302
303 MOVDU 8(R8),R12
304 CMPB R12,R5,R3
305 CMPU R3,$0,CR6
306 CMPU R8,R7
307 BNE CR6,done
308 BEQ notfound
309
310 MOVDU 8(R8),R12
311 CMPB R12,R5,R3
312 CMPU R3,$0,CR6
313 BNE CR6,done
314 BR notfound
315
// End of file.