Text file src/pkg/math/big/arith_ppc64x.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
12 // func mulWW(x, y Word) (z1, z0 Word)
13 TEXT ·mulWW(SB), NOSPLIT, $0 // z1:z0 = x*y as a double-word unsigned product
14 MOVD x+0(FP), R4 // R4 = x
15 MOVD y+8(FP), R5 // R5 = y
16 MULHDU R4, R5, R6 // R6 = high-order word of x*y (unsigned)
17 MULLD R4, R5, R7 // R7 = low-order word of x*y
18 MOVD R6, z1+16(FP) // z1 = high-order word
19 MOVD R7, z0+24(FP) // z0 = low-order word
20 RET
21
22 // func addVV(z, x, y []Word) (c Word)
23 // z[i] = x[i] + y[i] for all i, carrying; c is the carry out of the last word
24 TEXT ·addVV(SB), NOSPLIT, $0
25 MOVD z_len+8(FP), R7 // R7 = z_len
26 MOVD x+24(FP), R8 // R8 = x[]
27 MOVD y+48(FP), R9 // R9 = y[]
28 MOVD z+0(FP), R10 // R10 = z[]
29
30 // If z_len = 0, we are done
31 CMP R0, R7 // R0 is used as the constant zero throughout
32 MOVD R0, R4 // R4 = c = 0
33 BEQ done
34
35 // Process the first iteration out of the loop so we can
36 // use MOVDU and avoid 3 index registers updates.
37 MOVD 0(R8), R11 // R11 = x[i]
38 MOVD 0(R9), R12 // R12 = y[i]
39 ADD $-1, R7 // R7 = z_len - 1
40 ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
41 CMP R0, R7
42 MOVD R15, 0(R10) // z[i]
43 BEQ final // If z_len was 1, we are done
44
45 SRD $2, R7, R5 // R5 = (z_len-1)/4, the number of 4-word iterations
46 CMP R0, R5
47 MOVD R5, CTR // Set up loop counter
48 BEQ tail // If R5 = 0, we can't use the loop
49
50 // Process 4 elements per iteration. Unrolling this loop
51 // means a performance trade-off: we will lose performance
52 // for small values of z_len (0.90x in the worst case), but
53 // gain significant performance as z_len increases (up to
54 // 1.45x).
55 loop:
56 MOVD 8(R8), R11 // R11 = x[i]
57 MOVD 16(R8), R12 // R12 = x[i+1]
58 MOVD 24(R8), R14 // R14 = x[i+2]
59 MOVDU 32(R8), R15 // R15 = x[i+3]; MOVDU also advances R8 by 32
60 MOVD 8(R9), R16 // R16 = y[i]
61 MOVD 16(R9), R17 // R17 = y[i+1]
62 MOVD 24(R9), R18 // R18 = y[i+2]
63 MOVDU 32(R9), R19 // R19 = y[i+3]; MOVDU also advances R9 by 32
64 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
65 ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
66 ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
67 ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
68 MOVD R20, 8(R10) // z[i]
69 MOVD R21, 16(R10) // z[i+1]
70 MOVD R22, 24(R10) // z[i+2]
71 MOVDU R23, 32(R10) // z[i+3]; MOVDU also advances R10 by 32
72 ADD $-4, R7 // R7 -= 4 (elements still to process); the next ADDE relies on CA surviving this
73 BC 16, 0, loop // bdnz: decrement CTR and branch while CTR != 0
74
75 // We may have more elements to read
76 CMP R0, R7
77 BEQ final
78
79 // Process the remaining elements, one at a time
80 tail:
81 MOVDU 8(R8), R11 // R11 = x[i]
82 MOVDU 8(R9), R16 // R16 = y[i]
83 ADD $-1, R7 // R7 = remaining elements - 1
84 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
85 CMP R0, R7
86 MOVDU R20, 8(R10) // z[i]
87 BEQ final // If R7 = 0, we are done
88
89 MOVDU 8(R8), R11 // second leftover element: R11 = x[i]
90 MOVDU 8(R9), R16 // R16 = y[i]
91 ADD $-1, R7 // R7 = remaining elements - 1
92 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
93 CMP R0, R7
94 MOVDU R20, 8(R10) // z[i]
95 BEQ final // If R7 = 0, we are done
96
97 MOVD 8(R8), R11 // third and last possible leftover element: R11 = x[i]
98 MOVD 8(R9), R16 // R16 = y[i]
99 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
100 MOVD R20, 8(R10) // z[i]
101
102 final:
103 ADDZE R4 // Capture CA: R4 was zeroed above, so c = CA
104
105 done:
106 MOVD R4, c+72(FP) // return c
107 RET
108
109 // func subVV(z, x, y []Word) (c Word)
110 // z[i] = x[i] - y[i] for all i, borrowing; c is the borrow out of the last word
111 TEXT ·subVV(SB), NOSPLIT, $0
112 MOVD z_len+8(FP), R7 // R7 = z_len
113 MOVD x+24(FP), R8 // R8 = x[]
114 MOVD y+48(FP), R9 // R9 = y[]
115 MOVD z+0(FP), R10 // R10 = z[]
116
117 // If z_len = 0, we are done
118 CMP R0, R7 // R0 is used as the constant zero throughout
119 MOVD R0, R4 // R4 = c = 0
120 BEQ done
121
122 // Process the first iteration out of the loop so we can
123 // use MOVDU and avoid 3 index registers updates.
124 MOVD 0(R8), R11 // R11 = x[i]
125 MOVD 0(R9), R12 // R12 = y[i]
126 ADD $-1, R7 // R7 = z_len - 1
127 SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA
128 CMP R0, R7
129 MOVD R15, 0(R10) // z[i]
130 BEQ final // If z_len was 1, we are done
131
132 SRD $2, R7, R5 // R5 = (z_len-1)/4, the number of 4-word iterations
133 CMP R0, R5
134 MOVD R5, CTR // Set up loop counter
135 BEQ tail // If R5 = 0, we can't use the loop
136
137 // Process 4 elements per iteration. Unrolling this loop
138 // means a performance trade-off: we will lose performance
139 // for small values of z_len (0.92x in the worst case), but
140 // gain significant performance as z_len increases (up to
141 // 1.45x).
142 loop:
143 MOVD 8(R8), R11 // R11 = x[i]
144 MOVD 16(R8), R12 // R12 = x[i+1]
145 MOVD 24(R8), R14 // R14 = x[i+2]
146 MOVDU 32(R8), R15 // R15 = x[i+3]; MOVDU also advances R8 by 32
147 MOVD 8(R9), R16 // R16 = y[i]
148 MOVD 16(R9), R17 // R17 = y[i+1]
149 MOVD 24(R9), R18 // R18 = y[i+2]
150 MOVDU 32(R9), R19 // R19 = y[i+3]; MOVDU also advances R9 by 32
151 SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
152 SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA
153 SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA
154 SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA
155 MOVD R20, 8(R10) // z[i]
156 MOVD R21, 16(R10) // z[i+1]
157 MOVD R22, 24(R10) // z[i+2]
158 MOVDU R23, 32(R10) // z[i+3]; MOVDU also advances R10 by 32
159 ADD $-4, R7 // R7 -= 4 (elements still to process); the next SUBE relies on CA surviving this
160 BC 16, 0, loop // bdnz: decrement CTR and branch while CTR != 0
161
162 // We may have more elements to read
163 CMP R0, R7
164 BEQ final
165
166 // Process the remaining elements, one at a time
167 tail:
168 MOVDU 8(R8), R11 // R11 = x[i]
169 MOVDU 8(R9), R16 // R16 = y[i]
170 ADD $-1, R7 // R7 = remaining elements - 1
171 SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
172 CMP R0, R7
173 MOVDU R20, 8(R10) // z[i]
174 BEQ final // If R7 = 0, we are done
175
176 MOVDU 8(R8), R11 // second leftover element: R11 = x[i]
177 MOVDU 8(R9), R16 // R16 = y[i]
178 ADD $-1, R7 // R7 = remaining elements - 1
179 SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
180 CMP R0, R7
181 MOVDU R20, 8(R10) // z[i]
182 BEQ final // If R7 = 0, we are done
183
184 MOVD 8(R8), R11 // third and last possible leftover element: R11 = x[i]
185 MOVD 8(R9), R16 // R16 = y[i]
186 SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
187 MOVD R20, 8(R10) // z[i]
188
189 final:
190 ADDZE R4 // Capture CA: R4 was zeroed above, so R4 = CA
191 XOR $1, R4 // c = CA ^ 1: after a PowerPC subtract, CA = 0 means a borrow occurred
192
193 done:
194 MOVD R4, c+72(FP) // return c
195 RET
196
197 // func addVW(z, x []Word, y Word) (c Word)
198 TEXT ·addVW(SB), NOSPLIT, $0 // z = x + y (y added to the lowest word, carry propagated); c = carry out
199 MOVD z+0(FP), R10 // R10 = z[]
200 MOVD x+24(FP), R8 // R8 = x[]
201 MOVD y+48(FP), R4 // R4 = y = c
202 MOVD z_len+8(FP), R11 // R11 = z_len
203
204 CMP R0, R11 // If z_len is zero, return
205 BEQ done // with c = y, since R4 still holds y
206
207 // We will process the first iteration out of the loop so we capture
208 // the value of c. In the subsequent iterations, we will rely on the
209 // value of CA set here.
210 MOVD 0(R8), R20 // R20 = x[i]
211 ADD $-1, R11 // R11 = z_len - 1
212 ADDC R20, R4, R6 // R6 = x[i] + c
213 CMP R0, R11 // If z_len was 1, we are done
214 MOVD R6, 0(R10) // z[i]
215 BEQ final
216
217 // We will read 4 elements per iteration
218 SRD $2, R11, R9 // R9 = (z_len-1)/4, the number of 4-word iterations
219 DCBT (R8) // data cache block touch: prefetch hint for x
220 CMP R0, R9
221 MOVD R9, CTR // Set up the loop counter
222 BEQ tail // If R9 = 0, we can't use the loop
223
224 loop:
225 MOVD 8(R8), R20 // R20 = x[i]
226 MOVD 16(R8), R21 // R21 = x[i+1]
227 MOVD 24(R8), R22 // R22 = x[i+2]
228 MOVDU 32(R8), R23 // R23 = x[i+3]; MOVDU also advances R8 by 32
229 ADDZE R20, R24 // R24 = x[i] + CA
230 ADDZE R21, R25 // R25 = x[i+1] + CA
231 ADDZE R22, R26 // R26 = x[i+2] + CA
232 ADDZE R23, R27 // R27 = x[i+3] + CA
233 MOVD R24, 8(R10) // z[i]
234 MOVD R25, 16(R10) // z[i+1]
235 MOVD R26, 24(R10) // z[i+2]
236 MOVDU R27, 32(R10) // z[i+3]; MOVDU also advances R10 by 32
237 ADD $-4, R11 // R11 -= 4 (elements still to process); the next ADDZE relies on CA surviving this
238 BC 16, 0, loop // bdnz: decrement CTR and branch while CTR != 0
239
240 // We may have some elements to read
241 CMP R0, R11
242 BEQ final
243
244 tail:
245 MOVDU 8(R8), R20 // first leftover element: R20 = x[i]
246 ADDZE R20, R24 // R24 = x[i] + CA
247 ADD $-1, R11 // R11 = remaining elements - 1
248 MOVDU R24, 8(R10) // z[i]
249 CMP R0, R11
250 BEQ final // If R11 = 0, we are done
251
252 MOVDU 8(R8), R20 // second leftover element: R20 = x[i]
253 ADDZE R20, R24 // R24 = x[i] + CA
254 ADD $-1, R11 // R11 = remaining elements - 1
255 MOVDU R24, 8(R10) // z[i]
256 CMP R0, R11
257 BEQ final // If R11 = 0, we are done
258
259 MOVD 8(R8), R20 // third and last possible leftover element: R20 = x[i]
260 ADDZE R20, R24 // R24 = x[i] + CA
261 MOVD R24, 8(R10) // z[i]
262
263 final:
264 ADDZE R0, R4 // c = CA (R4 = R0 + CA, with R0 used as zero)
265 done:
266 MOVD R4, c+56(FP) // return c
267 RET
268
269 // func subVW(z, x []Word, y Word) (c Word)
270 TEXT ·subVW(SB), NOSPLIT, $0 // z = x - y (y subtracted from the lowest word, borrow propagated); c = borrow out
271 MOVD z+0(FP), R10 // R10 = z[]
272 MOVD x+24(FP), R8 // R8 = x[]
273 MOVD y+48(FP), R4 // R4 = y = c
274 MOVD z_len+8(FP), R11 // R11 = z_len
275
276 CMP R0, R11 // If z_len is zero, return
277 BEQ done // with c = y, since R4 still holds y
278
279 // We will process the first iteration out of the loop so we capture
280 // the value of c. In the subsequent iterations, we will rely on the
281 // value of CA set here.
282 MOVD 0(R8), R20 // R20 = x[i]
283 ADD $-1, R11 // R11 = z_len - 1
284 SUBC R4, R20, R6 // R6 = x[i] - c
285 CMP R0, R11 // If z_len was 1, we are done
286 MOVD R6, 0(R10) // z[i]
287 BEQ final
288
289 // We will read 4 elements per iteration
290 SRD $2, R11, R9 // R9 = (z_len-1)/4, the number of 4-word iterations
291 DCBT (R8) // data cache block touch: prefetch hint for x
292 CMP R0, R9
293 MOVD R9, CTR // Set up the loop counter
294 BEQ tail // If R9 = 0, we can't use the loop
295
296 // The loop here is almost the same as the one used in s390x, but
297 // we don't need to capture CA every iteration because we've already
298 // done that above.
299 loop:
300 MOVD 8(R8), R20 // R20 = x[i]
301 MOVD 16(R8), R21 // R21 = x[i+1]
302 MOVD 24(R8), R22 // R22 = x[i+2]
303 MOVDU 32(R8), R23 // R23 = x[i+3]; MOVDU also advances R8 by 32
304 SUBE R0, R20 // R20 = x[i] - 0 - borrow (borrow is carried in CA)
305 SUBE R0, R21 // R21 = x[i+1] - 0 - borrow
306 SUBE R0, R22 // R22 = x[i+2] - 0 - borrow
307 SUBE R0, R23 // R23 = x[i+3] - 0 - borrow
308 MOVD R20, 8(R10) // z[i]
309 MOVD R21, 16(R10) // z[i+1]
310 MOVD R22, 24(R10) // z[i+2]
311 MOVDU R23, 32(R10) // z[i+3]; MOVDU also advances R10 by 32
312 ADD $-4, R11 // R11 -= 4 (elements still to process); the next SUBE relies on CA surviving this
313 BC 16, 0, loop // bdnz: decrement CTR and branch while CTR != 0
314
315 // We may have some elements to read
316 CMP R0, R11
317 BEQ final
318
319 tail:
320 MOVDU 8(R8), R20 // first leftover element: R20 = x[i]
321 SUBE R0, R20 // R20 = x[i] - 0 - borrow
322 ADD $-1, R11 // R11 = remaining elements - 1
323 MOVDU R20, 8(R10) // z[i]
324 CMP R0, R11
325 BEQ final // If R11 = 0, we are done
326
327 MOVDU 8(R8), R20 // second leftover element: R20 = x[i]
328 SUBE R0, R20 // R20 = x[i] - 0 - borrow
329 ADD $-1, R11 // R11 = remaining elements - 1
330 MOVDU R20, 8(R10) // z[i]
331 CMP R0, R11
332 BEQ final // If R11 = 0, we are done
333
334 MOVD 8(R8), R20 // third and last possible leftover element: R20 = x[i]
335 SUBE R0, R20 // R20 = x[i] - 0 - borrow
336 MOVD R20, 8(R10) // z[i]
337
338 final:
339 // Capture CA
340 SUBE R4, R4 // R4 = R4 - R4 - borrow: 0 if no borrow, -1 if a borrow is pending
341 NEG R4, R4 // c = 0 or 1
342
343 done:
344 MOVD R4, c+56(FP) // return c
345 RET
346
347 TEXT ·shlVU(SB), NOSPLIT, $0 // no assembly implementation in this file
348 BR ·shlVU_g(SB) // tail-branch to shlVU_g, defined elsewhere (presumably the generic Go version in arith.go — confirm)
349
350 TEXT ·shrVU(SB), NOSPLIT, $0 // no assembly implementation in this file
351 BR ·shrVU_g(SB) // tail-branch to shrVU_g, defined elsewhere (presumably the generic Go version in arith.go — confirm)
352
353 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
354 TEXT ·mulAddVWW(SB), NOSPLIT, $0 // z = x*y + r, one word at a time; c = final high-order carry word
355 MOVD z+0(FP), R10 // R10 = z[]
356 MOVD x+24(FP), R8 // R8 = x[]
357 MOVD y+48(FP), R9 // R9 = y
358 MOVD r+56(FP), R4 // R4 = r = c
359 MOVD z_len+8(FP), R11 // R11 = z_len
360
361 CMP R0, R11 // If z_len is zero, return c = r (R0 is used as the constant zero)
362 BEQ done
363
364 MOVD 0(R8), R20 // R20 = x[i]; first element is handled outside the loop
365 ADD $-1, R11 // R11 = z_len - 1
366 MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
367 MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
368 ADDC R4, R6 // R6 = z0 + r
369 ADDZE R7 // R7 = z1 + CA
370 CMP R0, R11
371 MOVD R7, R4 // R4 = c
372 MOVD R6, 0(R10) // z[i]
373 BEQ done // If z_len was 1, we are done
374
375 // We will read 4 elements per iteration
376 SRD $2, R11, R14 // R14 = (z_len-1)/4, the number of 4-word iterations
377 DCBT (R8) // data cache block touch: prefetch hint for x
378 CMP R0, R14
379 MOVD R14, CTR // Set up the loop counter
380 BEQ tail // If R14 = 0, we can't use the loop
381
382 loop:
383 MOVD 8(R8), R20 // R20 = x[i]
384 MOVD 16(R8), R21 // R21 = x[i+1]
385 MOVD 24(R8), R22 // R22 = x[i+2]
386 MOVDU 32(R8), R23 // R23 = x[i+3]; MOVDU also advances R8 by 32
387 MULLD R9, R20, R24 // R24 = z0[i]
388 MULHDU R9, R20, R20 // R20 = z1[i]
389 ADDC R4, R24 // R24 = z0[i] + c
390 ADDZE R20 // R20 = z1[i] + CA
391 MULLD R9, R21, R25 // R25 = z0[i+1]
392 MULHDU R9, R21, R21 // R21 = z1[i+1]
393 ADDC R20, R25 // R25 = z0[i+1] + carry word from element i
394 ADDZE R21 // R21 = z1[i+1] + CA
395 MULLD R9, R22, R26 // R26 = z0[i+2]
396 MULHDU R9, R22, R22 // R22 = z1[i+2]
397 ADDC R21, R26 // R26 = z0[i+2] + carry word from element i+1
398 ADDZE R22 // R22 = z1[i+2] + CA
399 MULLD R9, R23, R27 // R27 = z0[i+3]
400 MULHDU R9, R23, R23 // R23 = z1[i+3]
401 ADDC R22, R27 // R27 = z0[i+3] + carry word from element i+2
402 ADDZE R23 // R23 = z1[i+3] + CA
403 MOVD R24, 8(R10) // z[i]
404 MOVD R25, 16(R10) // z[i+1]
405 MOVD R26, 24(R10) // z[i+2]
406 MOVDU R27, 32(R10) // z[i+3]; MOVDU also advances R10 by 32
407 MOVD R23, R4 // R4 = c
408 ADD $-4, R11 // R11 -= 4 (elements still to process)
409 BC 16, 0, loop // bdnz: decrement CTR and branch while CTR != 0
410
411 // We may have some elements to read
412 CMP R0, R11
413 BEQ done
414
415 // Process the remaining elements, one at a time
416 tail:
417 MOVDU 8(R8), R20 // R20 = x[i]
418 MULLD R9, R20, R24 // R24 = z0[i]
419 MULHDU R9, R20, R25 // R25 = z1[i]
420 ADD $-1, R11 // R11 = remaining elements - 1
421 ADDC R4, R24 // R24 = z0[i] + c
422 ADDZE R25 // R25 = z1[i] + CA
423 MOVDU R24, 8(R10) // z[i]
424 CMP R0, R11
425 MOVD R25, R4 // R4 = c
426 BEQ done // If R11 = 0, we are done
427
428 MOVDU 8(R8), R20 // second leftover element: R20 = x[i]
429 MULLD R9, R20, R24 // R24 = z0[i]
430 MULHDU R9, R20, R25 // R25 = z1[i]
431 ADD $-1, R11 // R11 = remaining elements - 1
432 ADDC R4, R24 // R24 = z0[i] + c
433 ADDZE R25 // R25 = z1[i] + CA
434 MOVDU R24, 8(R10) // z[i]
435 CMP R0, R11
436 MOVD R25, R4 // R4 = c
437 BEQ done // If R11 = 0, we are done
438
439 MOVD 8(R8), R20 // third and last possible leftover element: R20 = x[i]
440 MULLD R9, R20, R24 // R24 = z0[i]
441 MULHDU R9, R20, R25 // R25 = z1[i]
442 ADD $-1, R11 // R11 is not read again after this point
443 ADDC R4, R24 // R24 = z0[i] + c
444 ADDZE R25 // R25 = z1[i] + CA
445 MOVD R24, 8(R10) // z[i]
446 MOVD R25, R4 // R4 = c
447
448 done:
449 MOVD R4, c+64(FP) // return c
450 RET
451
452 // func addMulVVW(z, x []Word, y Word) (c Word)
453 TEXT ·addMulVVW(SB), NOSPLIT, $0 // z += x*y, one word at a time; c = final high-order carry word
454 MOVD z+0(FP), R10 // R10 = z[]
455 MOVD x+24(FP), R8 // R8 = x[]
456 MOVD y+48(FP), R9 // R9 = y
457 MOVD z_len+8(FP), R22 // R22 = z_len
458
459 MOVD R0, R3 // R3 will be the index register
460 CMP R0, R22 // z_len == 0? (R0 is used as the constant zero)
461 MOVD R0, R4 // R4 = c = 0
462 MOVD R22, CTR // Initialize loop counter
463 BEQ done // If z_len is zero, return c = 0
464
465 loop:
466 MOVD (R8)(R3), R20 // Load x[i]
467 MOVD (R10)(R3), R21 // Load z[i]
468 MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
469 MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
470 ADDC R21, R6 // R6 = z0 = Low-order(x[i]*y) + z[i], set CA
471 ADDZE R7 // R7 = z1 = High-order(x[i]*y) + CA
472 ADDC R4, R6 // R6 = z0 + c, set CA
473 ADDZE R7, R4 // c = z1 + CA (carry word for the next element)
474 MOVD R6, (R10)(R3) // Store z[i]
475 ADD $8, R3 // advance the byte offset by one 8-byte word
476 BC 16, 0, loop // bdnz: decrement CTR and branch while CTR != 0
477
478 done:
479 MOVD R4, c+56(FP) // return c
480 RET
481
482 // func divWW(x1, x0, y Word) (q, r Word)
483 TEXT ·divWW(SB), NOSPLIT, $0 // q, r = quotient and remainder of the double word x1:x0 divided by y
484 MOVD x1+0(FP), R4 // R4 = x1 (high word of the dividend)
485 MOVD x0+8(FP), R5 // R5 = x0 (low word of the dividend)
486 MOVD y+16(FP), R6 // R6 = y (divisor)
487
488 CMPU R4, R6 // unsigned compare of x1 with y
489 BGE divbigger // x1 >= y: take the path that returns all ones for q and r
490
491 // from the programmer's note in ch. 3 of the ISA manual, p.74
492 DIVDEU R6, R4, R3 // R3 = q1 = (x1 << 64) / y (divide extended)
493 DIVDU R6, R5, R7 // R7 = q2 = x0 / y
494 MULLD R6, R3, R8 // R8 = low word of q1*y
495 MULLD R6, R7, R20 // R20 = q2*y
496 SUB R20, R5, R10 // R10 = r2 = x0 - q2*y
497 ADD R7, R3, R3 // R3 = q = q1 + q2
498 SUB R8, R10, R4 // R4 = r = r2 - low word of q1*y
499 CMPU R4, R10 // r < r2 (unsigned) means the remainder sum wrapped around
500 BLT adjust
501 CMPU R4, R6 // r < y: the remainder is already in range
502 BLT end
503
504 adjust:
505 MOVD $1, R21
506 ADD R21, R3, R3 // q = q + 1
507 SUB R6, R4, R4 // r = r - y
508
509 end:
510 MOVD R3, q+24(FP) // return q
511 MOVD R4, r+32(FP) // return r
512
513 RET
514
515 divbigger:
516 MOVD $-1, R7 // R7 = all ones
517 MOVD R7, q+24(FP) // q = all ones
518 MOVD R7, r+32(FP) // r = all ones
519 RET
520
521 TEXT ·divWVW(SB), NOSPLIT, $0 // no assembly implementation in this file
522 BR ·divWVW_g(SB) // tail-branch to divWVW_g, defined elsewhere (presumably the generic Go version in arith.go — confirm)
View as plain text