Text file src/pkg/math/big/arith_arm64.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build !math_big_pure_go
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
12 // TODO: Consider re-implementing using Advanced SIMD
13 // once the assembler supports those instructions.
14
15 // func mulWW(x, y Word) (z1, z0 Word)
// mulWW multiplies the two 64-bit words x and y as unsigned values and
// returns the 128-bit product split in two: z1 is the high word (UMULH
// result) and z0 is the low word (MUL result).
16 TEXT ·mulWW(SB),NOSPLIT,$0
17 MOVD x+0(FP), R0 // R0 = x
18 MOVD y+8(FP), R1 // R1 = y
19 MUL R0, R1, R2 // R2 = low 64 bits of x*y
20 UMULH R0, R1, R3 // R3 = high 64 bits of the unsigned product x*y
21 MOVD R3, z1+16(FP) // z1 = high word
22 MOVD R2, z0+24(FP) // z0 = low word
23 RET
24
25
26 // func divWW(x1, x0, y Word) (q, r Word)
// divWW has no assembly body here: it branches straight to divWW_g, which is
// not defined in this file (presumably the Go version in arith.go - confirm).
// The branch is a tail jump, so divWW_g sees the same argument frame and
// returns directly to divWW's caller.
27 TEXT ·divWW(SB),NOSPLIT,$0
28 B ·divWW_g(SB) // ARM64 has no multiword division
29
30
31 // func addVV(z, x, y []Word) (c Word)
// addVV stores x[i] + y[i] (plus the carry from the previous word) into z[i]
// for i in [0, len(z)) and returns the final carry, 0 or 1, in c.
// Only z_len is read; the lengths of x and y are not checked here.
// The first len(z) mod 4 words are handled up front (one word if bit 0 of the
// length is set, then two words if bit 1 is set) so that the main loop always
// processes exactly four words per iteration.
// The running carry lives in the C flag for the whole function, so every
// instruction between the ADCS steps must leave the flags untouched.
32 TEXT ·addVV(SB),NOSPLIT,$0
33 MOVD z_len+8(FP), R0 // R0 = number of words still to process
34 MOVD x+24(FP), R8 // R8 = read pointer into x
35 MOVD y+48(FP), R9 // R9 = read pointer into y
36 MOVD z+0(FP), R10 // R10 = write pointer into z
37 ADDS $0, R0 // clear carry flag (adding 0 can never carry out)
38 TBZ $0, R0, two // length even: skip the single-word step
39 MOVD.P 8(R8), R11 // R11 = *x, then x pointer += 8
40 MOVD.P 8(R9), R15 // R15 = *y, then y pointer += 8
41 ADCS R15, R11 // R11 = R11 + R15 + C, updates C
42 MOVD.P R11, 8(R10) // *z = R11, then z pointer += 8
43 SUB $1, R0 // plain SUB: does not disturb C
44 two:
45 TBZ $1, R0, loop // bit 1 clear: skip the two-word step
46 LDP.P 16(R8), (R11, R12) // next two words of x, pointer += 16
47 LDP.P 16(R9), (R15, R16) // next two words of y, pointer += 16
48 ADCS R15, R11
49 ADCS R16, R12
50 STP.P (R11, R12), 16(R10)
51 SUB $2, R0
52 loop:
53 CBZ R0, done // careful not to touch the carry flag
54 LDP.P 32(R8), (R11, R12) // x words 0,1 of this group; pointer += 32
55 LDP -16(R8), (R13, R14) // x words 2,3 (relative to the advanced pointer)
56 LDP.P 32(R9), (R15, R16) // y words 0,1 of this group; pointer += 32
57 LDP -16(R9), (R17, R19) // y words 2,3
58 ADCS R15, R11 // four-word carry chain, lowest word first
59 ADCS R16, R12
60 ADCS R17, R13
61 ADCS R19, R14
62 STP.P (R11, R12), 32(R10) // z words 0,1; pointer += 32
63 STP (R13, R14), -16(R10) // z words 2,3
64 SUB $4, R0
65 B loop
66 done:
67 CSET HS, R0 // extract carry flag: R0 = 1 if C is set, else 0
68 MOVD R0, c+72(FP)
69 RET
70
71
72 // func subVV(z, x, y []Word) (c Word)
// subVV stores x[i] - y[i] (minus the borrow from the previous word) into
// z[i] for i in [0, len(z)) and returns the final borrow, 0 or 1, in c.
// Only z_len is read; the lengths of x and y are not checked here.
// The structure mirrors addVV: the first len(z) mod 4 words are peeled off in
// a one-word and a two-word step, then the loop handles four words at a time.
// On ARM64 the C flag is the inverse of a borrow, so the chain starts with
// C = 1 ("no borrow") and the result is read back with the LO (C clear)
// condition.
73 TEXT ·subVV(SB),NOSPLIT,$0
74 MOVD z_len+8(FP), R0 // R0 = number of words still to process
75 MOVD x+24(FP), R8 // R8 = read pointer into x (minuend)
76 MOVD y+48(FP), R9 // R9 = read pointer into y (subtrahend)
77 MOVD z+0(FP), R10 // R10 = write pointer into z
78 CMP R0, R0 // set carry flag (R0 - R0 never borrows, so C = 1)
79 TBZ $0, R0, two // length even: skip the single-word step
80 MOVD.P 8(R8), R11 // R11 = *x, then x pointer += 8
81 MOVD.P 8(R9), R15 // R15 = *y, then y pointer += 8
82 SBCS R15, R11 // R11 = R11 - R15 - borrow, updates C
83 MOVD.P R11, 8(R10) // *z = R11, then z pointer += 8
84 SUB $1, R0 // plain SUB: does not disturb C
85 two:
86 TBZ $1, R0, loop // bit 1 clear: skip the two-word step
87 LDP.P 16(R8), (R11, R12) // next two words of x, pointer += 16
88 LDP.P 16(R9), (R15, R16) // next two words of y, pointer += 16
89 SBCS R15, R11
90 SBCS R16, R12
91 STP.P (R11, R12), 16(R10)
92 SUB $2, R0
93 loop:
94 CBZ R0, done // careful not to touch the carry flag
95 LDP.P 32(R8), (R11, R12) // x words 0,1 of this group; pointer += 32
96 LDP -16(R8), (R13, R14) // x words 2,3 (relative to the advanced pointer)
97 LDP.P 32(R9), (R15, R16) // y words 0,1 of this group; pointer += 32
98 LDP -16(R9), (R17, R19) // y words 2,3
99 SBCS R15, R11 // four-word borrow chain, lowest word first
100 SBCS R16, R12
101 SBCS R17, R13
102 SBCS R19, R14
103 STP.P (R11, R12), 32(R10) // z words 0,1; pointer += 32
104 STP (R13, R14), -16(R10) // z words 2,3
105 SUB $4, R0
106 B loop
107 done:
108 CSET LO, R0 // extract carry flag: R0 = 1 if C is clear (a borrow occurred), else 0
109 MOVD R0, c+72(FP)
110 RET
111
112
113 // func addVW(z, x []Word, y Word) (c Word)
// addVW adds the single word y to the multi-word value x and stores the
// result in z: z[0] = x[0] + y, and each following z[i] = x[i] + carry.
// It returns the final carry (0 or 1) in c.
// When len(z) == 0 nothing is read or written and c is y itself.
// After the first word, the remaining len(z)-1 words are split into a
// one-word step, a two-word step and a four-words-per-iteration loop. The
// carry stays in the C flag throughout, so the bookkeeping between ADCS steps
// uses only instructions that leave the flags alone.
114 TEXT ·addVW(SB),NOSPLIT,$0
115 MOVD z+0(FP), R3 // R3 = write pointer into z
116 MOVD z_len+8(FP), R0 // R0 = number of words still to process
117 MOVD x+24(FP), R1 // R1 = read pointer into x
118 MOVD y+48(FP), R2 // R2 = y; reused below to hold the returned carry
119 CBZ R0, len0 // the length of z is 0: return y unchanged as c
120 MOVD.P 8(R1), R4 // R4 = x[0], then x pointer += 8
121 ADDS R2, R4 // z[0] = x[0] + y, set carry
122 MOVD.P R4, 8(R3)
123 SUB $1, R0 // plain SUB: does not disturb C
124 CBZ R0, len1 // the length of z is 1
125 TBZ $0, R0, two // remaining count even: skip the single-word step
126 MOVD.P 8(R1), R4 // do it once
127 ADCS $0, R4 // R4 = R4 + 0 + C, updates C
128 MOVD.P R4, 8(R3)
129 SUB $1, R0
130 two: // do it twice
131 TBZ $1, R0, loop // bit 1 clear: skip the two-word step
132 LDP.P 16(R1), (R4, R5)
133 ADCS $0, R4, R8 // c, z[i] = x[i] + c
134 ADCS $0, R5, R9
135 STP.P (R8, R9), 16(R3)
136 SUB $2, R0
137 loop: // do four times per round
138 CBZ R0, len1 // careful not to touch the carry flag
139 LDP.P 32(R1), (R4, R5) // x words 0,1 of this group; pointer += 32
140 LDP -16(R1), (R6, R7) // x words 2,3 (relative to the advanced pointer)
141 ADCS $0, R4, R8 // propagate the carry through four words
142 ADCS $0, R5, R9
143 ADCS $0, R6, R10
144 ADCS $0, R7, R11
145 STP.P (R8, R9), 32(R3) // z words 0,1; pointer += 32
146 STP (R10, R11), -16(R3) // z words 2,3
147 SUB $4, R0
148 B loop
149 len1:
150 CSET HS, R2 // extract carry flag: R2 = 1 if C is set, else 0
151 len0:
152 MOVD R2, c+56(FP) // c = carry, or y when len(z) == 0
153 RET
154
155 // func subVW(z, x []Word, y Word) (c Word)
// subVW subtracts the single word y from the multi-word value x and stores
// the result in z: z[0] = x[0] - y, and each following z[i] = x[i] - borrow.
// It returns the final borrow (0 or 1) in c.
// When len(z) == 0 nothing is read or written and c is y itself.
// The layout mirrors addVW. On ARM64 the C flag is the inverse of a borrow,
// which is why the result is extracted with the LO (C clear) condition.
156 TEXT ·subVW(SB),NOSPLIT,$0
157 MOVD z+0(FP), R3 // R3 = write pointer into z
158 MOVD z_len+8(FP), R0 // R0 = number of words still to process
159 MOVD x+24(FP), R1 // R1 = read pointer into x
160 MOVD y+48(FP), R2 // R2 = y; reused below to hold the returned borrow
161 CBZ R0, len0 // the length of z is 0: return y unchanged as c
162 MOVD.P 8(R1), R4 // R4 = x[0], then x pointer += 8
163 SUBS R2, R4 // z[0] = x[0] - y, set carry
164 MOVD.P R4, 8(R3)
165 SUB $1, R0 // plain SUB: does not disturb C
166 CBZ R0, len1 // the length of z is 1
167 TBZ $0, R0, two // do it once (skipped when the remaining count is even)
168 MOVD.P 8(R1), R4
169 SBCS $0, R4 // R4 = R4 - 0 - borrow, updates C
170 MOVD.P R4, 8(R3)
171 SUB $1, R0
172 two: // do it twice
173 TBZ $1, R0, loop // bit 1 clear: skip the two-word step
174 LDP.P 16(R1), (R4, R5)
175 SBCS $0, R4, R8 // c, z[i] = x[i] - c
176 SBCS $0, R5, R9
177 STP.P (R8, R9), 16(R3)
178 SUB $2, R0
179 loop: // do four times per round
180 CBZ R0, len1 // careful not to touch the carry flag
181 LDP.P 32(R1), (R4, R5) // x words 0,1 of this group; pointer += 32
182 LDP -16(R1), (R6, R7) // x words 2,3 (relative to the advanced pointer)
183 SBCS $0, R4, R8 // propagate the borrow through four words
184 SBCS $0, R5, R9
185 SBCS $0, R6, R10
186 SBCS $0, R7, R11
187 STP.P (R8, R9), 32(R3) // z words 0,1; pointer += 32
188 STP (R10, R11), -16(R3) // z words 2,3
189 SUB $4, R0
190 B loop
191 len1:
192 CSET LO, R2 // extract carry flag: R2 = 1 if C is clear (a borrow occurred), else 0
193 len0:
194 MOVD R2, c+56(FP) // c = borrow, or y when len(z) == 0
195 RET
196
197 // func shlVU(z, x []Word, s uint) (c Word)
198 // This implementation handles the shift operation from the high word to the low word,
199 // which may be an error for the case where the low word of x overlaps with the high
200 // word of z. When calling this function directly, you need to pay attention to this
201 // situation.
// shlVU shifts the len(z)-word value x left by s bits into z and returns in c
// the bits shifted out of the top word, i.e. x[n-1] >> (64 - s).
// Special cases handled explicitly: len(z) == 0 returns c = 0 without touching
// memory; s == 0 degenerates to a high-to-low copy of x into z (skipped when
// both slices end at the same address) and also returns c = 0.
// NOTE(review): the shift path computes 64 - s and uses register shifts, so it
// presumably relies on callers passing s < 64 - confirm against the callers.
// Register roles on the shift path:
//   R0/R2 = write/read pointers, both walking downwards from &z[n] / &x[n]
//   R3 = s, R4 = 64 - s, R5 = return value
//   R8 = (last loaded word) << s, the not-yet-stored upper part of the next
//        lower output word
202 TEXT ·shlVU(SB),NOSPLIT,$0
203 LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z)
204 MOVD x+24(FP), R2 // R2 = x.ptr
205 MOVD s+48(FP), R3 // R3 = s
206 ADD R1<<3, R0 // R0 = &z[n]
207 ADD R1<<3, R2 // R2 = &x[n]
208 CBZ R1, len0 // nothing to do for an empty z
209 CBZ R3, copy // if the number of shift is 0, just copy x to z
210 MOVD $64, R4
211 SUB R3, R4 // R4 = 64 - s
212 // handling the most significant element x[n-1]
213 MOVD.W -8(R2), R6 // pre-decrement: R2 = &x[n-1], R6 = x[n-1]
214 LSR R4, R6, R5 // return value
215 LSL R3, R6, R8 // x[i] << s
216 SUB $1, R1 // R1 = number of x words still to load
217 one: TBZ $0, R1, two
218 MOVD.W -8(R2), R6 // R6 = next lower word of x
219 LSR R4, R6, R7 // bits of R6 that spill into the word above
220 ORR R8, R7 // R7 = finished output word
221 LSL R3, R6, R8 // new pending upper part
222 SUB $1, R1
223 MOVD.W R7, -8(R0) // pre-decrement store into z
224 two:
225 TBZ $1, R1, loop // bit 1 clear: skip the two-word step
226 LDP.W -16(R2), (R6, R7) // R2 -= 16; R6 = lower word, R7 = higher word
227 LSR R4, R7, R10
228 ORR R8, R10 // R10 = finished higher output word
229 LSL R3, R7
230 LSR R4, R6, R9
231 ORR R7, R9 // R9 = finished lower output word
232 LSL R3, R6, R8 // new pending upper part
233 SUB $2, R1
234 STP.W (R9, R10), -16(R0)
235 loop:
236 CBZ R1, done
237 LDP.W -32(R2), (R10, R11) // R2 -= 32; two lowest words of this group of four
238 LDP 16(R2), (R12, R13) // two highest words of the group
239 LSR R4, R13, R23
240 ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
241 LSL R3, R13
242 LSR R4, R12, R22
243 ORR R13, R22
244 LSL R3, R12
245 LSR R4, R11, R21
246 ORR R12, R21
247 LSL R3, R11
248 LSR R4, R10, R20
249 ORR R11, R20
250 LSL R3, R10, R8 // pending upper part carried into the next iteration
251 STP.W (R20, R21), -32(R0) // R0 -= 32; two lowest output words of the group
252 STP (R22, R23), 16(R0) // two highest output words of the group
253 SUB $4, R1
254 B loop
255 done:
256 MOVD.W R8, -8(R0) // the first element x[0]
257 MOVD R5, c+56(FP) // the part moved out from x[n-1]
258 RET
259 copy:
260 CMP R0, R2 // same end address (and same length): z already equals x
261 BEQ len0
262 TBZ $0, R1, ctwo // copy one word if the length is odd
263 MOVD.W -8(R2), R4
264 MOVD.W R4, -8(R0)
265 SUB $1, R1
266 ctwo:
267 TBZ $1, R1, cloop // copy two words if bit 1 of the length is set
268 LDP.W -16(R2), (R4, R5)
269 STP.W (R4, R5), -16(R0)
270 SUB $2, R1
271 cloop:
272 CBZ R1, len0 // copy the rest four words at a time, high to low
273 LDP.W -32(R2), (R4, R5)
274 LDP 16(R2), (R6, R7)
275 STP.W (R4, R5), -32(R0)
276 STP (R6, R7), 16(R0)
277 SUB $4, R1
278 B cloop
279 len0:
280 MOVD $0, c+56(FP) // no bits shifted out
281 RET
282
283 // func shrVU(z, x []Word, s uint) (c Word)
284 // This implementation handles the shift operation from the low word to the high word,
285 // which may be an error for the case where the high word of x overlaps with the low
286 // word of z. When calling this function directly, you need to pay attention to this
287 // situation.
// shrVU shifts the len(z)-word value x right by s bits into z and returns in c
// the bits shifted out of the bottom word, i.e. x[0] << (64 - s).
// Special cases handled explicitly: len(z) == 0 returns c = 0 without touching
// memory; s == 0 degenerates to a low-to-high copy of x into z (skipped when
// both slices start at the same address) and also returns c = 0.
// NOTE(review): the shift path computes 64 - s and uses register shifts, so it
// presumably relies on callers passing s < 64 - confirm against the callers.
// Register roles on the shift path:
//   R0/R2 = write/read pointers, both walking upwards; the read pointer runs
//           one word ahead because z[i] needs bits from x[i+1]
//   R3 = s, R4 = 64 - s
//   R8 = (last loaded word) >> s, the not-yet-stored lower part of the next
//        output word
288 TEXT ·shrVU(SB),NOSPLIT,$0
289 MOVD z+0(FP), R0 // R0 = write pointer into z
290 MOVD z_len+8(FP), R1 // R1 = len(z)
291 MOVD x+24(FP), R2 // R2 = read pointer into x
292 MOVD s+48(FP), R3 // R3 = s
293 MOVD $0, R8 // R8 is overwritten from x[0] below before it is first read
294 MOVD $64, R4
295 SUB R3, R4 // R4 = 64 - s
296 CBZ R1, len0 // nothing to do for an empty z
297 CBZ R3, copy // if the number of shift is 0, just copy x to z
298
299 MOVD.P 8(R2), R20 // R20 = x[0], then x pointer += 8
300 LSR R3, R20, R8 // pending lower part of z[0]
301 LSL R4, R20 // bits shifted out of x[0]
302 MOVD R20, c+56(FP) // deal with the first element
303 SUB $1, R1 // R1 = number of x words still to load
304
305 TBZ $0, R1, two // remaining count even: skip the single-word step
306 MOVD.P 8(R2), R6 // R6 = next higher word of x
307 LSL R4, R6, R20 // bits of R6 that drop into the word below
308 ORR R8, R20 // R20 = finished output word
309 LSR R3, R6, R8 // new pending lower part
310 MOVD.P R20, 8(R0)
311 SUB $1, R1
312 two:
313 TBZ $1, R1, loop // bit 1 clear: skip the two-word step
314 LDP.P 16(R2), (R6, R7) // R6 = lower word, R7 = higher word; pointer += 16
315 LSL R4, R6, R20
316 LSR R3, R6
317 ORR R8, R20 // R20 = finished lower output word
318 LSL R4, R7, R21
319 LSR R3, R7, R8 // new pending lower part
320 ORR R6, R21 // R21 = finished higher output word
321 STP.P (R20, R21), 16(R0)
322 SUB $2, R1
323 loop:
324 CBZ R1, done
325 LDP.P 32(R2), (R10, R11) // two lowest words of this group of four; pointer += 32
326 LDP -16(R2), (R12, R13) // two highest words of the group
327 LSL R4, R10, R20
328 LSR R3, R10
329 ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
330 LSL R4, R11, R21
331 LSR R3, R11
332 ORR R10, R21
333 LSL R4, R12, R22
334 LSR R3, R12
335 ORR R11, R22
336 LSL R4, R13, R23
337 LSR R3, R13, R8 // pending lower part carried into the next iteration
338 ORR R12, R23
339 STP.P (R20, R21), 32(R0) // two lowest output words of the group; pointer += 32
340 STP (R22, R23), -16(R0) // two highest output words of the group
341 SUB $4, R1
342 B loop
343 done:
344 MOVD R8, (R0) // deal with the last element
345 RET
346 copy:
347 CMP R0, R2 // same start address: z already equals x
348 BEQ len0
349 TBZ $0, R1, ctwo // copy one word if the length is odd
350 MOVD.P 8(R2), R3
351 MOVD.P R3, 8(R0)
352 SUB $1, R1
353 ctwo:
354 TBZ $1, R1, cloop // copy two words if bit 1 of the length is set
355 LDP.P 16(R2), (R4, R5)
356 STP.P (R4, R5), 16(R0)
357 SUB $2, R1
358 cloop:
359 CBZ R1, len0 // copy the rest four words at a time, low to high
360 LDP.P 32(R2), (R4, R5)
361 LDP -16(R2), (R6, R7)
362 STP.P (R4, R5), 32(R0)
363 STP (R6, R7), -16(R0)
364 SUB $4, R1
365 B cloop
366 len0:
367 MOVD $0, c+56(FP) // no bits shifted out
368 RET
369
370
371 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
// mulAddVWW computes z = x*y + r over len(z) words: each x[i] is multiplied by
// the single word y, the carry word from the previous position (initially r)
// is added, the low word goes to z[i] and the high word becomes the next
// carry. The last carry word is returned in c; when len(z) == 0 that is r.
// Only z_len is read; the length of x is not checked here.
// The first len(z) mod 4 words are handled by a one-word and a two-word step,
// then the loop processes four words per iteration.
// Inside a step the C flag links the ADDS/ADCS/ADC chain; the MUL and UMULH
// instructions interleaved with it do not modify the flags.
372 TEXT ·mulAddVWW(SB),NOSPLIT,$0
373 MOVD z+0(FP), R1 // R1 = write pointer into z
374 MOVD z_len+8(FP), R0 // R0 = number of words still to process
375 MOVD x+24(FP), R2 // R2 = read pointer into x
376 MOVD y+48(FP), R3 // R3 = multiplier y
377 MOVD r+56(FP), R4 // R4 = running carry word, starts as r
378 // c, z = x * y + r
379 TBZ $0, R0, two // length even: skip the single-word step
380 MOVD.P 8(R2), R5 // R5 = x[i], then x pointer += 8
381 MUL R3, R5, R7 // R7 = low word of x[i]*y
382 UMULH R3, R5, R8 // R8 = high word of x[i]*y
383 ADDS R4, R7 // add the incoming carry word, set C
384 ADC $0, R8, R4 // c, z[i] = x[i] * y + r
385 MOVD.P R7, 8(R1)
386 SUB $1, R0
387 two:
388 TBZ $1, R0, loop // bit 1 clear: skip the two-word step
389 LDP.P 16(R2), (R5, R6) // next two words of x, pointer += 16
390 MUL R3, R5, R10 // low word of the first product
391 UMULH R3, R5, R11 // high word of the first product
392 ADDS R4, R10 // first output word = low + incoming carry
393 MUL R3, R6, R12 // low word of the second product
394 UMULH R3, R6, R13 // high word of the second product
395 ADCS R12, R11 // second output word = high(first) + low(second) + C
396 ADC $0, R13, R4 // outgoing carry = high(second) + C
397
398 STP.P (R10, R11), 16(R1)
399 SUB $2, R0
400 loop:
401 CBZ R0, done
402 LDP.P 32(R2), (R5, R6) // x words 0,1 of this group; pointer += 32
403 LDP -16(R2), (R7, R8) // x words 2,3 (relative to the advanced pointer)
404
405 MUL R3, R5, R10
406 UMULH R3, R5, R11
407 ADDS R4, R10 // output word 0 = low(0) + incoming carry
408 MUL R3, R6, R12
409 UMULH R3, R6, R13
410 ADCS R11, R12 // output word 1 = low(1) + high(0) + C
411
412 MUL R3, R7, R14
413 UMULH R3, R7, R15
414 ADCS R13, R14 // output word 2 = low(2) + high(1) + C
415 MUL R3, R8, R16
416 UMULH R3, R8, R17
417 ADCS R15, R16 // output word 3 = low(3) + high(2) + C
418 ADC $0, R17, R4 // outgoing carry = high(3) + C
419
420 STP.P (R10, R12), 32(R1) // z words 0,1; pointer += 32
421 STP (R14, R16), -16(R1) // z words 2,3
422 SUB $4, R0
423 B loop
424 done:
425 MOVD R4, c+64(FP) // c = final carry word
426 RET
427
428
429 // func addMulVVW(z, x []Word, y Word) (c Word)
// addMulVVW computes z += x*y over len(z) words: for each position the
// product x[i]*y, the existing z[i] and the carry word from the previous
// position (initially 0) are summed; the low word is written back to z[i] and
// the overflow becomes the next carry. The last carry word is returned in c.
// Only z_len is read; the length of x is not checked here.
// The first len(z) mod 4 words are handled by a one-word and a two-word step,
// then the loop processes four words per iteration.
// The two- and four-word steps run two separate flag chains over the same z
// words (one adding the low product halves, one adding the high halves) and
// collect both chains' final carries into the outgoing carry word. MUL, UMULH,
// LDP and STP are interleaved freely because they leave the flags unchanged.
430 TEXT ·addMulVVW(SB),NOSPLIT,$0
431 MOVD z+0(FP), R1 // R1 = read/write pointer into z
432 MOVD z_len+8(FP), R0 // R0 = number of words still to process
433 MOVD x+24(FP), R2 // R2 = read pointer into x
434 MOVD y+48(FP), R3 // R3 = multiplier y
435 MOVD $0, R4 // R4 = running carry word, starts at 0
436
437 TBZ $0, R0, two // length even: skip the single-word step
438
439 MOVD.P 8(R2), R5 // R5 = x[i], then x pointer += 8
440 MOVD (R1), R6 // R6 = z[i] (pointer not advanced until the store)
441
442 MUL R5, R3, R7 // R7 = low word of x[i]*y
443 UMULH R5, R3, R8 // R8 = high word of x[i]*y
444
445 ADDS R7, R6 // z[i] += low word, set C
446 ADC $0, R8, R4 // carry = high word + C
447
448 MOVD.P R6, 8(R1)
449 SUB $1, R0
450
451 two:
452 TBZ $1, R0, loop // bit 1 clear: skip the two-word step
453
454 LDP.P 16(R2), (R5, R10) // R5, R10 = next two words of x; pointer += 16
455 LDP (R1), (R6, R11) // R6, R11 = the matching two words of z
456
457 MUL R10, R3, R13 // low word of the second product
458 UMULH R10, R3, R12 // high word of the second product
459
460 MUL R5, R3, R7 // low word of the first product
461 UMULH R5, R3, R8 // high word of the first product
462
463 ADDS R4, R6 // first chain: add the incoming carry ...
464 ADCS R13, R11 // ... and low(second) to the upper z word
465 ADC $0, R12 // fold that chain's carry into high(second)
466
467 ADDS R7, R6 // second chain: add low(first) ...
468 ADCS R8, R11 // ... and high(first) to the upper z word
469 ADC $0, R12, R4 // outgoing carry = high(second) + both chain carries
470
471 STP.P (R6, R11), 16(R1)
472 SUB $2, R0
473
474 // The main loop of this code operates on a block of 4 words every iteration
475 // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
476 // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
477 // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
478 loop:
479 CBZ R0, done
480
481 LDP.P 16(R2), (R5, R6) // x words 0,1; pointer += 16
482 LDP.P 16(R2), (R7, R8) // x words 2,3; pointer += 16
483
484 LDP (R1), (R9, R10) // z words 0,1
485 ADDS R4, R9 // first chain starts: z0 += incoming carry
486 MUL R6, R3, R14
487 ADCS R14, R10 // z1 += low(x1*y) + C
488 MUL R7, R3, R15
489 LDP 16(R1), (R11, R12) // z words 2,3
490 ADCS R15, R11 // z2 += low(x2*y) + C
491 MUL R8, R3, R16
492 ADCS R16, R12 // z3 += low(x3*y) + C
493 UMULH R8, R3, R20
494 ADC $0, R20 // R20 = high(x3*y) + first chain's carry
495
496 MUL R5, R3, R13
497 ADDS R13, R9 // second chain starts: z0 += low(x0*y)
498 UMULH R5, R3, R17
499 ADCS R17, R10 // z1 += high(x0*y) + C
500 UMULH R6, R3, R21
501 STP.P (R9, R10), 16(R1) // z words 0,1 are final; the store leaves C intact
502 ADCS R21, R11 // z2 += high(x1*y) + C
503 UMULH R7, R3, R19
504 ADCS R19, R12 // z3 += high(x2*y) + C
505 STP.P (R11, R12), 16(R1) // z words 2,3 are final
506 ADC $0, R20, R4 // outgoing carry = R20 + second chain's carry
507
508 SUB $4, R0
509 B loop
510
511 done:
512 MOVD R4, c+56(FP) // c = final carry word
513 RET
514
515 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// divWVW has no assembly body here: it branches straight to divWVW_g, which is
// not defined in this file (presumably the Go version in arith.go - confirm).
// The branch is a tail jump, so divWVW_g sees the same argument frame and
// returns directly to divWVW's caller.
516 TEXT ·divWVW(SB),NOSPLIT,$0
517 B ·divWVW_g(SB) // tail-branch to the non-assembly implementation
View as plain text