Text file src/math/big/arith_amd64.s
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build !math_big_pure_go
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
// func mulWW(x, y Word) (z1, z0 Word)
//
// Full-width unsigned multiply of two words. The double-word product
// x*y is returned split into its high word z1 and its low word z0.
TEXT ·mulWW(SB),NOSPLIT,$0
	MOVQ	x+0(FP), AX	// MULQ takes its first factor from AX
	MULQ	y+8(FP)		// DX:AX = AX * y
	MOVQ	AX, z0+24(FP)	// low half of the product
	MOVQ	DX, z1+16(FP)	// high half of the product
	RET
19
20
// func divWW(x1, x0, y Word) (q, r Word)
//
// Divides the double-word value x1:x0 by the single word y and returns
// the quotient q and remainder r.
//
// No operand checking is done here. DIVQ raises a divide error when y is
// zero or when the quotient does not fit in one word.
// NOTE(review): callers presumably guarantee x1 < y — confirm.
TEXT ·divWW(SB),NOSPLIT,$0
	MOVQ	x0+8(FP), AX	// low word of the dividend
	MOVQ	x1+0(FP), DX	// high word of the dividend
	DIVQ	y+16(FP)	// AX = DX:AX / y, DX = DX:AX % y
	MOVQ	DX, r+32(FP)
	MOVQ	AX, q+24(FP)
	RET
29
30 // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
31 // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
32 // This is faster than using rotate instructions.
33
// func addVV(z, x, y []Word) (c Word)
//
// Word-wise vector addition with carry propagation:
//	z[i] = x[i] + y[i] + carry,  i = 0 .. len(z)-1
// The carry out of the last word is returned in c as 0 or 1.
// Only len(z) is read; x and y are indexed with the same i as z.
//
// Registers:
//	R10 = z base, R8 = x base, R9 = y base
//	SI  = i, the current word index
//	DI  = n, words still to do (kept 4 too low while the unrolled loop runs)
//	CX  = carry parked as 0 or -1 between iterations:
//	      SBBQ CX, CX captures CF, ADDQ CX, CX puts it back into CF
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), R9
	MOVQ	z_len+8(FP), DI

	MOVQ	$0, SI			// i = 0
	MOVQ	$0, CX			// carry = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ	$4, DI			// n -= 4
	JL	addvv_rest		// n < 0: fewer than 4 words in total

addvv_by4:	// n >= 0: at least 4 words remain
	ADDQ	CX, CX			// CF = parked carry
	MOVQ	0(R8)(SI*8), R11	// x[i]   .. x[i+3]
	MOVQ	8(R8)(SI*8), R12
	MOVQ	16(R8)(SI*8), R13
	MOVQ	24(R8)(SI*8), R14
	ADCQ	0(R9)(SI*8), R11	// += y[i] .. y[i+3], chained through CF
	ADCQ	8(R9)(SI*8), R12
	ADCQ	16(R9)(SI*8), R13
	ADCQ	24(R9)(SI*8), R14
	MOVQ	R11, 0(R10)(SI*8)	// z[i]   .. z[i+3]
	MOVQ	R12, 8(R10)(SI*8)
	MOVQ	R13, 16(R10)(SI*8)
	MOVQ	R14, 24(R10)(SI*8)
	SBBQ	CX, CX			// park CF as 0 / -1

	ADDQ	$4, SI			// i += 4
	SUBQ	$4, DI			// n -= 4
	JGE	addvv_by4		// another full group of 4 is available

addvv_rest:
	ADDQ	$4, DI			// undo the bias: n = words actually left
	JLE	addvv_out		// nothing left

addvv_by1:	// n > 0
	ADDQ	CX, CX			// CF = parked carry
	MOVQ	0(R8)(SI*8), R11
	ADCQ	0(R9)(SI*8), R11
	MOVQ	R11, 0(R10)(SI*8)
	SBBQ	CX, CX			// park CF as 0 / -1

	ADDQ	$1, SI			// i++
	SUBQ	$1, DI			// n--
	JG	addvv_by1		// more words to process

addvv_out:
	NEGQ	CX			// 0 / -1  ->  0 / 1
	MOVQ	CX, c+72(FP)		// return c
	RET
86
87
// func subVV(z, x, y []Word) (c Word)
//
// Word-wise vector subtraction with borrow propagation:
//	z[i] = x[i] - y[i] - borrow,  i = 0 .. len(z)-1
// The borrow out of the last word is returned in c as 0 or 1.
// Only len(z) is read; x and y are indexed with the same i as z.
// Structurally identical to addVV, with SBBQ doing the arithmetic.
//
// Registers:
//	R10 = z base, R8 = x base, R9 = y base
//	SI  = i, the current word index
//	DI  = n, words still to do (kept 4 too low while the unrolled loop runs)
//	CX  = borrow parked as 0 or -1 between iterations:
//	      SBBQ CX, CX captures CF, ADDQ CX, CX puts it back into CF
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), R9
	MOVQ	z_len+8(FP), DI

	MOVQ	$0, SI			// i = 0
	MOVQ	$0, CX			// borrow = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ	$4, DI			// n -= 4
	JL	subvv_rest		// n < 0: fewer than 4 words in total

subvv_by4:	// n >= 0: at least 4 words remain
	ADDQ	CX, CX			// CF = parked borrow
	MOVQ	0(R8)(SI*8), R11	// x[i]   .. x[i+3]
	MOVQ	8(R8)(SI*8), R12
	MOVQ	16(R8)(SI*8), R13
	MOVQ	24(R8)(SI*8), R14
	SBBQ	0(R9)(SI*8), R11	// -= y[i] .. y[i+3], chained through CF
	SBBQ	8(R9)(SI*8), R12
	SBBQ	16(R9)(SI*8), R13
	SBBQ	24(R9)(SI*8), R14
	MOVQ	R11, 0(R10)(SI*8)	// z[i]   .. z[i+3]
	MOVQ	R12, 8(R10)(SI*8)
	MOVQ	R13, 16(R10)(SI*8)
	MOVQ	R14, 24(R10)(SI*8)
	SBBQ	CX, CX			// park CF as 0 / -1

	ADDQ	$4, SI			// i += 4
	SUBQ	$4, DI			// n -= 4
	JGE	subvv_by4		// another full group of 4 is available

subvv_rest:
	ADDQ	$4, DI			// undo the bias: n = words actually left
	JLE	subvv_out		// nothing left

subvv_by1:	// n > 0
	ADDQ	CX, CX			// CF = parked borrow
	MOVQ	0(R8)(SI*8), R11
	SBBQ	0(R9)(SI*8), R11
	MOVQ	R11, 0(R10)(SI*8)
	SBBQ	CX, CX			// park CF as 0 / -1

	ADDQ	$1, SI			// i++
	SUBQ	$1, DI			// n--
	JG	subvv_by1		// more words to process

subvv_out:
	NEGQ	CX			// 0 / -1  ->  0 / 1
	MOVQ	CX, c+72(FP)		// return c
	RET
141
142
// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x, writing len(z) result words to z
// and returning the final carry (0 or 1) in c. With len(z) == 0 nothing is
// written and y itself is returned.
//
// Vectors longer than 32 words are not handled here: control is transferred
// with a tail jump to ·addVWlarge, which is defined outside this file section.
//
// Registers:
//	R10 = z base, R8 = x base
//	SI  = i, the current word index
//	DI  = n, words still to do (kept 4 too low while the unrolled loop runs)
//	CX  = word to add into the next x[i]: y at first, afterwards the carry
//	      as a plain 0 or 1
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ	z_len+8(FP), DI
	CMPQ	DI, $32
	JG	addvw_long		// len(z) > 32: use the large-vector routine
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), CX		// c = y

	MOVQ	$0, SI			// i = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ	$4, DI			// n -= 4
	JL	addvw_rest		// n < 0: fewer than 4 words in total

addvw_by4:	// n >= 0: at least 4 words remain
	MOVQ	0(R8)(SI*8), R11	// x[i] .. x[i+3]
	MOVQ	8(R8)(SI*8), R12
	MOVQ	16(R8)(SI*8), R13
	MOVQ	24(R8)(SI*8), R14
	ADDQ	CX, R11			// add c to the lowest word ...
	ADCQ	$0, R12			// ... and ripple the carry upward
	ADCQ	$0, R13
	ADCQ	$0, R14
	SBBQ	CX, CX			// CX = -CF
	NEGQ	CX			// c = carry out of the group, 0 or 1
	MOVQ	R11, 0(R10)(SI*8)	// z[i] .. z[i+3]
	MOVQ	R12, 8(R10)(SI*8)
	MOVQ	R13, 16(R10)(SI*8)
	MOVQ	R14, 24(R10)(SI*8)

	ADDQ	$4, SI			// i += 4
	SUBQ	$4, DI			// n -= 4
	JGE	addvw_by4		// another full group of 4 is available

addvw_rest:
	ADDQ	$4, DI			// undo the bias: n = words actually left
	JLE	addvw_out		// nothing left

addvw_by1:	// n > 0
	ADDQ	0(R8)(SI*8), CX		// CX = x[i] + c
	MOVQ	CX, 0(R10)(SI*8)	// z[i] = sum
	SBBQ	CX, CX			// CX = -CF
	NEGQ	CX			// c = carry out, 0 or 1

	ADDQ	$1, SI			// i++
	SUBQ	$1, DI			// n--
	JG	addvw_by1		// more words to process

addvw_out:
	MOVQ	CX, c+56(FP)		// return c
	RET

addvw_long:
	JMP	·addVWlarge(SB)		// tail call; arguments stay in our frame
196
197
// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x, writing len(z) result words
// to z and returning the final borrow (0 or 1) in c. With len(z) == 0 nothing
// is written and y itself is returned.
// Structurally identical to addVW, with SUBQ/SBBQ doing the arithmetic.
//
// Vectors longer than 32 words are not handled here: control is transferred
// with a tail jump to ·subVWlarge, which is defined outside this file section.
//
// Registers:
//	R10 = z base, R8 = x base
//	SI  = i, the current word index
//	DI  = n, words still to do (kept 4 too low while the unrolled loop runs)
//	CX  = word to subtract from the next x[i]: y at first, afterwards the
//	      borrow as a plain 0 or 1
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ	z_len+8(FP), DI
	CMPQ	DI, $32
	JG	subvw_long		// len(z) > 32: use the large-vector routine
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), CX		// c = y

	MOVQ	$0, SI			// i = 0

	// Turn the JL below into a JMP to bypass the unrolled loop.
	SUBQ	$4, DI			// n -= 4
	JL	subvw_rest		// n < 0: fewer than 4 words in total

subvw_by4:	// n >= 0: at least 4 words remain
	MOVQ	0(R8)(SI*8), R11	// x[i] .. x[i+3]
	MOVQ	8(R8)(SI*8), R12
	MOVQ	16(R8)(SI*8), R13
	MOVQ	24(R8)(SI*8), R14
	SUBQ	CX, R11			// subtract c from the lowest word ...
	SBBQ	$0, R12			// ... and ripple the borrow upward
	SBBQ	$0, R13
	SBBQ	$0, R14
	SBBQ	CX, CX			// CX = -CF
	NEGQ	CX			// c = borrow out of the group, 0 or 1
	MOVQ	R11, 0(R10)(SI*8)	// z[i] .. z[i+3]
	MOVQ	R12, 8(R10)(SI*8)
	MOVQ	R13, 16(R10)(SI*8)
	MOVQ	R14, 24(R10)(SI*8)

	ADDQ	$4, SI			// i += 4
	SUBQ	$4, DI			// n -= 4
	JGE	subvw_by4		// another full group of 4 is available

subvw_rest:
	ADDQ	$4, DI			// undo the bias: n = words actually left
	JLE	subvw_out		// nothing left

subvw_by1:	// n > 0
	MOVQ	0(R8)(SI*8), R11
	SUBQ	CX, R11			// R11 = x[i] - c
	MOVQ	R11, 0(R10)(SI*8)	// z[i] = difference
	SBBQ	CX, CX			// CX = -CF
	NEGQ	CX			// c = borrow out, 0 or 1

	ADDQ	$1, SI			// i++
	SUBQ	$1, DI			// n--
	JG	subvw_by1		// more words to process

subvw_out:
	MOVQ	CX, c+56(FP)		// return c
	RET

subvw_long:
	JMP	·subVWlarge(SB)		// tail call; arguments stay in our frame
253
254
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x left by s bits into z, working from the
// most significant word down to word 0. The bits pushed out of the top word
// x[len(z)-1] are returned in c. For an empty z nothing is written and c = 0.
//
// The three-operand form "SHLQ CX, AX, DX" is a double shift: DX is shifted
// left by CX and its vacated low bits are filled from the top of AX.
// Below, ŝ stands for 64-s.
// NOTE(review): the hardware uses only the low 6 bits of CX as the count;
// s is presumably < 64 — confirm against callers.
//
// Registers:
//	R10 = z base, R8 = x base, CX = s
//	BX  = i, running from len(z)-1 down to 0
//	AX  = w1, the word below the one being produced
//	DX  = w, the word being produced
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ	z_len+8(FP), BX		// i = len(z)
	SUBQ	$1, BX			// i--
	JL	shl_empty		// i < 0: no words at all

	// at least one word
	MOVQ	s+48(FP), CX
	MOVQ	x+24(FP), R8
	MOVQ	z+0(FP), R10
	MOVQ	$0, DX
	MOVQ	(R8)(BX*8), AX		// w1 = x[n-1]
	SHLQ	CX, AX, DX		// DX = w1>>ŝ, the bits shifted out
	MOVQ	DX, c+56(FP)		// return value is final from here on

	CMPQ	BX, $0
	JLE	shl_last		// single word: only z[0] to write

shl_step:	// i > 0
	MOVQ	AX, DX			// w = w1
	MOVQ	-8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ	CX, AX, DX		// w<<s | w1>>ŝ
	MOVQ	DX, (R10)(BX*8)		// z[i] = w<<s | w1>>ŝ
	SUBQ	$1, BX			// i--
	JG	shl_step		// continue while i > 0

shl_last:	// i <= 0: lowest word has no neighbour below it
	SHLQ	CX, AX			// w1<<s
	MOVQ	AX, (R10)		// z[0] = w1<<s
	RET

shl_empty:
	MOVQ	$0, c+56(FP)
	RET
288
289
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the len(z)-word vector x right by s bits into z, working from word 0
// up to the most significant word. The bits pushed out of the bottom word
// x[0] are returned in c, left-aligned. For an empty z nothing is written
// and c = 0.
//
// The three-operand form "SHRQ CX, AX, DX" is a double shift: DX is shifted
// right by CX and its vacated high bits are filled from the bottom of AX.
// Below, ŝ stands for 64-s.
// NOTE(review): the hardware uses only the low 6 bits of CX as the count;
// s is presumably < 64 — confirm against callers.
//
// Registers:
//	R10 = z base, R8 = x base, CX = s
//	R11 = len(z)-1, index of the last word
//	BX  = i, running from 0 up to len(z)-1
//	AX  = w1, the word above the one being produced
//	DX  = w, the word being produced
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ	z_len+8(FP), R11
	SUBQ	$1, R11			// R11 = n-1
	JL	shr_empty		// n-1 < 0: no words at all

	// at least one word
	MOVQ	s+48(FP), CX
	MOVQ	x+24(FP), R8
	MOVQ	z+0(FP), R10
	MOVQ	$0, DX
	MOVQ	(R8), AX		// w1 = x[0]
	SHRQ	CX, AX, DX		// DX = w1<<ŝ, the bits shifted out
	MOVQ	DX, c+56(FP)		// return value is final from here on

	MOVQ	$0, BX			// i = 0
	JMP	shr_test

shr_step:	// i < n-1
	MOVQ	AX, DX			// w = w1
	MOVQ	8(R8)(BX*8), AX		// w1 = x[i+1]
	SHRQ	CX, AX, DX		// w>>s | w1<<ŝ
	MOVQ	DX, (R10)(BX*8)		// z[i] = w>>s | w1<<ŝ
	ADDQ	$1, BX			// i++

shr_test:
	CMPQ	BX, R11
	JL	shr_step		// continue while i < n-1

	// i >= n-1: highest word has no neighbour above it
	SHRQ	CX, AX			// w1>>s
	MOVQ	AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

shr_empty:
	MOVQ	$0, c+56(FP)
	RET
325
326
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the single word y and adds the word r:
//	c = r
//	for i = 0 .. len(z)-1:  c:z[i] = x[i]*y + c
// The final carry word c is returned. With len(z) == 0 nothing is written
// and r itself is returned.
//
// Registers:
//	R10 = z base, R8 = x base, R9 = y
//	R11 = n = len(z), BX = i
//	CX  = c, the running carry word
//	DX:AX = product scratch (MULQ writes both)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ	z_len+8(FP), R11
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), R9
	MOVQ	r+56(FP), CX		// c = r
	MOVQ	$0, BX			// i = 0

	CMPQ	R11, $4
	JL	muladd_test		// fewer than 4 words: singles only

muladd_by4:	// i+4 <= n
	// word i
	MOVQ	0(R8)(BX*8), AX
	MULQ	R9			// DX:AX = x[i] * y
	ADDQ	CX, AX			// + c
	ADCQ	$0, DX
	MOVQ	AX, 0(R10)(BX*8)	// z[i] = low word
	MOVQ	DX, CX			// c = high word
	// word i+1
	MOVQ	8(R8)(BX*8), AX
	MULQ	R9
	ADDQ	CX, AX
	ADCQ	$0, DX
	MOVQ	AX, 8(R10)(BX*8)
	MOVQ	DX, CX
	// word i+2
	MOVQ	16(R8)(BX*8), AX
	MULQ	R9
	ADDQ	CX, AX
	ADCQ	$0, DX
	MOVQ	AX, 16(R10)(BX*8)
	MOVQ	DX, CX
	// word i+3
	MOVQ	24(R8)(BX*8), AX
	MULQ	R9
	ADDQ	CX, AX
	ADCQ	$0, DX
	MOVQ	AX, 24(R10)(BX*8)
	MOVQ	DX, CX
	ADDQ	$4, BX			// i += 4

	LEAQ	4(BX), DX		// DX is free again: DX = i+4
	CMPQ	DX, R11
	JLE	muladd_by4		// another full group of 4 fits
	JMP	muladd_test

muladd_by1:
	MOVQ	(R8)(BX*8), AX
	MULQ	R9			// DX:AX = x[i] * y
	ADDQ	CX, AX			// + c
	ADCQ	$0, DX
	MOVQ	AX, (R10)(BX*8)		// z[i] = low word
	MOVQ	DX, CX			// c = high word
	ADDQ	$1, BX			// i++

muladd_test:
	CMPQ	BX, R11
	JL	muladd_by1		// continue while i < n

	MOVQ	CX, c+64(FP)		// return c
	RET
385
386
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Multiply-accumulate of a vector by a single word:
//	c = 0
//	for i = 0 .. len(z)-1:  c:z[i] = z[i] + x[i]*y + c
// The final carry word c is returned.
//
// Two implementations are selected at run time by the byte ·support_adx
// (defined outside this file section):
//   - value != 1: MULQ based code, two words per iteration plus a
//     single-word loop for the remainder;
//   - value == 1: MULXQ/ADCXQ/ADOXQ based code, eight words per iteration
//     using two independent carry chains (CF and OF), plus a single-word
//     loop for the remainder.
TEXT ·addMulVVW(SB),NOSPLIT,$0
	CMPB	·support_adx(SB), $1
	JEQ	amv_adx

	// ---- MULQ path ----
	// R10 = z base, R8 = x base, R9 = y, R11 = n, BX = i, CX = c,
	// R12 = n rounded down to an even count, DX:AX = product scratch.
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), R9
	MOVQ	z_len+8(FP), R11
	MOVQ	$0, BX			// i = 0
	MOVQ	$0, CX			// c = 0
	MOVQ	R11, R12
	ANDQ	$-2, R12		// R12 = n &^ 1
	CMPQ	R11, $2
	JAE	amv_by2			// at least one pair
	JMP	amv_test

amv_by2:
	// word i
	MOVQ	(R8)(BX*8), AX
	MULQ	R9			// DX:AX = x[i] * y
	ADDQ	(R10)(BX*8), AX		// + z[i]
	ADCQ	$0, DX
	ADDQ	CX, AX			// + c
	ADCQ	$0, DX
	MOVQ	DX, CX			// c = high word
	MOVQ	AX, (R10)(BX*8)		// z[i] = low word

	// word i+1
	MOVQ	8(R8)(BX*8), AX
	MULQ	R9
	ADDQ	8(R10)(BX*8), AX
	ADCQ	$0, DX
	ADDQ	CX, AX
	ADCQ	$0, DX
	MOVQ	DX, CX
	MOVQ	AX, 8(R10)(BX*8)

	ADDQ	$2, BX			// i += 2
	CMPQ	BX, R12
	JL	amv_by2			// continue while i < n &^ 1
	JMP	amv_test

amv_by1:
	MOVQ	(R8)(BX*8), AX
	MULQ	R9			// DX:AX = x[i] * y
	ADDQ	CX, AX			// + c
	ADCQ	$0, DX
	ADDQ	AX, (R10)(BX*8)		// z[i] += low word, directly in memory
	ADCQ	$0, DX
	MOVQ	DX, CX			// c = high word
	ADDQ	$1, BX			// i++

amv_test:
	CMPQ	BX, R11
	JL	amv_by1			// continue while i < n

	MOVQ	CX, c+56(FP)		// return c
	RET

	// ---- ADX path ----
	// R10 = z cursor, R8 = x cursor, DX = y (implicit MULXQ factor),
	// R11 = n, BX = i, CX = c, R13 = n rounded down to a multiple of 8,
	// R9 = zero, SI/DI/AX = product scratch.
amv_adx:
	MOVQ	z_len+8(FP), R11
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	MOVQ	y+48(FP), DX
	MOVQ	$0, BX			// i = 0
	MOVQ	$0, CX			// c = 0
	CMPQ	R11, $8
	JAE	amv_adx_by8_init	// at least one group of 8
	CMPQ	BX, R11
	JL	amv_adx_by1		// 1..7 words: singles only
	MOVQ	CX, c+56(FP)		// n == 0: return 0
	RET

amv_adx_by8_init:
	MOVQ	R11, R13
	ANDQ	$-8, R13		// R13 = n &^ 7

amv_adx_by8:
	XORQ	R9, R9			// R9 = 0; also clears CF and OF for both chains

	// word 0: low product in SI, high in DI
	MULXQ	(R8), SI, DI
	ADCXQ	CX, SI			// + incoming carry word, carries via CF
	ADOXQ	(R10), SI		// + z word, carries via OF
	MOVQ	SI, (R10)

	// word 1: low product in AX, high in CX
	MULXQ	8(R8), AX, CX
	ADCXQ	DI, AX			// + high word of the previous product
	ADOXQ	8(R10), AX
	MOVQ	AX, 8(R10)

	// word 2
	MULXQ	16(R8), SI, DI
	ADCXQ	CX, SI
	ADOXQ	16(R10), SI
	MOVQ	SI, 16(R10)

	// word 3
	MULXQ	24(R8), AX, CX
	ADCXQ	DI, AX
	ADOXQ	24(R10), AX
	MOVQ	AX, 24(R10)

	// word 4
	MULXQ	32(R8), SI, DI
	ADCXQ	CX, SI
	ADOXQ	32(R10), SI
	MOVQ	SI, 32(R10)

	// word 5
	MULXQ	40(R8), AX, CX
	ADCXQ	DI, AX
	ADOXQ	40(R10), AX
	MOVQ	AX, 40(R10)

	// word 6
	MULXQ	48(R8), SI, DI
	ADCXQ	CX, SI
	ADOXQ	48(R10), SI
	MOVQ	SI, 48(R10)

	// word 7: its high word lands in CX, the carry word
	MULXQ	56(R8), AX, CX
	ADCXQ	DI, AX
	ADOXQ	56(R10), AX
	MOVQ	AX, 56(R10)

	ADCXQ	R9, CX			// fold the pending CF into c
	ADOXQ	R9, CX			// fold the pending OF into c

	ADDQ	$64, R8			// advance both cursors by 8 words
	ADDQ	$64, R10
	ADDQ	$8, BX			// i += 8

	CMPQ	BX, R13
	JL	amv_adx_by8		// continue while i < n &^ 7

	// The single-word loop indexes with BX, so go back to the base pointers.
	MOVQ	z+0(FP), R10
	MOVQ	x+24(FP), R8
	CMPQ	BX, R11
	JL	amv_adx_by1		// leftover words remain
	MOVQ	CX, c+56(FP)		// return c
	RET

amv_adx_by1:
	MULXQ	(R8)(BX*8), SI, DI	// low product in SI, high in DI
	ADDQ	CX, SI			// + c
	ADCQ	$0, DI
	ADDQ	SI, (R10)(BX*8)		// z[i] += low word, directly in memory
	ADCQ	$0, DI
	MOVQ	DI, CX			// c = high word
	ADDQ	$1, BX			// i++

	CMPQ	BX, R11
	JL	amv_adx_by1		// continue while i < n

	MOVQ	CX, c+56(FP)		// return c
	RET
531
532
533
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
//
// Divides a multi-word number by the single word y, from the most
// significant word downward:
//	r = xn
//	for i = len(z)-1 .. 0:  z[i], r = (r:x[i]) / y, (r:x[i]) % y
// The final remainder r is returned. With len(z) == 0 nothing is written
// and xn itself is returned.
//
// No operand checking is done here. DIVQ raises a divide error when y is
// zero or when a quotient does not fit in one word.
// NOTE(review): callers presumably guarantee xn < y — confirm.
//
// Registers:
//	R10 = z base, R8 = x base, R9 = y
//	BX  = i, DX = r (DIVQ leaves each remainder there), AX = x[i] / quotient
TEXT ·divWVW(SB),NOSPLIT,$0
	MOVQ	z_len+8(FP), BX		// i = len(z)
	MOVQ	z+0(FP), R10
	MOVQ	x+32(FP), R8
	MOVQ	y+56(FP), R9
	MOVQ	xn+24(FP), DX		// r = xn
	JMP	divwvw_next

divwvw_step:
	MOVQ	(R8)(BX*8), AX		// low dividend word = x[i]
	DIVQ	R9			// AX = DX:AX / y, DX = DX:AX % y
	MOVQ	AX, (R10)(BX*8)		// z[i] = quotient word

divwvw_next:
	SUBQ	$1, BX			// i--
	JGE	divwvw_step		// continue while i >= 0

	MOVQ	DX, r+64(FP)		// return r
	RET
View as plain text