Text file src/pkg/math/big/arith_s390x.s
1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // +build !math_big_pure_go,s390x
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
// hasVectorFacility stores 1 in ret when (a) the facility list written by
// STFLE has the tested bit set and (b) a byte inserted into V16 can be read
// back unchanged; otherwise it stores 0.
// The Go-side declaration is not visible here (presumably
// func hasVectorFacility() bool — confirm in the package's Go sources).
// Frame: 24 bytes of scratch space for the facility list, 1 byte of result.
12 TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
13 MOVD $x-24(SP), R1 // R1 = address of the 24-byte scratch area
14 XC $24, 0(R1), 0(R1) // clear the storage
15 MOVD $2, R0 // R0 is the number of double words stored -1
16 WORD $0xB2B01000 // STFLE 0(R1)
17 XOR R0, R0 // reset the value of R0
18 MOVBZ z-8(SP), R1 // byte at offset 16 of the scratch area (start of the third doubleword)
19 AND $0x40, R1 // keep mask 0x40 of that byte (facility bit 129 — presumably the vector facility; confirm)
20 BEQ novector // bit clear: report no vector support
21 vectorinstalled:
22 // check if the vector instruction has been enabled
23 VLEIB $0, $0xF, V16 // write 0xF into byte element 0 of V16
24 VLGVB $0, V16, R1 // read byte element 0 of V16 back into R1
25 CMPBNE R1, $0xF, novector // value did not survive the round trip
26 MOVB $1, ret+0(FP) // have vx
27 RET
28 novector:
29 MOVB $0, ret+0(FP) // no vx
30 RET
31
// mulWW multiplies x by y and stores R10 as z1 and R11 as z0
// (presumably func mulWW(x, y Word) (z1, z0 Word), z1 being the high word —
// confirm against the Go declaration).
// NOTE(review): R10 and R11 are never named as operands below. The code relies
// on the assembler's expansion of MULHDU leaving the high half of the product
// in R10 and the low half in R11 — confirm for the toolchain version in use.
32 TEXT ·mulWW(SB),NOSPLIT,$0
33 MOVD x+0(FP), R3 // R3 = x
34 MOVD y+8(FP), R4 // R4 = y
35 MULHDU R3, R4 // unsigned 64x64 multiply; see the note above about R10/R11
36 MOVD R10, z1+16(FP) // z1 = R10
37 MOVD R11, z0+24(FP) // z0 = R11
38 RET
39
40 // func divWW(x1, x0, y Word) (q, r Word)
// Divides the two-word value held in the R10/R11 register pair (x1 in R10,
// x0 in R11) by y using a hand-encoded DLGR. After the instruction R11 is
// stored as the quotient and R10 as the remainder.
// No check for y == 0 or quotient overflow is made in this routine.
41 TEXT ·divWW(SB),NOSPLIT,$0
42 MOVD x1+0(FP), R10 // even register of the pair = x1
43 MOVD x0+8(FP), R11 // odd register of the pair = x0
44 MOVD y+16(FP), R5 // divisor
45 WORD $0xb98700a5 // dlgr r10,r5
46 MOVD R11, q+24(FP) // q = R11
47 MOVD R10, r+32(FP) // r = R10
48 RET
49
50 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
51 // func addVV(z, x, y []Word) (c Word)
52
53
// addVV is a trampoline: it loads the code address currently stored in
// addvectorfacility and branches to it. The arguments are left untouched for
// the target, which reads them at the same FP offsets.
54 TEXT ·addVV(SB),NOSPLIT,$0
55 MOVD addvectorfacility+0x00(SB),R1 // R1 = address of the selected implementation
56 BR (R1) // tail-branch to it
57
// addVV_check is the initial target stored in addvectorfacility. It reads
// ·hasVX, overwrites addvectorfacility with the address of ·addVV_vec (when
// ·hasVX == 1) or ·addVV_novec (otherwise), and then branches to that routine
// so the current call is also served.
58 TEXT ·addVV_check(SB),NOSPLIT, $0
59 MOVB ·hasVX(SB), R1 // R1 = vector-support flag
60 CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
61 MOVD $addvectorfacility+0x00(SB), R1 // R1 = &addvectorfacility
62 MOVD $·addVV_novec(SB), R2 // R2 = address of the scalar routine
63 MOVD R2, 0(R1) // addvectorfacility = ·addVV_novec
64 //MOVD $·addVV_novec(SB), 0(R1)
65 BR ·addVV_novec(SB)
66 vectorimpl:
67 MOVD $addvectorfacility+0x00(SB), R1 // R1 = &addvectorfacility
68 MOVD $·addVV_vec(SB), R2 // R2 = address of the vector routine
69 MOVD R2, 0(R1) // addvectorfacility = ·addVV_vec
70 //MOVD $·addVV_vec(SB), 0(R1)
71 BR ·addVV_vec(SB)
72
// addvectorfacility is an 8-byte code pointer, initially ·addVV_check.
73 GLOBL addvectorfacility+0x00(SB), NOPTR, $8
74 DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
75
// addVV_vec is the addVV implementation that ·addVV_check installs when
// ·hasVX is 1. It computes z = x + y over z_len words and stores the final
// carry in c.
// Three stages: 16 words per iteration with vector instructions (UU1),
// 4 words per iteration with ADDC/ADDE (U1), then one word at a time (L1).
// Between stages the carry is kept in R4 as 0 or -1, so that "ADDC R4, R4"
// recreates the carry flag; E1 negates it back to 0 or 1.
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining word count (biased),
// R10 = byte offset, R5/R6/R7 = running x/y/z pointers in the vector loop
// (reused as scratch afterwards), R0 = 0.
76 TEXT ·addVV_vec(SB),NOSPLIT,$0
77 MOVD z_len+8(FP), R3
78 MOVD x+24(FP), R8
79 MOVD y+48(FP), R9
80 MOVD z+0(FP), R2
81
82 MOVD $0, R4 // c = 0
83 MOVD $0, R0 // make sure it's zero
84 MOVD $0, R10 // i = 0
85
86
87 // replace the BLT branches below with BR to disable the unrolled loops
88 SUB $4, R3 // n -= 4
89 BLT v1 // fewer than 4 words: go to the single-word loop
90 SUB $12, R3 // n -= 12 (16 subtracted in total)
91 BLT A1 // fewer than 16 words: skip the vector loop
92
93 MOVD R8, R5 // R5 = running x pointer
94 MOVD R9, R6 // R6 = running y pointer
95 MOVD R2, R7 // R7 = running z pointer
96 // n >= 0
97 // regular loop body unrolled 16x
98 VZERO V0 // c = 0
99 UU1: VLM 0(R5), V1, V4 // 64 bytes of x into V1..V4
100 ADD $64, R5
101 VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
102 VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
103
104
105 VLM 0(R6), V9, V12 // 64 bytes of y into V9..V12
106 ADD $64, R6
107 VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order
108 VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order
109
// Each VACCCQ/VACQ pair adds one 128-bit chunk: VACCCQ produces the
// carry-out (fed to the next pair), VACQ produces the sum.
110 VACCCQ V1, V9, V0, V25
111 VACQ V1, V9, V0, V17
112 VACCCQ V2, V10, V25, V26
113 VACQ V2, V10, V25, V18
114
115
116 VLM 0(R5), V5, V6 // next 32 bytes of x into V5..V6
117 VLM 0(R6), V13, V14 // next 32 bytes of y into V13..V14
118 ADD $32, R5
119 ADD $32, R6
120
121 VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
122 VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
123 VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order
124 VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order
125
126 VACCCQ V3, V11, V26, V27
127 VACQ V3, V11, V26, V19
128 VACCCQ V4, V12, V27, V28
129 VACQ V4, V12, V27, V20
130
131 VLM 0(R5), V7, V8 // last 32 bytes of x into V7..V8
132 VLM 0(R6), V15, V16 // last 32 bytes of y into V15..V16
133 ADD $32, R5
134 ADD $32, R6
135
136 VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
137 VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
138 VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order
139 VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order
140
141 VACCCQ V5, V13, V28, V29
142 VACQ V5, V13, V28, V21
143 VACCCQ V6, V14, V29, V30
144 VACQ V6, V14, V29, V22
145
146 VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
147 VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
148 VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order
149 VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order
150
151 VACCCQ V7, V15, V30, V31
152 VACQ V7, V15, V30, V23
153 VACCCQ V8, V16, V31, V0 //V0 has carry-over
154 VACQ V8, V16, V31, V24
155
// Swap the doublewords of each sum back to memory order before storing.
156 VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
157 VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
158 VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
159 VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
160 VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
161 VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
162 VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
163 VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
164 VSTM V17, V24, 0(R7) // 128-bytes into z
165 ADD $128, R7
166 ADD $128, R10 // i += 16 words (128 bytes)
167 SUB $16, R3 // n -= 16
168 BGE UU1 // if n >= 0 goto UU1
169 VLGVG $1, V0, R4 // put cf into R4
170 NEG R4, R4 // save cf as 0 or -1
171
172 A1: ADD $12, R3 // n += 12 (undo the extra bias; n is now remaining-4)
173
174
175 // replace BLT with BR below to disable the 4x unrolled loop
176 BLT v1 // if n < 0 goto v1
177
178 U1: // n >= 0
179 // regular loop body unrolled 4x
180 MOVD 0(R8)(R10*1), R5
181 MOVD 8(R8)(R10*1), R6
182 MOVD 16(R8)(R10*1), R7
183 MOVD 24(R8)(R10*1), R1
184 ADDC R4, R4 // restore CF
185 MOVD 0(R9)(R10*1), R11
186 ADDE R11, R5
187 MOVD 8(R9)(R10*1), R11
188 ADDE R11, R6
189 MOVD 16(R9)(R10*1), R11
190 ADDE R11, R7
191 MOVD 24(R9)(R10*1), R11
192 ADDE R11, R1
193 MOVD R0, R4
194 ADDE R4, R4 // save CF
195 NEG R4, R4 // keep it as 0 or -1
196 MOVD R5, 0(R2)(R10*1)
197 MOVD R6, 8(R2)(R10*1)
198 MOVD R7, 16(R2)(R10*1)
199 MOVD R1, 24(R2)(R10*1)
200
201
202 ADD $32, R10 // i += 4 words (32 bytes)
203 SUB $4, R3 // n -= 4
204 BGE U1 // if n >= 0 goto U1
205
206 v1: ADD $4, R3 // n += 4
207 BLE E1 // if n <= 0 goto E1
208
209 L1: // n > 0
210 ADDC R4, R4 // restore CF
211 MOVD 0(R8)(R10*1), R5
212 MOVD 0(R9)(R10*1), R11
213 ADDE R11, R5
214 MOVD R5, 0(R2)(R10*1)
215 MOVD R0, R4
216 ADDE R4, R4 // save CF
217 NEG R4, R4 // keep it as 0 or -1
218
219 ADD $8, R10 // i++
220 SUB $1, R3 // n--
221 BGT L1 // if n > 0 goto L1
222
223 E1: NEG R4, R4 // 0/-1 back to 0/1
224 MOVD R4, c+72(FP) // return c
225 RET
226
// addVV_novec is the addVV implementation that ·addVV_check installs when
// ·hasVX is not 1. It computes z = x + y over z_len words with ADDC/ADDE,
// four words per iteration (U1n) and then one word at a time (L1n), and
// stores the final carry in c.
// The carry is kept in R4 as 0 or -1 between iterations so that
// "ADDC R4, R4" recreates the carry flag; E1n negates it back to 0 or 1.
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining word count (biased),
// R10 = byte offset, R0 = 0.
227 TEXT ·addVV_novec(SB),NOSPLIT,$0
228 novec:
229 MOVD z_len+8(FP), R3
230 MOVD x+24(FP), R8
231 MOVD y+48(FP), R9
232 MOVD z+0(FP), R2
233
234 MOVD $0, R4 // c = 0
235 MOVD $0, R0 // make sure it's zero
236 MOVD $0, R10 // i = 0
237
238 // replace BLT with BR below to disable the unrolled loop
239 SUB $4, R3 // n -= 4
240 BLT v1n // if n < 0 goto v1n
241 U1n: // n >= 0
242 // regular loop body unrolled 4x
243 MOVD 0(R8)(R10*1), R5
244 MOVD 8(R8)(R10*1), R6
245 MOVD 16(R8)(R10*1), R7
246 MOVD 24(R8)(R10*1), R1
247 ADDC R4, R4 // restore CF
248 MOVD 0(R9)(R10*1), R11
249 ADDE R11, R5
250 MOVD 8(R9)(R10*1), R11
251 ADDE R11, R6
252 MOVD 16(R9)(R10*1), R11
253 ADDE R11, R7
254 MOVD 24(R9)(R10*1), R11
255 ADDE R11, R1
256 MOVD R0, R4
257 ADDE R4, R4 // save CF
258 NEG R4, R4 // keep it as 0 or -1
259 MOVD R5, 0(R2)(R10*1)
260 MOVD R6, 8(R2)(R10*1)
261 MOVD R7, 16(R2)(R10*1)
262 MOVD R1, 24(R2)(R10*1)
263
264
265 ADD $32, R10 // i += 4 words (32 bytes)
266 SUB $4, R3 // n -= 4
267 BGE U1n // if n >= 0 goto U1n
268
269 v1n: ADD $4, R3 // n += 4
270 BLE E1n // if n <= 0 goto E1n
271
272 L1n: // n > 0
273 ADDC R4, R4 // restore CF
274 MOVD 0(R8)(R10*1), R5
275 MOVD 0(R9)(R10*1), R11
276 ADDE R11, R5
277 MOVD R5, 0(R2)(R10*1)
278 MOVD R0, R4
279 ADDE R4, R4 // save CF
280 NEG R4, R4 // keep it as 0 or -1
281
282 ADD $8, R10 // i++
283 SUB $1, R3 // n--
284 BGT L1n // if n > 0 goto L1n
285
286 E1n: NEG R4, R4 // 0/-1 back to 0/1
287 MOVD R4, c+72(FP) // return c
288 RET
289
290
// subVV is a trampoline: it loads the code address currently stored in
// subvectorfacility and branches to it. The arguments are left untouched for
// the target, which reads them at the same FP offsets.
291 TEXT ·subVV(SB),NOSPLIT,$0
292 MOVD subvectorfacility+0x00(SB),R1 // R1 = address of the selected implementation
293 BR (R1) // tail-branch to it
294
// subVV_check is the initial target stored in subvectorfacility. It reads
// ·hasVX, overwrites subvectorfacility with the address of ·subVV_vec (when
// ·hasVX == 1) or ·subVV_novec (otherwise), and then branches to that routine
// so the current call is also served.
295 TEXT ·subVV_check(SB),NOSPLIT,$0
296 MOVB ·hasVX(SB), R1 // R1 = vector-support flag
297 CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
298 MOVD $subvectorfacility+0x00(SB), R1 // R1 = &subvectorfacility
299 MOVD $·subVV_novec(SB), R2 // R2 = address of the scalar routine
300 MOVD R2, 0(R1) // subvectorfacility = ·subVV_novec
301 //MOVD $·subVV_novec(SB), 0(R1)
302 BR ·subVV_novec(SB)
303 vectorimpl:
304 MOVD $subvectorfacility+0x00(SB), R1 // R1 = &subvectorfacility
305 MOVD $·subVV_vec(SB), R2 // R2 = address of the vector routine
306 MOVD R2, 0(R1) // subvectorfacility = ·subVV_vec
307 //MOVD $·subVV_vec(SB), 0(R1)
308 BR ·subVV_vec(SB)
309
// subvectorfacility is an 8-byte code pointer, initially ·subVV_check.
310 GLOBL subvectorfacility+0x00(SB), NOPTR, $8
311 DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
312
313 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
314 // func subVV(z, x, y []Word) (c Word)
315 // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVV_vec is the subVV implementation that ·subVV_check installs when
// ·hasVX is 1. It computes z = x - y over z_len words and stores the final
// borrow in c.
// Three stages: 16 words per iteration with vector instructions (UU1),
// 4 words per iteration with SUBC/SUBE (U1), then one word at a time (L1).
// Between stages the borrow is kept in R4 as 0 or -1; "SUBC R4, R11" with
// R11 = 0 recreates the flag and E1 negates R4 back to 0 or 1.
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining word count (biased),
// R10 = byte offset, R5/R6/R7 = running x/y/z pointers in the vector loop
// (reused as scratch afterwards), R0 = 0.
316 TEXT ·subVV_vec(SB),NOSPLIT,$0
317 MOVD z_len+8(FP), R3
318 MOVD x+24(FP), R8
319 MOVD y+48(FP), R9
320 MOVD z+0(FP), R2
321 MOVD $0, R4 // c = 0
322 MOVD $0, R0 // make sure it's zero
323 MOVD $0, R10 // i = 0
324
325 // replace the BLT branches below with BR to disable the unrolled loops
326 SUB $4, R3 // n -= 4
327 BLT v1 // if n < 0 goto v1
328 SUB $12, R3 // n -= 12 (16 subtracted in total)
329 BLT A1 // if n < 0 goto A1
330
331 MOVD R8, R5 // R5 = running x pointer
332 MOVD R9, R6 // R6 = running y pointer
333 MOVD R2, R7 // R7 = running z pointer
334
335 // n >= 0
336 // regular loop body unrolled 16x
337 VZERO V0 // cf = 0
338 MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow)
339 VLVGG $1, R4, V0 //put carry into V0
340
341 UU1: VLM 0(R5), V1, V4 // 64 bytes of x into V1..V4
342 ADD $64, R5
343 VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
344 VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
345
346
347 VLM 0(R6), V9, V12 // 64 bytes of y into V9..V12
348 ADD $64, R6
349 VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order
350 VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order
351
// Each VSBCBIQ/VSBIQ pair subtracts one 128-bit chunk: VSBCBIQ produces
// the borrow indication (fed to the next pair), VSBIQ the difference.
352 VSBCBIQ V1, V9, V0, V25
353 VSBIQ V1, V9, V0, V17
354 VSBCBIQ V2, V10, V25, V26
355 VSBIQ V2, V10, V25, V18
356
357
358 VLM 0(R5), V5, V6 // next 32 bytes of x into V5..V6
359 VLM 0(R6), V13, V14 // next 32 bytes of y into V13..V14
360 ADD $32, R5
361 ADD $32, R6
362
363 VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
364 VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
365 VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order
366 VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order
367
368 VSBCBIQ V3, V11, V26, V27
369 VSBIQ V3, V11, V26, V19
370 VSBCBIQ V4, V12, V27, V28
371 VSBIQ V4, V12, V27, V20
372
373 VLM 0(R5), V7, V8 // last 32 bytes of x into V7..V8
374 VLM 0(R6), V15, V16 // last 32 bytes of y into V15..V16
375 ADD $32, R5
376 ADD $32, R6
377
378 VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
379 VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
380 VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order
381 VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order
382
383 VSBCBIQ V5, V13, V28, V29
384 VSBIQ V5, V13, V28, V21
385 VSBCBIQ V6, V14, V29, V30
386 VSBIQ V6, V14, V29, V22
387
388 VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
389 VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
390 VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order
391 VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order
392
393 VSBCBIQ V7, V15, V30, V31
394 VSBIQ V7, V15, V30, V23
395 VSBCBIQ V8, V16, V31, V0 //V0 has carry-over
396 VSBIQ V8, V16, V31, V24
397
// Swap the doublewords of each difference back to memory order before storing.
398 VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
399 VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
400 VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
401 VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
402 VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
403 VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
404 VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
405 VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
406 VSTM V17, V24, 0(R7) // 128-bytes into z
407 ADD $128, R7
408 ADD $128, R10 // i += 16 words (128 bytes)
409 SUB $16, R3 // n -= 16
410 BGE UU1 // if n >= 0 goto UU1
411 VLGVG $1, V0, R4 // put cf into R4
412 SUB $1, R4 // save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1
413
414 A1: ADD $12, R3 // n += 12 (undo the extra bias; n is now remaining-4)
415 BLT v1 // if n < 0 goto v1
416
417 U1: // n >= 0
418 // regular loop body unrolled 4x
419 MOVD 0(R8)(R10*1), R5
420 MOVD 8(R8)(R10*1), R6
421 MOVD 16(R8)(R10*1), R7
422 MOVD 24(R8)(R10*1), R1
423 MOVD R0, R11
424 SUBC R4, R11 // restore CF
425 MOVD 0(R9)(R10*1), R11
426 SUBE R11, R5
427 MOVD 8(R9)(R10*1), R11
428 SUBE R11, R6
429 MOVD 16(R9)(R10*1), R11
430 SUBE R11, R7
431 MOVD 24(R9)(R10*1), R11
432 SUBE R11, R1
433 MOVD R0, R4
434 SUBE R4, R4 // save CF
435 MOVD R5, 0(R2)(R10*1)
436 MOVD R6, 8(R2)(R10*1)
437 MOVD R7, 16(R2)(R10*1)
438 MOVD R1, 24(R2)(R10*1)
439
440 ADD $32, R10 // i += 4 words (32 bytes)
441 SUB $4, R3 // n -= 4
442 BGE U1 // if n >= 0 goto U1
443
444 v1: ADD $4, R3 // n += 4
445 BLE E1 // if n <= 0 goto E1
446
447 L1: // n > 0
448 MOVD R0, R11
449 SUBC R4, R11 // restore CF
450 MOVD 0(R8)(R10*1), R5
451 MOVD 0(R9)(R10*1), R11
452 SUBE R11, R5
453 MOVD R5, 0(R2)(R10*1)
454 MOVD R0, R4
455 SUBE R4, R4 // save CF
456
457 ADD $8, R10 // i++
458 SUB $1, R3 // n--
459 BGT L1 // if n > 0 goto L1
460
461 E1: NEG R4, R4 // 0/-1 back to 0/1
462 MOVD R4, c+72(FP) // return c
463 RET
464
465
466 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
467 // func subVV(z, x, y []Word) (c Word)
468 // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVV_novec is the subVV implementation that ·subVV_check installs when
// ·hasVX is not 1. It computes z = x - y over z_len words with SUBC/SUBE,
// four words per iteration (U1) and then one word at a time (L1), and stores
// the final borrow in c.
// The borrow is kept in R4 as 0 or -1 between iterations; "SUBC R4, R11"
// with R11 = 0 recreates the flag and E1 negates R4 back to 0 or 1.
// Registers: R2 = z, R8 = x, R9 = y, R3 = remaining word count (biased),
// R10 = byte offset, R0 = 0.
469 TEXT ·subVV_novec(SB),NOSPLIT,$0
470 MOVD z_len+8(FP), R3
471 MOVD x+24(FP), R8
472 MOVD y+48(FP), R9
473 MOVD z+0(FP), R2
474
475 MOVD $0, R4 // c = 0
476 MOVD $0, R0 // make sure it's zero
477 MOVD $0, R10 // i = 0
478
479 // replace BLT with BR below to disable the unrolled loop
480 SUB $4, R3 // n -= 4
481 BLT v1 // if n < 0 goto v1
482
483 U1: // n >= 0
484 // regular loop body unrolled 4x
485 MOVD 0(R8)(R10*1), R5
486 MOVD 8(R8)(R10*1), R6
487 MOVD 16(R8)(R10*1), R7
488 MOVD 24(R8)(R10*1), R1
489 MOVD R0, R11
490 SUBC R4, R11 // restore CF
491 MOVD 0(R9)(R10*1), R11
492 SUBE R11, R5
493 MOVD 8(R9)(R10*1), R11
494 SUBE R11, R6
495 MOVD 16(R9)(R10*1), R11
496 SUBE R11, R7
497 MOVD 24(R9)(R10*1), R11
498 SUBE R11, R1
499 MOVD R0, R4
500 SUBE R4, R4 // save CF
501 MOVD R5, 0(R2)(R10*1)
502 MOVD R6, 8(R2)(R10*1)
503 MOVD R7, 16(R2)(R10*1)
504 MOVD R1, 24(R2)(R10*1)
505
506
507 ADD $32, R10 // i += 4 words (32 bytes)
508 SUB $4, R3 // n -= 4
509 BGE U1 // if n >= 0 goto U1
510
511 v1: ADD $4, R3 // n += 4
512 BLE E1 // if n <= 0 goto E1
513
514 L1: // n > 0
515 MOVD R0, R11
516 SUBC R4, R11 // restore CF
517 MOVD 0(R8)(R10*1), R5
518 MOVD 0(R9)(R10*1), R11
519 SUBE R11, R5
520 MOVD R5, 0(R2)(R10*1)
521 MOVD R0, R4
522 SUBE R4, R4 // save CF
523
524 ADD $8, R10 // i++
525 SUB $1, R3 // n--
526 BGT L1 // if n > 0 goto L1
527
528 E1: NEG R4, R4 // 0/-1 back to 0/1
529 MOVD R4, c+72(FP) // return c
530 RET
531
// addVW is a trampoline: it loads the code address currently stored in
// addwvectorfacility and branches to it. The arguments are left untouched for
// the target, which reads them at the same FP offsets.
532 TEXT ·addVW(SB),NOSPLIT,$0
533 MOVD addwvectorfacility+0x00(SB),R1 // R1 = address of the selected implementation
534 BR (R1) // tail-branch to it
535
// addVW_check is the initial target stored in addwvectorfacility. It reads
// ·hasVX, overwrites addwvectorfacility with the address of ·addVW_vec (when
// ·hasVX == 1) or ·addVW_novec (otherwise), and then branches to that routine
// so the current call is also served.
536 TEXT ·addVW_check(SB),NOSPLIT,$0
537 MOVB ·hasVX(SB), R1 // R1 = vector-support flag
538 CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
539 MOVD $addwvectorfacility+0x00(SB), R1 // R1 = &addwvectorfacility
540 MOVD $·addVW_novec(SB), R2 // R2 = address of the scalar routine
541 MOVD R2, 0(R1) // addwvectorfacility = ·addVW_novec
542 //MOVD $·addVW_novec(SB), 0(R1)
543 BR ·addVW_novec(SB)
544 vectorimpl:
545 MOVD $addwvectorfacility+0x00(SB), R1 // R1 = &addwvectorfacility
546 MOVD $·addVW_vec(SB), R2 // R2 = address of the vector routine
547 MOVD R2, 0(R1) // addwvectorfacility = ·addVW_vec
548 //MOVD $·addVW_vec(SB), 0(R1)
549 BR ·addVW_vec(SB)
550
// addwvectorfacility is an 8-byte code pointer, initially ·addVW_check.
551 GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
552 DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)
553
554
555 // func addVW_vec(z, x []Word, y Word) (c Word)
//
// addVW_vec is the addVW implementation that ·addVW_check installs when
// ·hasVX is 1. It computes z = x + y, where y is a single word added at the
// lowest index and the carry is propagated upward, and stores the final
// carry in c.
// Three stages: 16 words per iteration with vector instructions (UU1),
// 4 words per iteration with ADDC/ADDE (U4), then one word at a time (L4).
// R4 holds the word still to be added: y at first, afterwards the carry (0/1).
// Registers: R2 = z, R8 = x, R3 = remaining word count (biased),
// R10 = byte offset, R5/R7 = running x/z pointers in the vector loop
// (reused as scratch afterwards), R0 = 0 outside the carry-capture sequence.
556 TEXT ·addVW_vec(SB),NOSPLIT,$0
557 MOVD z_len+8(FP), R3
558 MOVD x+24(FP), R8
559 MOVD y+48(FP), R4 // c = y
560 MOVD z+0(FP), R2
561
562 MOVD $0, R0 // make sure it's zero
563 MOVD $0, R10 // i = 0
564 MOVD R8, R5 // R5 = running x pointer
565 MOVD R2, R7 // R7 = running z pointer
566
567 // replace the BLT branches below with BR to disable the unrolled loops
568 SUB $4, R3 // n -= 4
569 BLT v10 // if n < 0 goto v10
570 SUB $12, R3 // n -= 12 (16 subtracted in total)
571 BLT A10 // fewer than 16 words: skip the vector loop
572
573 // n >= 0
574 // regular loop body unrolled 16x
575
576 VZERO V0 // prepare V0 to be final carry register
577 VZERO V9 // to ensure upper half is zero
578 VLVGG $1, R4, V9 // V9 = y in the low doubleword
579 UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4
580 ADD $64, R5
581 VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
582 VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
583
584
585 VACCCQ V1, V9, V0, V25
586 VACQ V1, V9, V0, V17
587 VZERO V9 // y has been added once; every later addend is 0 plus carry
588 VACCCQ V2, V9, V25, V26
589 VACQ V2, V9, V25, V18
590
591
592 VLM 0(R5), V5, V6 // 32-bytes into V5..V6
593 ADD $32, R5
594
595 VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
596 VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
597
598 VACCCQ V3, V9, V26, V27
599 VACQ V3, V9, V26, V19
600 VACCCQ V4, V9, V27, V28
601 VACQ V4, V9, V27, V20
602
603 VLM 0(R5), V7, V8 // 32-bytes into V7..V8
604 ADD $32, R5
605
606 VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
607 VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
608
609 VACCCQ V5, V9, V28, V29
610 VACQ V5, V9, V28, V21
611 VACCCQ V6, V9, V29, V30
612 VACQ V6, V9, V29, V22
613
614 VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
615 VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
616
617 VACCCQ V7, V9, V30, V31
618 VACQ V7, V9, V30, V23
619 VACCCQ V8, V9, V31, V0 //V0 has carry-over
620 VACQ V8, V9, V31, V24
621
// Swap the doublewords of each sum back to memory order before storing.
622 VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
623 VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
624 VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
625 VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
626 VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
627 VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
628 VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
629 VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
630 VSTM V17, V24, 0(R7) // 128-bytes into z
631 ADD $128, R7
632 ADD $128, R10 // i += 16 words (128 bytes)
633 SUB $16, R3 // n -= 16
634 BGE UU1 // if n >= 0 goto UU1
635 VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10
636
637 A10: ADD $12, R3 // n += 12 (undo the extra bias; n is now remaining-4)
638
639
640 // replace BLT with BR below to disable the 4x unrolled loop
641
642 BLT v10 // if n < 0 goto v10
643
644
645 U4: // n >= 0
646 // regular loop body unrolled 4x
647 MOVD 0(R8)(R10*1), R5
648 MOVD 8(R8)(R10*1), R6
649 MOVD 16(R8)(R10*1), R7
650 MOVD 24(R8)(R10*1), R1
651 ADDC R4, R5 // x[i] += c, sets CF
652 ADDE R0, R6 // propagate CF through the next three words
653 ADDE R0, R7
654 ADDE R0, R1
655 ADDE R0, R0 // R0 = 0 + 0 + CF
656 MOVD R0, R4 // save CF
657 SUB R0, R0 // R0 back to zero
658 MOVD R5, 0(R2)(R10*1)
659 MOVD R6, 8(R2)(R10*1)
660 MOVD R7, 16(R2)(R10*1)
661 MOVD R1, 24(R2)(R10*1)
662
663 ADD $32, R10 // i += 4 -> i +=32
664 SUB $4, R3 // n -= 4
665 BGE U4 // if n >= 0 goto U4
666
667 v10: ADD $4, R3 // n += 4
668 BLE E10 // if n <= 0 goto E10
669
670
671 L4: // n > 0
672 MOVD 0(R8)(R10*1), R5
673 ADDC R4, R5 // x[i] += c, sets CF
674 ADDE R0, R0 // R0 = 0 + 0 + CF
675 MOVD R0, R4 // save CF
676 SUB R0, R0 // R0 back to zero
677 MOVD R5, 0(R2)(R10*1)
678
679 ADD $8, R10 // i++
680 SUB $1, R3 // n--
681 BGT L4 // if n > 0 goto L4
682
683 E10: MOVD R4, c+56(FP) // return c
684
685 RET
686
687
// addVW_novec is the addVW implementation that ·addVW_check installs when
// ·hasVX is not 1. It computes z = x + y, where y is a single word added at
// the lowest index and the carry is propagated upward, four words per
// iteration (U4) and then one word at a time (L4). The final carry is
// stored in c.
// R4 holds the word still to be added: y at first, afterwards the carry (0/1).
688 TEXT ·addVW_novec(SB),NOSPLIT,$0
689 //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
690 MOVD z_len+8(FP), R3
691 MOVD x+24(FP), R8
692 MOVD y+48(FP), R4 // c = y
693 MOVD z+0(FP), R2
694 MOVD $0, R0 // make sure it's 0
695 MOVD $0, R10 // i = 0
696
697 // replace BLT with BR below to disable the unrolled loop
698 SUB $4, R3 // n -= 4
699 BLT v4 // if n < 0 (fewer than 4 words) goto v4
700
701 U4: // n >= 0
702 // regular loop body unrolled 4x
703 MOVD 0(R8)(R10*1), R5
704 MOVD 8(R8)(R10*1), R6
705 MOVD 16(R8)(R10*1), R7
706 MOVD 24(R8)(R10*1), R1
707 ADDC R4, R5 // x[i] += c, sets CF
708 ADDE R0, R6 // propagate CF through the next three words
709 ADDE R0, R7
710 ADDE R0, R1
711 ADDE R0, R0 // R0 = 0 + 0 + CF
712 MOVD R0, R4 // save CF
713 SUB R0, R0 // R0 back to zero
714 MOVD R5, 0(R2)(R10*1)
715 MOVD R6, 8(R2)(R10*1)
716 MOVD R7, 16(R2)(R10*1)
717 MOVD R1, 24(R2)(R10*1)
718
719 ADD $32, R10 // i += 4 -> i +=32
720 SUB $4, R3 // n -= 4
721 BGE U4 // if n >= 0 goto U4
722
723 v4: ADD $4, R3 // n += 4
724 BLE E4 // if n <= 0 goto E4
725
726 L4: // n > 0
727 MOVD 0(R8)(R10*1), R5
728 ADDC R4, R5 // x[i] += c, sets CF
729 ADDE R0, R0 // R0 = 0 + 0 + CF
730 MOVD R0, R4 // save CF
731 SUB R0, R0 // R0 back to zero
732 MOVD R5, 0(R2)(R10*1)
733
734 ADD $8, R10 // i++
735 SUB $1, R3 // n--
736 BGT L4 // if n > 0 goto L4
737
738 E4: MOVD R4, c+56(FP) // return c
739
740 RET
741
// subVW is a trampoline: it loads the code address currently stored in
// subwvectorfacility and branches to it. The arguments are left untouched for
// the target, which reads them at the same FP offsets.
742 TEXT ·subVW(SB),NOSPLIT,$0
743 MOVD subwvectorfacility+0x00(SB),R1 // R1 = address of the selected implementation
744 BR (R1) // tail-branch to it
745
// subVW_check is the initial target stored in subwvectorfacility. It reads
// ·hasVX, overwrites subwvectorfacility with the address of ·subVW_vec (when
// ·hasVX == 1) or ·subVW_novec (otherwise), and then branches to that routine
// so the current call is also served.
746 TEXT ·subVW_check(SB),NOSPLIT,$0
747 MOVB ·hasVX(SB), R1 // R1 = vector-support flag
748 CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
749 MOVD $subwvectorfacility+0x00(SB), R1 // R1 = &subwvectorfacility
750 MOVD $·subVW_novec(SB), R2 // R2 = address of the scalar routine
751 MOVD R2, 0(R1) // subwvectorfacility = ·subVW_novec
752 //MOVD $·subVW_novec(SB), 0(R1)
753 BR ·subVW_novec(SB)
754 vectorimpl:
755 MOVD $subwvectorfacility+0x00(SB), R1 // R1 = &subwvectorfacility
756 MOVD $·subVW_vec(SB), R2 // R2 = address of the vector routine
757 MOVD R2, 0(R1) // subwvectorfacility = ·subVW_vec
758 //MOVD $·subVW_vec(SB), 0(R1)
759 BR ·subVW_vec(SB)
760
// subwvectorfacility is an 8-byte code pointer, initially ·subVW_check.
761 GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
762 DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
763
764 // func subVW(z, x []Word, y Word) (c Word)
//
// subVW_vec is the subVW implementation that ·subVW_check installs when
// ·hasVX is 1. It computes z = x - y, where y is a single word subtracted at
// the lowest index and the borrow is propagated upward, and stores the final
// borrow in c.
// Three stages: 16 words per iteration with vector instructions (UU1),
// 4 words per iteration with SUBC/SUBE (U4), then one word at a time (L4).
// R4 holds the word still to be subtracted: y at first, afterwards the
// borrow (0/1).
// Registers: R2 = z, R8 = x, R3 = remaining word count (biased),
// R10 = byte offset, R5/R7 = running x/z pointers in the vector loop
// (reused as scratch afterwards), R0 = 0.
765 TEXT ·subVW_vec(SB),NOSPLIT,$0
766 MOVD z_len+8(FP), R3
767 MOVD x+24(FP), R8
768 MOVD y+48(FP), R4 // c = y
769 MOVD z+0(FP), R2
770
771 MOVD $0, R0 // make sure it's zero
772 MOVD $0, R10 // i = 0
773 MOVD R8, R5 // R5 = running x pointer
774 MOVD R2, R7 // R7 = running z pointer
775
776 // replace the BLT branches below with BR to disable the unrolled loops
777 SUB $4, R3 // n -= 4
778 BLT v11 // if n < 0 goto v11
779 SUB $12, R3 // n -= 12 (16 subtracted in total)
780 BLT A11 // fewer than 16 words: skip the vector loop
781
782 VZERO V0
783 MOVD $1, R6 // prepare V0 to be final carry register
784 VLVGG $1, R6, V0 // borrow is initially "no borrow"
785 VZERO V9 // to ensure upper half is zero
786 VLVGG $1, R4, V9 // V9 = y in the low doubleword
787
788 // n >= 0
789 // regular loop body unrolled 16x
790
791
792 UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4
793 ADD $64, R5
794 VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order
795 VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order
796
797
798 VSBCBIQ V1, V9, V0, V25
799 VSBIQ V1, V9, V0, V17
800 VZERO V9 // y has been subtracted once; every later subtrahend is 0 plus borrow
801 VSBCBIQ V2, V9, V25, V26
802 VSBIQ V2, V9, V25, V18
803
804 VLM 0(R5), V5, V6 // 32-bytes into V5..V6
805 ADD $32, R5
806
807 VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order
808 VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order
809
810
811 VSBCBIQ V3, V9, V26, V27
812 VSBIQ V3, V9, V26, V19
813 VSBCBIQ V4, V9, V27, V28
814 VSBIQ V4, V9, V27, V20
815
816 VLM 0(R5), V7, V8 // 32-bytes into V7..V8
817 ADD $32, R5
818
819 VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order
820 VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order
821
822 VSBCBIQ V5, V9, V28, V29
823 VSBIQ V5, V9, V28, V21
824 VSBCBIQ V6, V9, V29, V30
825 VSBIQ V6, V9, V29, V22
826
827 VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order
828 VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order
829
830 VSBCBIQ V7, V9, V30, V31
831 VSBIQ V7, V9, V30, V23
832 VSBCBIQ V8, V9, V31, V0 // V0 has carry-over
833 VSBIQ V8, V9, V31, V24
834
// Swap the doublewords of each difference back to memory order before storing.
835 VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order
836 VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order
837 VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order
838 VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order
839 VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order
840 VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order
841 VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order
842 VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order
843 VSTM V17, V24, 0(R7) // 128-bytes into z
844 ADD $128, R7
845 ADD $128, R10 // i += 16 words (128 bytes)
846 SUB $16, R3 // n -= 16
847 BGE UU1 // if n >= 0 goto UU1
848 VLGVG $1, V0, R4 // put cf into R4 in case we branch to v11
849 SUB $1, R4 // save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1
850 NEG R4, R4 // R4 = borrow as 0 or 1, the word to subtract next
851 A11: ADD $12, R3 // n += 12 (undo the extra bias; n is now remaining-4)
852
853 BLT v11 // if n < 0 goto v11
854
855 // n >= 0
856 // regular loop body unrolled 4x
857
858 U4: // n >= 0
859 // regular loop body unrolled 4x
860 MOVD 0(R8)(R10*1), R5
861 MOVD 8(R8)(R10*1), R6
862 MOVD 16(R8)(R10*1), R7
863 MOVD 24(R8)(R10*1), R1
864 SUBC R4, R5 //SLGR -> SUBC
865 SUBE R0, R6 //SLBGR -> SUBE
866 SUBE R0, R7
867 SUBE R0, R1
868 SUBE R4, R4 // save CF
869 NEG R4, R4 // R4 = borrow as 0 or 1
870 MOVD R5, 0(R2)(R10*1)
871 MOVD R6, 8(R2)(R10*1)
872 MOVD R7, 16(R2)(R10*1)
873 MOVD R1, 24(R2)(R10*1)
874
875 ADD $32, R10 // i += 4 -> i +=32
876 SUB $4, R3 // n -= 4
877 BGE U4 // if n >= 0 goto U4
878
879 v11: ADD $4, R3 // n += 4
880 BLE E11 // if n <= 0 goto E11
881
882 L4: // n > 0
883
884 MOVD 0(R8)(R10*1), R5
885 SUBC R4, R5 // x[i] -= c, sets the borrow flag
886 SUBE R4, R4 // save CF
887 NEG R4, R4 // R4 = borrow as 0 or 1
888 MOVD R5, 0(R2)(R10*1)
889
890 ADD $8, R10 // i++
891 SUB $1, R3 // n--
892 BGT L4 // if n > 0 goto L4
893
894 E11: MOVD R4, c+56(FP) // return c
895
896 RET
897
898 //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
899 // func subVW(z, x []Word, y Word) (c Word)
900 // (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
//
// subVW_novec is the subVW implementation that ·subVW_check installs when
// ·hasVX is not 1. It computes z = x - y, where y is a single word subtracted
// at the lowest index and the borrow is propagated upward, four words per
// iteration (U4) and then one word at a time (L4). The final borrow is
// stored in c.
// R4 holds the word still to be subtracted: y at first, afterwards the
// borrow (0/1).
901 TEXT ·subVW_novec(SB),NOSPLIT,$0
902 MOVD z_len+8(FP), R3
903 MOVD x+24(FP), R8
904 MOVD y+48(FP), R4 // c = y
905 MOVD z+0(FP), R2
906 MOVD $0, R0 // make sure it's 0
907 MOVD $0, R10 // i = 0
908
909 // replace BLT with BR below to disable the unrolled loop
910 SUB $4, R3 // n -= 4
911 BLT v4 // if n < 0 (fewer than 4 words) goto v4
912
913 U4: // n >= 0
914 // regular loop body unrolled 4x
915 MOVD 0(R8)(R10*1), R5
916 MOVD 8(R8)(R10*1), R6
917 MOVD 16(R8)(R10*1), R7
918 MOVD 24(R8)(R10*1), R1
919 SUBC R4, R5 //SLGR -> SUBC
920 SUBE R0, R6 //SLBGR -> SUBE
921 SUBE R0, R7
922 SUBE R0, R1
923 SUBE R4, R4 // save CF
924 NEG R4, R4 // R4 = borrow as 0 or 1
925 MOVD R5, 0(R2)(R10*1)
926 MOVD R6, 8(R2)(R10*1)
927 MOVD R7, 16(R2)(R10*1)
928 MOVD R1, 24(R2)(R10*1)
929
930 ADD $32, R10 // i += 4 -> i +=32
931 SUB $4, R3 // n -= 4
932 BGE U4 // if n >= 0 goto U4
933
934 v4: ADD $4, R3 // n += 4
935 BLE E4 // if n <= 0 goto E4
936
937 L4: // n > 0
938 MOVD 0(R8)(R10*1), R5
939 SUBC R4, R5 // x[i] -= c, sets the borrow flag
940 SUBE R4, R4 // save CF
941 NEG R4, R4 // R4 = borrow as 0 or 1
942 MOVD R5, 0(R2)(R10*1)
943
944 ADD $8, R10 // i++
945 SUB $1, R3 // n--
946 BGT L4 // if n > 0 goto L4
947
948 E4: MOVD R4, c+56(FP) // return c
949
950 RET
951
// func shlVU(z, x []Word, s uint) (c Word)
//
// shlVU shifts the len(z)-word vector x left by s bits into z and stores the
// bits shifted out of the top word in c. s == 0 and s == 64 are handled by
// dedicated copy paths; for len(z) == 0 only c = 0 is stored.
//
// Every path walks from the highest index down to index 0, so z may overlap x
// at the same or a higher address without source words being overwritten
// before they are read. (The previous s == 0 path copied from index 0 upward
// and corrupted the data when z started two or more words above x.)
//
// Registers: R2 = &z[0], R8 = &x[0], R5 = byte offset of the current word,
// R4 = s, R7 = 64-s (ŝ), R10 = w1, R3/R6 = scratch, R0 = 0.
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVD	z_len+8(FP), R5
	MOVD	$0, R0
	SUB	$1, R5			// n--
	BLT	X8b			// n < 0 (n <= 0)

	// n > 0
	MOVD	s+48(FP), R4
	CMPBEQ	R0, R4, Z80		// s == 0: plain copy
	MOVD	$64, R6
	CMPBEQ	R6, R4, Z864		// s == 64: move by one whole word
	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	SUB	R4, R6, R7		// R7 = 64 - s (ŝ)
	MOVD	(R8)(R5*1), R10		// w1 = x[n-1]
	SRD	R7, R10, R3
	MOVD	R3, c+56(FP)		// c = w1>>ŝ
	BR	E8

	// i > 0
L8:	MOVD	R10, R3			// w = w1
	MOVD	-8(R8)(R5*1), R10	// w1 = x[i-1]

	SLD	R4, R3			// w<<s
	SRD	R7, R10, R6		// w1>>ŝ
	OR	R6, R3
	MOVD	R3, (R2)(R5*1)		// z[i] = w<<s | w1>>ŝ
	SUB	$8, R5			// i--

E8:	CMPBGT	R5, R0, L8		// while i > 0

	// i == 0
X8a:	SLD	R4, R10			// w1<<s
	MOVD	R10, (R2)		// z[0] = w1<<s
	RET

X8b:	MOVD	R0, c+56(FP)		// empty vector: c = 0
	RET

	// s == 0: z[i] = x[i], copied from the top word down so that an
	// overlapping z at a higher address than x is handled correctly.
Z80:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	R0, c+56(FP)		// nothing is shifted out: c = 0

L8Z:	MOVD	(R8)(R5*1), R3
	MOVD	R3, (R2)(R5*1)		// z[i] = x[i]
	SUB	$8, R5			// i--
	CMPBGE	R5, R0, L8Z		// while i >= 0
	RET

	// s == 64: z[i] = x[i-1], z[0] = 0, c = x[n-1]
Z864:	MOVD	z+0(FP), R2
	MOVD	x+24(FP), R8
	SLD	$3, R5			// R5 = (n-1)*8
	MOVD	(R8)(R5*1), R3		// w1 = x[n-1]
	MOVD	R3, c+56(FP)		// c = x[n-1]
	BR	E864

	// i > 0
L864:	MOVD	-8(R8)(R5*1), R3
	MOVD	R3, (R2)(R5*1)		// z[i] = x[i-1]
	SUB	$8, R5			// i--

E864:	CMPBGT	R5, R0, L864		// while i > 0

	MOVD	R0, (R2)		// z[0] = 0
	RET
1037
1038
1039 // CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6
1040 // func shrVU(z, x []Word, s uint) (c Word)
//
// shrVU shifts the len(z)-word vector x right by s bits into z and stores the
// bits shifted out of x[0] in c. s == 0 and s == 64 are handled by the
// dedicated copy paths ZB0 and ZB64; for len(z) == 0 only c = 0 is stored.
// All paths walk from index 0 upward and load x[i+1] before storing z[i].
// Registers: R2 = &z[0], R8 = &x[0], R1 = byte offset i*8, R5 = (n-1)*8,
// R4 = s, R7 = 64-s, R10 = w1, R3/R6 = scratch, R0 = 0.
1041 TEXT ·shrVU(SB),NOSPLIT,$0
1042 MOVD z_len+8(FP), R5
1043 MOVD $0, R0
1044 SUB $1, R5 // n--
1045 BLT X9b // n < 0 (n <= 0)
1046
1047 // n > 0
1048 MOVD s+48(FP), R4
1049 CMPBEQ R0, R4, ZB0 //handle 0 case beq
1050 MOVD $64, R6
1051 CMPBEQ R6, R4, ZB64 //handle 64 case beq
1052 MOVD z+0(FP), R2
1053 MOVD x+24(FP), R8
1054 SLD $3, R5 // n = n*8
1055 SUB R4, R6, R7 // R7 = 64 - s
1056 MOVD (R8), R10 // w1 = x[0]
1057 SLD R7, R10, R3
1058 MOVD R3, c+56(FP) // c = w1<<(64-s)
1059
1060 MOVD $0, R1 // i = 0
1061 BR E9
1062
1063 // i < n-1
1064 L9: MOVD R10, R3 // w = w1
1065 MOVD 8(R8)(R1*1), R10 // w1 = x[i+1]
1066
1067 SRD R4, R3 // w>>s
1068 SLD R7, R10, R6 // w1<<(64-s)
1069 OR R6, R3
1070 MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<<(64-s)
1071 ADD $8, R1 // i++
1072
1073 E9: CMPBLT R1, R5, L9 // i < n-1
1074
1075 // i >= n-1
1076 X9a: SRD R4, R10 // w1>>s
1077 MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s
1078 RET
1079
1080 X9b: MOVD R0, c+56(FP) // empty vector: c = 0
1081 RET
1082
// s == 0: z[i] = x[i], c = 0
1083 ZB0: MOVD z+0(FP), R2
1084 MOVD x+24(FP), R8
1085 SLD $3, R5 // n = n*8
1086
1087 MOVD (R8), R10 // w1 = x[0]
1088 MOVD $0, R3 // nothing is shifted out
1089 MOVD R3, c+56(FP) // c = 0
1090
1091 MOVD $0, R1 // i = 0
1092 BR E9Z
1093
1094 // i < n-1
1095 L9Z: MOVD R10, R3 // w = w1
1096 MOVD 8(R8)(R1*1), R10 // w1 = x[i+1]
1097
1098 MOVD R3, (R2)(R1*1) // z[i] = w (= x[i])
1099 ADD $8, R1 // i++
1100
1101 E9Z: CMPBLT R1, R5, L9Z // i < n-1
1102
1103 // i >= n-1
1104 MOVD R10, (R2)(R5*1) // z[n-1] = w1 (= x[n-1])
1105 RET
1106
// s == 64: z[i] = x[i+1], z[n-1] = 0, c = x[0]
1107 ZB64: MOVD z+0(FP), R2
1108 MOVD x+24(FP), R8
1109 SLD $3, R5 // n = n*8
1110 MOVD (R8), R3 // w1 = x[0]
1111 MOVD R3, c+56(FP) // c = x[0]
1112
1113 MOVD $0, R1 // i = 0
1114 BR E964
1115
1116 // i < n-1
1117 L964: MOVD 8(R8)(R1*1), R3 // w1 = x[i+1]
1118
1119 MOVD R3, (R2)(R1*1) // z[i] = x[i+1]
1120 ADD $8, R1 // i++
1121
1122 E964: CMPBLT R1, R5, L964 // i < n-1
1123
1124 // i >= n-1
1125 MOVD $0, R10 // top word becomes zero
1126 MOVD R10, (R2)(R5*1) // z[n-1] = 0
1127 RET
1128
1129 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i
1130 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// For each i in [0, len(z)): multiplies x[i] by y, adds the running carry c
// (initially r) to the low half, stores that as z[i], and keeps the high half
// plus the addition carry as the next c. The final c is returned.
// NOTE(review): R11 is never an explicit operand of MULHDU. The code relies
// on the assembler's expansion leaving the low 64 bits of the product in R11
// and the high 64 bits in R6 — confirm for the toolchain version in use.
// R10 is not referenced here but presumably is clobbered by that expansion.
1131 TEXT ·mulAddVWW(SB),NOSPLIT,$0
1132 MOVD z+0(FP), R2
1133 MOVD x+24(FP), R8
1134 MOVD y+48(FP), R9
1135 MOVD r+56(FP), R4 // c = r
1136 MOVD z_len+8(FP), R5
1137 MOVD $0, R1 // i*8 = 0 (byte offset)
1138 MOVD $0, R7 // i = 0
1139 MOVD $0, R0 // make sure it's zero
1140 BR E5
1141
1142 L5: MOVD (R8)(R1*1), R6 // R6 = x[i]
1143 MULHDU R9, R6 // R6 = high half of x[i]*y; low half expected in R11 (see note)
1144 ADDC R4, R11 //add to low order bits
1145 ADDE R0, R6 // fold the carry into the high half
1146 MOVD R11, (R2)(R1*1) // z[i] = low half + c
1147 MOVD R6, R4 // c = high half
1148 ADD $8, R1 // i*8 + 8
1149 ADD $1, R7 // i++
1150
1151 E5: CMPBLT R7, R5, L5 // i < n
1152
1153 MOVD R4, c+64(FP)
1154 RET
1155
1156 // func addMulVVW(z, x []Word, y Word) (c Word)
1157 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i
//
// For each i in [0, len(z)): computes x[i]*y + z[i] + c, stores the low word
// in z[i] and keeps the high word as the next c (c starts at 0). The final c
// is returned. The A6 loop handles two words per iteration up to the largest
// even index bound (R12 = len(z) &^ 1); L6 handles a remaining word.
// NOTE(review): R11 is never an explicit operand of MULHDU. The code relies
// on the assembler's expansion leaving the low 64 bits of the product in R11
// and the high 64 bits in R6 — confirm for the toolchain version in use.
// R10 is reloaded with z[i] only after each MULHDU.
1158 TEXT ·addMulVVW(SB),NOSPLIT,$0
1159 MOVD z+0(FP), R2
1160 MOVD x+24(FP), R8
1161 MOVD y+48(FP), R9
1162 MOVD z_len+8(FP), R5
1163
1164 MOVD $0, R1 // i*8 = 0
1165 MOVD $0, R7 // i = 0
1166 MOVD $0, R0 // make sure it's zero
1167 MOVD $0, R4 // c = 0
1168
1169 MOVD R5, R12
1170 AND $-2, R12 // R12 = n rounded down to an even count
1171 CMPBGE R5, $2, A6 // at least two words: use the 2x unrolled loop
1172 BR E6
1173
1174 A6: MOVD (R8)(R1*1), R6 // R6 = x[i]
1175 MULHDU R9, R6 // R6 = high half of x[i]*y; low half expected in R11 (see note)
1176 MOVD (R2)(R1*1), R10 // R10 = z[i]
1177 ADDC R10, R11 //add to low order bits
1178 ADDE R0, R6
1179 ADDC R4, R11 // add the incoming carry word
1180 ADDE R0, R6
1181 MOVD R6, R4 // c = high half
1182 MOVD R11, (R2)(R1*1) // z[i] = low half
1183
1184 MOVD (8)(R8)(R1*1), R6 // R6 = x[i+1]
1185 MULHDU R9, R6
1186 MOVD (8)(R2)(R1*1), R10 // R10 = z[i+1]
1187 ADDC R10, R11 //add to low order bits
1188 ADDE R0, R6
1189 ADDC R4, R11 // add the incoming carry word
1190 ADDE R0, R6
1191 MOVD R6, R4 // c = high half
1192 MOVD R11, (8)(R2)(R1*1) // z[i+1] = low half
1193
1194 ADD $16, R1 // i*8 + 16
1195 ADD $2, R7 // i += 2
1196
1197 CMPBLT R7, R12, A6 // more pairs remaining
1198 BR E6
1199
1200 L6: MOVD (R8)(R1*1), R6 // R6 = x[i]
1201 MULHDU R9, R6
1202 MOVD (R2)(R1*1), R10 // R10 = z[i]
1203 ADDC R10, R11 //add to low order bits
1204 ADDE R0, R6
1205 ADDC R4, R11 // add the incoming carry word
1206 ADDE R0, R6
1207 MOVD R6, R4 // c = high half
1208 MOVD R11, (R2)(R1*1) // z[i] = low half
1209
1210 ADD $8, R1 // i*8 + 8
1211 ADD $1, R7 // i++
1212
1213 E6: CMPBLT R7, R5, L6 // i < n
1214
1215 MOVD R4, c+56(FP)
1216 RET
1217
1218 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
1219 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i
//
// Divides the multi-word value (xn, x[len(z)-1], ..., x[0]) by y, walking
// from the highest index down to 0. At each step the R10/R11 pair holds
// (running remainder, x[i]); the hand-encoded DLGR leaves the quotient in
// R11 (stored to z[i]) and the remainder in R10, which feeds the next step.
// The final remainder is returned in r.
// No check for y == 0 or quotient overflow is made in this routine.
1220 TEXT ·divWVW(SB),NOSPLIT,$0
1221 MOVD z+0(FP), R2
1222 MOVD xn+24(FP), R10 // r = xn
1223 MOVD x+32(FP), R8
1224 MOVD y+56(FP), R9
1225 MOVD z_len+8(FP), R7 // i = len(z)
1226 SLD $3, R7, R1 // i*8
1227 MOVD $0, R0 // make sure it's zero
1228 BR E7
1229
1230 L7: MOVD (R8)(R1*1), R11 // low word of the dividend = x[i]
1231 WORD $0xB98700A9 //DLGR R10,R9
1232 MOVD R11, (R2)(R1*1) // z[i] = quotient
1233
1234 E7: SUB $1, R7 // i--
1235 SUB $8, R1 // byte offset follows i; its sign drives the branch
1236 BGE L7 // i >= 0
1237
1238 MOVD R10, r+64(FP)
1239 RET
View as plain text