...

Text file src/pkg/math/big/arith_ppc64x.s

     1	// Copyright 2013 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// func mulWW(x, y Word) (z1, z0 Word) -- z1:z0 = full 128-bit product x*y
    13	TEXT ·mulWW(SB), NOSPLIT, $0
    14		MOVD   y+8(FP), R9        // R9 = y
    15		MOVD   x+0(FP), R8        // R8 = x
    16		MULLD  R8, R9, R10        // R10 = low 64 bits of x*y
    17		MULHDU R8, R9, R11        // R11 = high 64 bits of x*y (unsigned)
    18		MOVD   R10, z0+24(FP)     // z0 = low word
    19		MOVD   R11, z1+16(FP)     // z1 = high word
    20		RET
    21	
    22	// func addVV(z, x, y []Word) (c Word)
    23	// z[i] = x[i] + y[i] for all i, carrying; the final carry-out is returned in c
    24	TEXT ·addVV(SB), NOSPLIT, $0
    25		MOVD  z_len+8(FP), R7   // R7 = z_len
    26		MOVD  x+24(FP), R8      // R8 = x[]
    27		MOVD  y+48(FP), R9      // R9 = y[]
    28		MOVD  z+0(FP), R10      // R10 = z[]
    29	
    30		// If z_len = 0, we are done (c = 0 is returned)
    31		CMP   R0, R7
    32		MOVD  R0, R4            // R4 = c = 0 (R0 is used as the constant zero)
    33		BEQ   done
    34	
    35		// Process the first iteration out of the loop so we can
    36		// use MOVDU and avoid 3 index registers updates.
    37		MOVD  0(R8), R11      // R11 = x[0]
    38		MOVD  0(R9), R12      // R12 = y[0]
    39		ADD   $-1, R7         // R7 = z_len - 1 (elements still to process)
    40		ADDC  R12, R11, R15   // R15 = x[0] + y[0], set CA
    41		CMP   R0, R7
    42		MOVD  R15, 0(R10)     // z[0]
    43		BEQ   final          // If z_len was 1, we are done
    44	
    45		SRD   $2, R7, R5      // R5 = (z_len-1)/4 = number of 4-element iterations
    46		CMP   R0, R5
    47		MOVD  R5, CTR         // Set up loop counter
    48		BEQ   tail            // If R5 = 0, we can't use the loop
    49	
    50		// Process 4 elements per iteration. Unrolling this loop
    51		// means a performance trade-off: we will lose performance
    52		// for small values of z_len (0.90x in the worst case), but
    53		// gain significant performance as z_len increases (up to
    54		// 1.45x).
    55	loop:
    56		MOVD  8(R8), R11      // R11 = x[i]
    57		MOVD  16(R8), R12     // R12 = x[i+1]
    58		MOVD  24(R8), R14     // R14 = x[i+2]
    59		MOVDU 32(R8), R15     // R15 = x[i+3]; R8 advances by 32 bytes
    60		MOVD  8(R9), R16      // R16 = y[i]
    61		MOVD  16(R9), R17     // R17 = y[i+1]
    62		MOVD  24(R9), R18     // R18 = y[i+2]
    63		MOVDU 32(R9), R19     // R19 = y[i+3]; R9 advances by 32 bytes
    64		ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    65		ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    66		ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    67		ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    68		MOVD  R20, 8(R10)     // z[i]
    69		MOVD  R21, 16(R10)    // z[i+1]
    70		MOVD  R22, 24(R10)    // z[i+2]
    71		MOVDU R23, 32(R10)    // z[i+3]; R10 advances by 32 bytes
    72		ADD   $-4, R7         // R7 -= 4 (elements remaining)
    73		BC  16, 0, loop       // bdnz: decrement CTR, loop while CTR != 0
    74	
    75		// Up to 3 elements may remain: R7 = (z_len-1) mod 4
    76		CMP   R0, R7
    77		BEQ   final
    78	
    79		// Process the remaining elements, one at a time
    80	tail:
    81		MOVDU 8(R8), R11      // R11 = x[i]; R8 advances by 8 bytes
    82		MOVDU 8(R9), R16      // R16 = y[i]; R9 advances by 8 bytes
    83		ADD   $-1, R7         // one fewer element remaining; ADD does not modify CA
    84		ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    85		CMP   R0, R7
    86		MOVDU R20, 8(R10)     // z[i]; R10 advances by 8 bytes
    87		BEQ   final           // If R7 = 0, we are done
    88	
    89		MOVDU 8(R8), R11      // second leftover element: R11 = x[i]
    90		MOVDU 8(R9), R16      // R16 = y[i]
    91		ADD   $-1, R7         // one fewer element remaining
    92		ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    93		CMP   R0, R7
    94		MOVDU R20, 8(R10)     // z[i]
    95		BEQ   final           // If R7 = 0, we are done
    96	
    97		MOVD  8(R8), R11      // third and last leftover element: R11 = x[i]
    98		MOVD  8(R9), R16      // R16 = y[i]
    99		ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
   100		MOVD  R20, 8(R10)     // z[i]; pointers need no further update
   101	
   102	final:
   103		ADDZE R4              // R4 = 0 + CA: capture the final carry
   104	
   105	done:
   106		MOVD  R4, c+72(FP)    // return c
   107		RET
   108	
   109	// func subVV(z, x, y []Word) (c Word)
   110	// z[i] = x[i] - y[i] for all i, borrowing; the final borrow (0 or 1) is returned in c
   111	TEXT ·subVV(SB), NOSPLIT, $0
   112		MOVD  z_len+8(FP), R7 // R7 = z_len
   113		MOVD  x+24(FP), R8    // R8 = x[]
   114		MOVD  y+48(FP), R9    // R9 = y[]
   115		MOVD  z+0(FP), R10    // R10 = z[]
   116	
   117		// If z_len = 0, we are done (c = 0 is returned)
   118		CMP   R0, R7
   119		MOVD  R0, R4          // R4 = c = 0 (R0 is used as the constant zero)
   120		BEQ   done
   121	
   122		// Process the first iteration out of the loop so we can
   123		// use MOVDU and avoid 3 index registers updates.
   124		MOVD  0(R8), R11      // R11 = x[0]
   125		MOVD  0(R9), R12      // R12 = y[0]
   126		ADD   $-1, R7         // R7 = z_len - 1 (elements still to process)
   127		SUBC  R12, R11, R15   // R15 = x[0] - y[0], set CA (CA = 1 means no borrow)
   128		CMP   R0, R7
   129		MOVD  R15, 0(R10)     // z[0]
   130		BEQ   final           // If z_len was 1, we are done
   131	
   132		SRD   $2, R7, R5      // R5 = (z_len-1)/4 = number of 4-element iterations
   133		CMP   R0, R5
   134		MOVD  R5, CTR         // Set up loop counter
   135		BEQ   tail            // If R5 = 0, we can't use the loop
   136	
   137		// Process 4 elements per iteration. Unrolling this loop
   138		// means a performance trade-off: we will lose performance
   139		// for small values of z_len (0.92x in the worst case), but
   140		// gain significant performance as z_len increases (up to
   141		// 1.45x).
   142	loop:
   143		MOVD  8(R8), R11      // R11 = x[i]
   144		MOVD  16(R8), R12     // R12 = x[i+1]
   145		MOVD  24(R8), R14     // R14 = x[i+2]
   146		MOVDU 32(R8), R15     // R15 = x[i+3]; R8 advances by 32 bytes
   147		MOVD  8(R9), R16      // R16 = y[i]
   148		MOVD  16(R9), R17     // R17 = y[i+1]
   149		MOVD  24(R9), R18     // R18 = y[i+2]
   150		MOVDU 32(R9), R19     // R19 = y[i+3]; R9 advances by 32 bytes
   151		SUBE  R16, R11, R20   // R20 = x[i] - y[i] - 1 + CA
   152		SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] - 1 + CA
   153		SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] - 1 + CA
   154		SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] - 1 + CA
   155		MOVD  R20, 8(R10)     // z[i]
   156		MOVD  R21, 16(R10)    // z[i+1]
   157		MOVD  R22, 24(R10)    // z[i+2]
   158		MOVDU R23, 32(R10)    // z[i+3]; R10 advances by 32 bytes
   159		ADD   $-4, R7         // R7 -= 4 (elements remaining)
   160		BC  16, 0, loop       // bdnz: decrement CTR, loop while CTR != 0
   161	
   162		// Up to 3 elements may remain: R7 = (z_len-1) mod 4
   163		CMP   R0, R7
   164		BEQ   final
   165	
   166		// Process the remaining elements, one at a time
   167	tail:
   168		MOVDU 8(R8), R11      // R11 = x[i]; R8 advances by 8 bytes
   169		MOVDU 8(R9), R16      // R16 = y[i]; R9 advances by 8 bytes
   170		ADD   $-1, R7         // one fewer element remaining; ADD does not modify CA
   171		SUBE  R16, R11, R20   // R20 = x[i] - y[i] - 1 + CA
   172		CMP   R0, R7
   173		MOVDU R20, 8(R10)     // z[i]; R10 advances by 8 bytes
   174		BEQ   final           // If R7 = 0, we are done
   175	
   176		MOVDU 8(R8), R11      // second leftover element: R11 = x[i]
   177		MOVDU 8(R9), R16      // R16 = y[i]
   178		ADD   $-1, R7         // one fewer element remaining
   179		SUBE  R16, R11, R20   // R20 = x[i] - y[i] - 1 + CA
   180		CMP   R0, R7
   181		MOVDU R20, 8(R10)     // z[i]
   182		BEQ   final           // If R7 = 0, we are done
   183	
   184		MOVD  8(R8), R11      // third and last leftover element: R11 = x[i]
   185		MOVD  8(R9), R16      // R16 = y[i]
   186		SUBE  R16, R11, R20   // R20 = x[i] - y[i] - 1 + CA
   187		MOVD  R20, 8(R10)     // z[i]; pointers need no further update
   188	
   189	final:
   190		ADDZE R4              // R4 = 0 + CA
   191		XOR   $1, R4          // c = 1 - CA: CA clear means a borrow occurred
   192	
   193	done:
   194		MOVD  R4, c+72(FP)    // return c
   195		RET
   196	
   197	// func addVW(z, x []Word, y Word) (c Word) -- z = x + y (single word y), c = carry-out
   198	TEXT ·addVW(SB), NOSPLIT, $0
   199		MOVD z+0(FP), R10	// R10 = z[]
   200		MOVD x+24(FP), R8	// R8 = x[]
   201		MOVD y+48(FP), R4	// R4 = y = c
   202		MOVD z_len+8(FP), R11	// R11 = z_len
   203	
   204		CMP   R0, R11		// If z_len is zero, return
   205		BEQ   done		// (c = y is returned unchanged)
   206	
   207		// We will process the first iteration out of the loop so we capture
   208		// the value of c. In the subsequent iterations, we will rely on the
   209		// value of CA set here.
   210		MOVD  0(R8), R20	// R20 = x[0]
   211		ADD   $-1, R11		// R11 = z_len - 1 (elements still to process)
   212		ADDC  R20, R4, R6	// R6 = x[0] + c, set CA
   213		CMP   R0, R11		// If z_len was 1, we are done
   214		MOVD  R6, 0(R10)	// z[0]
   215		BEQ   final
   216	
   217		// We will read 4 elements per iteration
   218		SRD   $2, R11, R9	// R9 = (z_len-1)/4 = number of 4-element iterations
   219		DCBT  (R8)		// cache-touch hint for the block at x
   220		CMP   R0, R9
   221		MOVD  R9, CTR		// Set up the loop counter
   222		BEQ   tail		// If R9 = 0, we can't use the loop
   223	
   224	loop:
   225		MOVD  8(R8), R20	// R20 = x[i]
   226		MOVD  16(R8), R21	// R21 = x[i+1]
   227		MOVD  24(R8), R22	// R22 = x[i+2]
   228		MOVDU 32(R8), R23	// R23 = x[i+3]; R8 advances by 32 bytes
   229		ADDZE R20, R24		// R24 = x[i] + CA
   230		ADDZE R21, R25		// R25 = x[i+1] + CA
   231		ADDZE R22, R26		// R26 = x[i+2] + CA
   232		ADDZE R23, R27		// R27 = x[i+3] + CA
   233		MOVD  R24, 8(R10)	// z[i]
   234		MOVD  R25, 16(R10)	// z[i+1]
   235		MOVD  R26, 24(R10)	// z[i+2]
   236		MOVDU R27, 32(R10)	// z[i+3]; R10 advances by 32 bytes
   237		ADD   $-4, R11		// R11 -= 4 (elements remaining)
   238		BC    16, 0, loop	// bdnz: decrement CTR, loop while CTR != 0
   239	
   240		// Up to 3 elements may remain: R11 = (z_len-1) mod 4
   241		CMP R0, R11
   242		BEQ final
   243	
   244	tail:
   245		MOVDU 8(R8), R20	// R20 = x[i]; R8 advances by 8 bytes
   246		ADDZE R20, R24		// R24 = x[i] + CA
   247		ADD $-1, R11		// one fewer element remaining; ADD does not modify CA
   248		MOVDU R24, 8(R10)	// z[i]; R10 advances by 8 bytes
   249		CMP R0, R11
   250		BEQ final		// If R11 = 0, we are done
   251	
   252		MOVDU 8(R8), R20	// second leftover element: R20 = x[i]
   253		ADDZE R20, R24		// R24 = x[i] + CA
   254		ADD $-1, R11		// one fewer element remaining
   255		MOVDU R24, 8(R10)	// z[i]
   256		CMP R0, R11
   257		BEQ final		// If R11 = 0, we are done
   258	
   259		MOVD 8(R8), R20		// third and last leftover element: R20 = x[i]
   260		ADDZE R20, R24		// R24 = x[i] + CA
   261		MOVD R24, 8(R10)	// z[i]
   262	
   263	final:
   264		ADDZE R0, R4		// c = 0 + CA
   265	done:
   266		MOVD  R4, c+56(FP)	// return c
   267		RET
   268	
   269	// func subVW(z, x []Word, y Word) (c Word) -- z = x - y (single word y), c = borrow-out
   270	TEXT ·subVW(SB), NOSPLIT, $0
   271		MOVD  z+0(FP), R10	// R10 = z[]
   272		MOVD  x+24(FP), R8	// R8 = x[]
   273		MOVD  y+48(FP), R4	// R4 = y = c
   274		MOVD  z_len+8(FP), R11	// R11 = z_len
   275	
   276		CMP   R0, R11		// If z_len is zero, return
   277		BEQ   done		// (c = y is returned unchanged)
   278	
   279		// We will process the first iteration out of the loop so we capture
   280		// the value of c. In the subsequent iterations, we will rely on the
   281		// value of CA set here.
   282		MOVD  0(R8), R20	// R20 = x[0]
   283		ADD   $-1, R11		// R11 = z_len - 1 (elements still to process)
   284		SUBC  R4, R20, R6	// R6 = x[0] - c, set CA (CA = 1 means no borrow)
   285		CMP   R0, R11		// If z_len was 1, we are done
   286		MOVD  R6, 0(R10)	// z[0]
   287		BEQ   final
   288	
   289		// We will read 4 elements per iteration
   290		SRD   $2, R11, R9	// R9 = (z_len-1)/4 = number of 4-element iterations
   291		DCBT  (R8)		// cache-touch hint for the block at x
   292		CMP   R0, R9
   293		MOVD  R9, CTR		// Set up the loop counter
   294		BEQ   tail		// If R9 = 0, we can't use the loop
   295	
   296		// The loop here is almost the same as the one used in s390x, but
   297		// we don't need to capture CA every iteration because we've already
   298		// done that above.
   299	loop:
   300		MOVD  8(R8), R20	// R20 = x[i]
   301		MOVD  16(R8), R21	// R21 = x[i+1]
   302		MOVD  24(R8), R22	// R22 = x[i+2]
   303		MOVDU 32(R8), R23	// R23 = x[i+3]; R8 advances by 32 bytes
   304		SUBE  R0, R20		// R20 = x[i] - 0 - 1 + CA (propagate borrow)
   305		SUBE  R0, R21		// R21 = x[i+1] - 1 + CA
   306		SUBE  R0, R22		// R22 = x[i+2] - 1 + CA
   307		SUBE  R0, R23		// R23 = x[i+3] - 1 + CA
   308		MOVD  R20, 8(R10)	// z[i]
   309		MOVD  R21, 16(R10)	// z[i+1]
   310		MOVD  R22, 24(R10)	// z[i+2]
   311		MOVDU R23, 32(R10)	// z[i+3]; R10 advances by 32 bytes
   312		ADD   $-4, R11		// R11 -= 4 (elements remaining)
   313		BC    16, 0, loop	// bdnz: decrement CTR, loop while CTR != 0
   314	
   315		// Up to 3 elements may remain: R11 = (z_len-1) mod 4
   316		CMP   R0, R11
   317		BEQ   final
   318	
   319	tail:
   320		MOVDU 8(R8), R20	// R20 = x[i]; R8 advances by 8 bytes
   321		SUBE  R0, R20		// R20 = x[i] - 1 + CA
   322		ADD   $-1, R11		// one fewer element remaining; ADD does not modify CA
   323		MOVDU R20, 8(R10)	// z[i]; R10 advances by 8 bytes
   324		CMP   R0, R11
   325		BEQ   final		// If R11 = 0, we are done
   326	
   327		MOVDU 8(R8), R20	// second leftover element: R20 = x[i]
   328		SUBE  R0, R20		// R20 = x[i] - 1 + CA
   329		ADD   $-1, R11		// one fewer element remaining
   330		MOVDU R20, 8(R10)	// z[i]
   331		CMP   R0, R11
   332		BEQ   final		// If R11 = 0, we are done
   333	
   334		MOVD  8(R8), R20	// third and last leftover element: R20 = x[i]
   335		SUBE  R0, R20		// R20 = x[i] - 1 + CA
   336		MOVD  R20, 8(R10)	// z[i]
   337	
   338	final:
   339		// Capture CA as a borrow: c = 1 - CA
   340		SUBE  R4, R4		// R4 = R4 - R4 - 1 + CA = CA - 1 (0 or -1)
   341		NEG   R4, R4		// R4 = 1 - CA (1 if a borrow is pending, else 0)
   342	
   343	done:
   344		MOVD  R4, c+56(FP)	// return c
   345		RET
   346	
   347	TEXT ·shlVU(SB), NOSPLIT, $0	// no assembly version in this file
   348		BR ·shlVU_g(SB)		// branch straight to shlVU_g (defined outside this file)
   349	
   350	TEXT ·shrVU(SB), NOSPLIT, $0	// no assembly version in this file
   351		BR ·shrVU_g(SB)		// branch straight to shrVU_g (defined outside this file)
   352	
   353	// func mulAddVWW(z, x []Word, y, r Word) (c Word) -- z = x*y + r, c = final carry word
   354	TEXT ·mulAddVWW(SB), NOSPLIT, $0
   355		MOVD    z+0(FP), R10      // R10 = z[]
   356		MOVD    x+24(FP), R8      // R8 = x[]
   357		MOVD    y+48(FP), R9      // R9 = y
   358		MOVD    r+56(FP), R4      // R4 = r = c
   359		MOVD    z_len+8(FP), R11  // R11 = z_len
   360	
   361		CMP     R0, R11           // If z_len is zero, return c = r
   362		BEQ     done
   363	
   364		MOVD    0(R8), R20        // R20 = x[0]
   365		ADD     $-1, R11          // R11 = z_len - 1 (elements still to process)
   366		MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   367		MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   368		ADDC    R4, R6            // R6 = z0 + r, set CA
   369		ADDZE   R7                // R7 = z1 + CA
   370		CMP     R0, R11
   371		MOVD    R7, R4            // R4 = c
   372		MOVD    R6, 0(R10)        // z[i]
   373		BEQ     done              // If z_len was 1, we are done
   374	
   375		// We will read 4 elements per iteration
   376		SRD     $2, R11, R14      // R14 = (z_len-1)/4 = number of 4-element iterations
   377		DCBT    (R8)              // cache-touch hint for the block at x
   378		CMP     R0, R14
   379		MOVD    R14, CTR          // Set up the loop counter
   380		BEQ     tail              // If R14 = 0, we can't use the loop
   381	
   382	loop:
   383		MOVD    8(R8), R20        // R20 = x[i]
   384		MOVD    16(R8), R21       // R21 = x[i+1]
   385		MOVD    24(R8), R22       // R22 = x[i+2]
   386		MOVDU   32(R8), R23       // R23 = x[i+3]; R8 advances by 32 bytes
   387		MULLD   R9, R20, R24      // R24 = z0[i]
   388		MULHDU  R9, R20, R20      // R20 = z1[i]
   389		ADDC    R4, R24           // R24 = z0[i] + c, set CA
   390		ADDZE   R20               // R20 = z1[i] + CA (carry word into i+1)
   391		MULLD   R9, R21, R25      // R25 = z0[i+1]
   392		MULHDU  R9, R21, R21      // R21 = z1[i+1]
   393		ADDC    R20, R25          // R25 = z0[i+1] + carry word from i, set CA
   394		ADDZE   R21               // R21 = z1[i+1] + CA (carry word into i+2)
   395		MULLD   R9, R22, R26      // R26 = z0[i+2]
   396		MULHDU  R9, R22, R22      // R22 = z1[i+2]
   397		ADDC    R21, R26          // R26 = z0[i+2] + carry word from i+1, set CA
   398		ADDZE   R22               // R22 = z1[i+2] + CA (carry word into i+3)
   399		MULLD   R9, R23, R27      // R27 = z0[i+3]
   400		MULHDU  R9, R23, R23      // R23 = z1[i+3]
   401		ADDC    R22, R27          // R27 = z0[i+3] + carry word from i+2, set CA
   402		ADDZE   R23               // R23 = z1[i+3] + CA (carry word out of this group)
   403		MOVD    R24, 8(R10)       // z[i]
   404		MOVD    R25, 16(R10)      // z[i+1]
   405		MOVD    R26, 24(R10)      // z[i+2]
   406		MOVDU   R27, 32(R10)      // z[i+3]; R10 advances by 32 bytes
   407		MOVD    R23, R4           // R4 = c
   408		ADD     $-4, R11          // R11 -= 4 (elements remaining)
   409		BC      16, 0, loop       // bdnz: decrement CTR, loop while CTR != 0
   410	
   411		// Up to 3 elements may remain: R11 = (z_len-1) mod 4
   412		CMP   R0, R11
   413		BEQ   done
   414	
   415		// Process the remaining elements, one at a time
   416	tail:
   417		MOVDU   8(R8), R20        // R20 = x[i]; R8 advances by 8 bytes
   418		MULLD   R9, R20, R24      // R24 = z0[i]
   419		MULHDU  R9, R20, R25      // R25 = z1[i]
   420		ADD     $-1, R11          // one fewer element remaining
   421		ADDC    R4, R24           // R24 = z0[i] + c, set CA
   422		ADDZE   R25               // R25 = z1[i] + CA
   423		MOVDU   R24, 8(R10)       // z[i]; R10 advances by 8 bytes
   424		CMP     R0, R11
   425		MOVD    R25, R4           // R4 = c
   426		BEQ     done              // If R11 = 0, we are done
   427	
   428		MOVDU   8(R8), R20        // second leftover element: R20 = x[i]
   429		MULLD   R9, R20, R24      // R24 = z0[i]
   430		MULHDU  R9, R20, R25      // R25 = z1[i]
   431		ADD     $-1, R11          // one fewer element remaining
   432		ADDC    R4, R24           // R24 = z0[i] + c, set CA
   433		ADDZE   R25               // R25 = z1[i] + CA
   434		MOVDU   R24, 8(R10)       // z[i]
   435		CMP     R0, R11
   436		MOVD    R25, R4           // R4 = c
   437		BEQ     done              // If R11 = 0, we are done
   438	
   439		MOVD    8(R8), R20        // third and last leftover element: R20 = x[i]
   440		MULLD   R9, R20, R24      // R24 = z0[i]
   441		MULHDU  R9, R20, R25      // R25 = z1[i]
   442		ADD     $-1, R11          // NOTE(review): R11 is not read after this point
   443		ADDC    R4, R24           // R24 = z0[i] + c, set CA
   444		ADDZE   R25               // R25 = z1[i] + CA
   445		MOVD    R24, 8(R10)       // z[i]
   446		MOVD    R25, R4           // R4 = c
   447	
   448	done:
   449		MOVD    R4, c+64(FP)      // return c
   450		RET
   451	
   452	// func addMulVVW(z, x []Word, y Word) (c Word) -- z[i] += x[i]*y, c = final carry word
   453	TEXT ·addMulVVW(SB), NOSPLIT, $0
   454		MOVD z+0(FP), R10	// R10 = z[]
   455		MOVD x+24(FP), R8	// R8 = x[]
   456		MOVD y+48(FP), R9	// R9 = y
   457		MOVD z_len+8(FP), R22	// R22 = z_len
   458	
   459		MOVD R0, R3		// R3 will be the index register (byte offset, starts at 0)
   460		CMP  R0, R22
   461		MOVD R0, R4		// R4 = c = 0
   462		MOVD R22, CTR		// Initialize loop counter: one iteration per element
   463		BEQ  done		// If z_len is zero, return c = 0
   464	
   465	loop:
   466		MOVD  (R8)(R3), R20	// Load x[i]
   467		MOVD  (R10)(R3), R21	// Load z[i]
   468		MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
   469		MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
   470		ADDC   R21, R6		// R6 = z0 + z[i], set CA
   471		ADDZE  R7		// R7 = z1 + CA
   472		ADDC   R4, R6		// R6 = z0 + z[i] + c, set CA
   473		ADDZE  R7, R4           // R4 = c = z1 + CA (carry word for the next element)
   474		MOVD   R6, (R10)(R3)	// Store z[i]
   475		ADD    $8, R3		// advance the byte offset to the next word
   476		BC  16, 0, loop		// bdnz: decrement CTR, loop while CTR != 0
   477	
   478	done:
   479		MOVD R4, c+56(FP)	// return c
   480		RET
   481	
   482	// func divWW(x1, x0, y Word) (q, r Word) -- divides the two-word value x1:x0 by y
   483	TEXT ·divWW(SB), NOSPLIT, $0
   484		MOVD x1+0(FP), R4	// R4 = x1 (high word of the dividend)
   485		MOVD x0+8(FP), R5	// R5 = x0 (low word of the dividend)
   486		MOVD y+16(FP), R6	// R6 = y (divisor)
   487	
   488		CMPU R4, R6		// unsigned compare x1 with y
   489		BGE  divbigger		// x1 >= y: quotient would not fit in one word (also covers y = 0)
   490	
   491		// from the programmer's note in ch. 3 of the ISA manual, p.74
   492		DIVDEU R6, R4, R3	// R3 = q1 = (x1 << 64) / y
   493		DIVDU  R6, R5, R7	// R7 = q2 = x0 / y
   494		MULLD  R6, R3, R8	// R8 = low 64 bits of q1*y
   495		MULLD  R6, R7, R20	// R20 = q2*y
   496		SUB    R20, R5, R10	// R10 = r2 = x0 - q2*y
   497		ADD    R7, R3, R3	// R3 = q = q1 + q2
   498		SUB    R8, R10, R4	// R4 = r = r2 - q1*y (mod 2^64)
   499		CMPU   R4, R10		// r < r2 means the remainder sum wrapped around
   500		BLT    adjust
   501		CMPU   R4, R6		// r < y: quotient and remainder are final
   502		BLT    end
   503	
   504	adjust:
   505		MOVD $1, R21
   506		ADD  R21, R3, R3	// q = q + 1
   507		SUB  R6, R4, R4		// r = r - y
   508	
   509	end:
   510		MOVD R3, q+24(FP)	// return q
   511		MOVD R4, r+32(FP)	// return r
   512	
   513		RET
   514	
   515	divbigger:
   516		MOVD $-1, R7		// overflow case: q and r are both returned as all ones
   517		MOVD R7, q+24(FP)
   518		MOVD R7, r+32(FP)
   519		RET
   520	
   521	TEXT ·divWVW(SB), NOSPLIT, $0	// no assembly version in this file
   522		BR ·divWVW_g(SB)	// branch straight to divWVW_g (defined outside this file)

View as plain text