...

Text file src/math/big/arith_amd64.s

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	// func mulWW(x, y Word) (z1, z0 Word)
    13	TEXT ·mulWW(SB),NOSPLIT,$0	// z1:z0 = x*y, the full double-word unsigned product
    14		MOVQ x+0(FP), AX	// AX = x (implicit operand of MULQ)
    15		MULQ y+8(FP)		// DX:AX = AX * y
    16		MOVQ DX, z1+16(FP)	// z1 = high word
    17		MOVQ AX, z0+24(FP)	// z0 = low word
    18		RET
    19	
    20	
    21	// func divWW(x1, x0, y Word) (q, r Word)
    22	TEXT ·divWW(SB),NOSPLIT,$0	// q, r = quotient and remainder of the double word x1:x0 divided by y
    23		MOVQ x1+0(FP), DX	// DX = high word of the dividend
    24		MOVQ x0+8(FP), AX	// AX = low word of the dividend
    25		DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y; NOTE(review): DIVQ faults if y == 0 or the quotient overflows one word - presumably callers guarantee x1 < y
    26		MOVQ AX, q+24(FP)	// q = quotient
    27		MOVQ DX, r+32(FP)	// r = remainder
    28		RET
    29	
    30	// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31	// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32	// This is faster than using rotate instructions.
    33	
    34	// func addVV(z, x, y []Word) (c Word)
    35	TEXT ·addVV(SB),NOSPLIT,$0	// z[i] = x[i] + y[i] + carry for 0 <= i < len(z); c = carry out of the last word
    36		MOVQ z_len+8(FP), DI	// n = len(z); x_len and y_len are never read (presumably callers ensure both are >= len(z))
    37		MOVQ x+24(FP), R8	// R8 = base of x
    38		MOVQ y+48(FP), R9	// R9 = base of y
    39		MOVQ z+0(FP), R10	// R10 = base of z
    40	
    41		MOVQ $0, CX		// c = 0 (CX holds the saved carry as 0 or -1 until E1)
    42		MOVQ $0, SI		// i = 0
    43	
    44		// s/JL/JMP/ below to disable the unrolled loop
    45		SUBQ $4, DI		// n -= 4
    46		JL V1			// if n < 0 goto V1
    47	
    48	U1:	// n >= 0
    49		// regular loop body unrolled 4x
    50		ADDQ CX, CX		// restore CF
    51		MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]; MOVQ does not disturb the restored CF
    52		MOVQ 8(R8)(SI*8), R12
    53		MOVQ 16(R8)(SI*8), R13
    54		MOVQ 24(R8)(SI*8), R14
    55		ADCQ 0(R9)(SI*8), R11	// x[i] + y[i] + CF; the carry ripples through the next three ADCQs
    56		ADCQ 8(R9)(SI*8), R12
    57		ADCQ 16(R9)(SI*8), R13
    58		ADCQ 24(R9)(SI*8), R14
    59		MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
    60		MOVQ R12, 8(R10)(SI*8)
    61		MOVQ R13, 16(R10)(SI*8)
    62		MOVQ R14, 24(R10)(SI*8)
    63		SBBQ CX, CX		// save CF (before the counter updates below overwrite the flags)
    64	
    65		ADDQ $4, SI		// i += 4
    66		SUBQ $4, DI		// n -= 4
    67		JGE U1			// if n >= 0 goto U1
    68	
    69	V1:	ADDQ $4, DI		// n += 4 (undo the bias; n = words still to process, 0..3)
    70		JLE E1			// if n <= 0 goto E1
    71	
    72	L1:	// n > 0
    73		ADDQ CX, CX		// restore CF
    74		MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
    75		ADCQ 0(R9)(SI*8), R11	// R11 = x[i] + y[i] + CF
    76		MOVQ R11, 0(R10)(SI*8)	// z[i] = R11
    77		SBBQ CX, CX		// save CF
    78	
    79		ADDQ $1, SI		// i++
    80		SUBQ $1, DI		// n--
    81		JG L1			// if n > 0 goto L1
    82	
    83	E1:	NEGQ CX			// saved carry is 0 or -1; negate to get c = 0 or 1
    84		MOVQ CX, c+72(FP)	// return c
    85		RET
    86	
    87	
    88	// func subVV(z, x, y []Word) (c Word)
    89	// (same as addVV except for SBBQ instead of ADCQ and label names)
    90	TEXT ·subVV(SB),NOSPLIT,$0	// z[i] = x[i] - y[i] - borrow for 0 <= i < len(z); c = borrow out of the last word
    91		MOVQ z_len+8(FP), DI	// n = len(z); x_len and y_len are never read (presumably callers ensure both are >= len(z))
    92		MOVQ x+24(FP), R8	// R8 = base of x
    93		MOVQ y+48(FP), R9	// R9 = base of y
    94		MOVQ z+0(FP), R10	// R10 = base of z
    95	
    96		MOVQ $0, CX		// c = 0 (CX holds the saved borrow as 0 or -1 until E2)
    97		MOVQ $0, SI		// i = 0
    98	
    99		// s/JL/JMP/ below to disable the unrolled loop
   100		SUBQ $4, DI		// n -= 4
   101		JL V2			// if n < 0 goto V2
   102	
   103	U2:	// n >= 0
   104		// regular loop body unrolled 4x
   105		ADDQ CX, CX		// restore CF
   106		MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]; MOVQ does not disturb the restored CF
   107		MOVQ 8(R8)(SI*8), R12
   108		MOVQ 16(R8)(SI*8), R13
   109		MOVQ 24(R8)(SI*8), R14
   110		SBBQ 0(R9)(SI*8), R11	// x[i] - y[i] - CF; the borrow ripples through the next three SBBQs
   111		SBBQ 8(R9)(SI*8), R12
   112		SBBQ 16(R9)(SI*8), R13
   113		SBBQ 24(R9)(SI*8), R14
   114		MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
   115		MOVQ R12, 8(R10)(SI*8)
   116		MOVQ R13, 16(R10)(SI*8)
   117		MOVQ R14, 24(R10)(SI*8)
   118		SBBQ CX, CX		// save CF (before the counter updates below overwrite the flags)
   119	
   120		ADDQ $4, SI		// i += 4
   121		SUBQ $4, DI		// n -= 4
   122		JGE U2			// if n >= 0 goto U2
   123	
   124	V2:	ADDQ $4, DI		// n += 4 (undo the bias; n = words still to process, 0..3)
   125		JLE E2			// if n <= 0 goto E2
   126	
   127	L2:	// n > 0
   128		ADDQ CX, CX		// restore CF
   129		MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
   130		SBBQ 0(R9)(SI*8), R11	// R11 = x[i] - y[i] - CF
   131		MOVQ R11, 0(R10)(SI*8)	// z[i] = R11
   132		SBBQ CX, CX		// save CF
   133	
   134		ADDQ $1, SI		// i++
   135		SUBQ $1, DI		// n--
   136		JG L2			// if n > 0 goto L2
   137	
   138	E2:	NEGQ CX			// saved borrow is 0 or -1; negate to get c = 0 or 1
   139		MOVQ CX, c+72(FP)	// return c
   140		RET
   141	
   142	
   143	// func addVW(z, x []Word, y Word) (c Word)
   144	TEXT ·addVW(SB),NOSPLIT,$0	// z = x + y (y is a single word) over len(z) words; c = carry out of the last word
   145		MOVQ z_len+8(FP), DI	// n = len(z)
   146		CMPQ DI, $32
   147		JG large		// n > 32: hand the whole call to addVWlarge
   148		MOVQ x+24(FP), R8	// R8 = base of x
   149		MOVQ y+48(FP), CX	// c = y
   150		MOVQ z+0(FP), R10	// R10 = base of z
   151	
   152		MOVQ $0, SI		// i = 0
   153	
   154		// s/JL/JMP/ below to disable the unrolled loop
   155		SUBQ $4, DI		// n -= 4
   156		JL V3			// if n < 0 (fewer than 4 words left) goto V3
   157	
   158	U3:	// n >= 0
   159		// regular loop body unrolled 4x
   160		MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]
   161		MOVQ 8(R8)(SI*8), R12
   162		MOVQ 16(R8)(SI*8), R13
   163		MOVQ 24(R8)(SI*8), R14
   164		ADDQ CX, R11		// x[i] + c; c is y on the first pass and 0 or 1 afterwards
   165		ADCQ $0, R12		// propagate the carry through x[i+1..i+3]
   166		ADCQ $0, R13
   167		ADCQ $0, R14
   168		SBBQ CX, CX		// save CF
   169		NEGQ CX			// c = carry out as 0 or 1
   170		MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
   171		MOVQ R12, 8(R10)(SI*8)
   172		MOVQ R13, 16(R10)(SI*8)
   173		MOVQ R14, 24(R10)(SI*8)
   174	
   175		ADDQ $4, SI		// i += 4
   176		SUBQ $4, DI		// n -= 4
   177		JGE U3			// if n >= 0 goto U3
   178	
   179	V3:	ADDQ $4, DI		// n += 4 (undo the bias; n = words still to process, 0..3)
   180		JLE E3			// if n <= 0 goto E3
   181	
   182	L3:	// n > 0
   183		ADDQ 0(R8)(SI*8), CX	// c += x[i]
   184		MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum; MOVQ leaves CF intact
   185		SBBQ CX, CX		// save CF
   186		NEGQ CX			// c = carry out as 0 or 1
   187	
   188		ADDQ $1, SI		// i++
   189		SUBQ $1, DI		// n--
   190		JG L3			// if n > 0 goto L3
   191	
   192	E3:	MOVQ CX, c+56(FP)	// return c (still equal to y when len(z) == 0)
   193		RET
   194	large:
   195		JMP ·addVWlarge(SB)	// tail-jump; addVWlarge is defined outside this listing and sees the unmodified arguments
   196	
   197	
   198	// func subVW(z, x []Word, y Word) (c Word)
   199	// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
   200	TEXT ·subVW(SB),NOSPLIT,$0	// z = x - y (y is a single word) over len(z) words; c = borrow out of the last word
   201		MOVQ z_len+8(FP), DI	// n = len(z)
   202		CMPQ DI, $32
   203		JG large		// n > 32: hand the whole call to subVWlarge
   204		MOVQ x+24(FP), R8	// R8 = base of x
   205		MOVQ y+48(FP), CX	// c = y
   206		MOVQ z+0(FP), R10	// R10 = base of z
   207	
   208		MOVQ $0, SI		// i = 0
   209	
   210		// s/JL/JMP/ below to disable the unrolled loop
   211		SUBQ $4, DI		// n -= 4
   212		JL V4			// if n < 0 (fewer than 4 words left) goto V4
   213	
   214	U4:	// n >= 0
   215		// regular loop body unrolled 4x
   216		MOVQ 0(R8)(SI*8), R11	// load x[i..i+3]
   217		MOVQ 8(R8)(SI*8), R12
   218		MOVQ 16(R8)(SI*8), R13
   219		MOVQ 24(R8)(SI*8), R14
   220		SUBQ CX, R11		// x[i] - c; c is y on the first pass and 0 or 1 afterwards
   221		SBBQ $0, R12		// propagate the borrow through x[i+1..i+3]
   222		SBBQ $0, R13
   223		SBBQ $0, R14
   224		SBBQ CX, CX		// save CF
   225		NEGQ CX			// c = borrow out as 0 or 1
   226		MOVQ R11, 0(R10)(SI*8)	// store z[i..i+3]
   227		MOVQ R12, 8(R10)(SI*8)
   228		MOVQ R13, 16(R10)(SI*8)
   229		MOVQ R14, 24(R10)(SI*8)
   230	
   231		ADDQ $4, SI		// i += 4
   232		SUBQ $4, DI		// n -= 4
   233		JGE U4			// if n >= 0 goto U4
   234	
   235	V4:	ADDQ $4, DI		// n += 4 (undo the bias; n = words still to process, 0..3)
   236		JLE E4			// if n <= 0 goto E4
   237	
   238	L4:	// n > 0
   239		MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
   240		SUBQ CX, R11		// R11 = x[i] - c
   241		MOVQ R11, 0(R10)(SI*8)	// z[i] = R11; MOVQ leaves CF intact
   242		SBBQ CX, CX		// save CF
   243		NEGQ CX			// c = borrow out as 0 or 1
   244	
   245		ADDQ $1, SI		// i++
   246		SUBQ $1, DI		// n--
   247		JG L4			// if n > 0 goto L4
   248	
   249	E4:	MOVQ CX, c+56(FP)	// return c (still equal to y when len(z) == 0)
   250		RET
   251	large:
   252		JMP ·subVWlarge(SB)	// tail-jump; subVWlarge is defined outside this listing and sees the unmodified arguments
   253	
   254	
   255	// func shlVU(z, x []Word, s uint) (c Word)
   256	TEXT ·shlVU(SB),NOSPLIT,$0	// z = x << s over len(z) words, walking from the top word down; c = bits shifted out of x[n-1]
   257		MOVQ z_len+8(FP), BX	// i = len(z)
   258		SUBQ $1, BX		// i--
   259		JL X8b			// i < 0	(n <= 0)
   260	
   261		// n > 0
   262		MOVQ z+0(FP), R10	// R10 = base of z
   263		MOVQ x+24(FP), R8	// R8 = base of x
   264		MOVQ s+48(FP), CX	// shift count in CX; NOTE(review): x86 uses only the low 6 bits of the count - presumably callers keep s < 64
   265		MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   266		MOVQ $0, DX
   267		SHLQ CX, AX, DX		// w1>>ŝ (double-register shift into the zeroed DX; ŝ is the complementary count, 64-s)
   268		MOVQ DX, c+56(FP)	// c = bits shifted out of the top word
   269	
   270		CMPQ BX, $0
   271		JLE X8a			// i <= 0
   272	
   273		// i > 0
   274	L8:	MOVQ AX, DX		// w = w1
   275		MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   276		SHLQ CX, AX, DX		// w<<s | w1>>ŝ
   277		MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   278		SUBQ $1, BX		// i--
   279		JG L8			// i > 0
   280	
   281		// i <= 0
   282	X8a:	SHLQ CX, AX		// w1<<s
   283		MOVQ AX, (R10)		// z[0] = w1<<s
   284		RET
   285	
   286	X8b:	MOVQ $0, c+56(FP)	// empty z: nothing to shift, c = 0
   287		RET
   288	
   289	
   290	// func shrVU(z, x []Word, s uint) (c Word)
   291	TEXT ·shrVU(SB),NOSPLIT,$0	// z = x >> s over len(z) words, walking from word 0 upward; c = bits shifted out of x[0]
   292		MOVQ z_len+8(FP), R11	// n = len(z)
   293		SUBQ $1, R11		// n--
   294		JL X9b			// n < 0	(n <= 0)
   295	
   296		// n > 0
   297		MOVQ z+0(FP), R10	// R10 = base of z
   298		MOVQ x+24(FP), R8	// R8 = base of x
   299		MOVQ s+48(FP), CX	// shift count in CX; NOTE(review): x86 uses only the low 6 bits of the count - presumably callers keep s < 64
   300		MOVQ (R8), AX		// w1 = x[0]
   301		MOVQ $0, DX
   302		SHRQ CX, AX, DX		// w1<<ŝ (double-register shift into the zeroed DX; ŝ is the complementary count, 64-s)
   303		MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word
   304	
   305		MOVQ $0, BX		// i = 0
   306		JMP E9
   307	
   308		// i < n-1
   309	L9:	MOVQ AX, DX		// w = w1
   310		MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   311		SHRQ CX, AX, DX		// w>>s | w1<<ŝ
   312		MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   313		ADDQ $1, BX		// i++
   314	
   315	E9:	CMPQ BX, R11		// R11 holds the original n-1 here
   316		JL L9			// i < n-1
   317	
   318		// i >= n-1
   319	X9a:	SHRQ CX, AX		// w1>>s
   320		MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   321		RET
   322	
   323	X9b:	MOVQ $0, c+56(FP)	// empty z: nothing to shift, c = 0
   324		RET
   325	
   326	
   327	// func mulAddVWW(z, x []Word, y, r Word) (c Word)
   328	TEXT ·mulAddVWW(SB),NOSPLIT,$0	// z = x*y + r over len(z) words; c = the final carry word
   329		MOVQ z+0(FP), R10	// R10 = base of z
   330		MOVQ x+24(FP), R8	// R8 = base of x
   331		MOVQ y+48(FP), R9	// R9 = multiplier y
   332		MOVQ r+56(FP), CX	// c = r
   333		MOVQ z_len+8(FP), R11	// n = len(z)
   334		MOVQ $0, BX		// i = 0
   335	
   336		CMPQ R11, $4
   337		JL E5			// n < 4: skip the unrolled loop
   338	
   339	U5:	// i+4 <= n
   340		// regular loop body unrolled 4x
   341		MOVQ (0*8)(R8)(BX*8), AX	// AX = x[i]
   342		MULQ R9			// DX:AX = x[i] * y
   343		ADDQ CX, AX		// low word += c
   344		ADCQ $0, DX		// carry into the high word
   345		MOVQ AX, (0*8)(R10)(BX*8)	// z[i] = low word
   346		MOVQ DX, CX		// c = high word
   347		MOVQ (1*8)(R8)(BX*8), AX	// same step for x[i+1]
   348		MULQ R9
   349		ADDQ CX, AX
   350		ADCQ $0, DX
   351		MOVQ AX, (1*8)(R10)(BX*8)
   352		MOVQ DX, CX
   353		MOVQ (2*8)(R8)(BX*8), AX	// same step for x[i+2]
   354		MULQ R9
   355		ADDQ CX, AX
   356		ADCQ $0, DX
   357		MOVQ AX, (2*8)(R10)(BX*8)
   358		MOVQ DX, CX
   359		MOVQ (3*8)(R8)(BX*8), AX	// same step for x[i+3]
   360		MULQ R9
   361		ADDQ CX, AX
   362		ADCQ $0, DX
   363		MOVQ AX, (3*8)(R10)(BX*8)
   364		MOVQ DX, CX
   365		ADDQ $4, BX		// i += 4
   366	
   367		LEAQ 4(BX), DX		// DX = i+4 (DX is free: the high word was already moved to CX)
   368		CMPQ DX, R11
   369		JLE U5			// i+4 <= n: another full group of 4 fits
   370		JMP E5
   371	
   372	L5:	MOVQ (R8)(BX*8), AX	// AX = x[i]
   373		MULQ R9			// DX:AX = x[i] * y
   374		ADDQ CX, AX		// low word += c
   375		ADCQ $0, DX		// carry into the high word
   376		MOVQ AX, (R10)(BX*8)	// z[i] = low word
   377		MOVQ DX, CX		// c = high word
   378		ADDQ $1, BX		// i++
   379	
   380	E5:	CMPQ BX, R11		// i < n
   381		JL L5
   382	
   383		MOVQ CX, c+64(FP)	// return c (equal to r when len(z) == 0)
   384		RET
   385	
   386	
   387	// func addMulVVW(z, x []Word, y Word) (c Word)
   388	TEXT ·addMulVVW(SB),NOSPLIT,$0	// z += x*y over len(z) words; c = the final carry word
   389		CMPB    ·support_adx(SB), $1	// support_adx is a byte flag defined outside this listing
   390		JEQ adx			// flag == 1: take the MULXQ/ADCXQ/ADOXQ path below
   391		MOVQ z+0(FP), R10	// R10 = base of z
   392		MOVQ x+24(FP), R8	// R8 = base of x
   393		MOVQ y+48(FP), R9	// R9 = multiplier y
   394		MOVQ z_len+8(FP), R11	// n = len(z)
   395		MOVQ $0, BX		// i = 0
   396		MOVQ $0, CX		// c = 0
   397		MOVQ R11, R12
   398		ANDQ $-2, R12		// R12 = n rounded down to an even count
   399		CMPQ R11, $2
   400		JAE A6			// n >= 2: run the 2x unrolled loop
   401		JMP E6
   402	
   403	A6:
   404		MOVQ (R8)(BX*8), AX	// AX = x[i]
   405		MULQ R9			// DX:AX = x[i] * y
   406		ADDQ (R10)(BX*8), AX	// low word += z[i]
   407		ADCQ $0, DX		// carry into the high word
   408		ADDQ CX, AX		// low word += c
   409		ADCQ $0, DX		// carry into the high word
   410		MOVQ DX, CX		// c = high word
   411		MOVQ AX, (R10)(BX*8)	// z[i] = low word
   412	
   413		MOVQ (8)(R8)(BX*8), AX	// same step for x[i+1] / z[i+1]
   414		MULQ R9
   415		ADDQ (8)(R10)(BX*8), AX
   416		ADCQ $0, DX
   417		ADDQ CX, AX
   418		ADCQ $0, DX
   419		MOVQ DX, CX
   420		MOVQ AX, (8)(R10)(BX*8)
   421	
   422		ADDQ $2, BX		// i += 2
   423		CMPQ BX, R12
   424		JL A6			// i < even part of n
   425		JMP E6
   426	
   427	L6:	MOVQ (R8)(BX*8), AX	// AX = x[i]
   428		MULQ R9			// DX:AX = x[i] * y
   429		ADDQ CX, AX		// low word += c
   430		ADCQ $0, DX		// carry into the high word
   431		ADDQ AX, (R10)(BX*8)	// z[i] += low word
   432		ADCQ $0, DX		// carry into the high word
   433		MOVQ DX, CX		// c = high word
   434		ADDQ $1, BX		// i++
   435	
   436	E6:	CMPQ BX, R11		// i < n
   437		JL L6
   438	
   439		MOVQ CX, c+56(FP)	// return c
   440		RET
   441	
   442	adx:
   443		MOVQ z_len+8(FP), R11	// n = len(z)
   444		MOVQ z+0(FP), R10	// R10 = base of z
   445		MOVQ x+24(FP), R8	// R8 = base of x
   446		MOVQ y+48(FP), DX	// y goes in DX, the implicit multiplicand of MULXQ
   447		MOVQ $0, BX   // i = 0
   448		MOVQ $0, CX   // carry
   449		CMPQ R11, $8
   450		JAE  adx_loop_header	// n >= 8: run the 8x unrolled loop first
   451		CMPQ BX, R11
   452		JL adx_short		// 0 < n < 8: word-at-a-time loop only
   453		MOVQ CX, c+56(FP)	// n == 0: return c = 0
   454		RET
   455	
   456	adx_loop_header:
   457		MOVQ  R11, R13
   458		ANDQ  $-8, R13		// R13 = n rounded down to a multiple of 8
   459	adx_loop:
   460		XORQ  R9, R9  // unset flags (CF = OF = 0) and leave R9 = 0 for the final fold
   461		MULXQ (R8), SI, DI	// DI:SI = x[i] * y
   462		ADCXQ CX,SI		// low word += c, carry tracked in CF
   463		ADOXQ (R10), SI		// low word += z[i], carry tracked in OF
   464		MOVQ  SI,(R10)		// z[i] = low word
   465	
   466		MULXQ 8(R8), AX, CX	// CX:AX = x[i+1] * y; the high word alternates between DI and CX
   467		ADCXQ DI, AX		// low word += previous high word + CF
   468		ADOXQ 8(R10), AX	// low word += z[i+1] + OF
   469		MOVQ  AX, 8(R10)
   470	
   471		MULXQ 16(R8), SI, DI
   472		ADCXQ CX, SI
   473		ADOXQ 16(R10), SI
   474		MOVQ  SI, 16(R10)
   475	
   476		MULXQ 24(R8), AX, CX
   477		ADCXQ DI, AX
   478		ADOXQ 24(R10), AX
   479		MOVQ  AX, 24(R10)
   480	
   481		MULXQ 32(R8), SI, DI
   482		ADCXQ CX, SI
   483		ADOXQ 32(R10), SI
   484		MOVQ  SI, 32(R10)
   485	
   486		MULXQ 40(R8), AX, CX
   487		ADCXQ DI, AX
   488		ADOXQ 40(R10), AX
   489		MOVQ  AX, 40(R10)
   490	
   491		MULXQ 48(R8), SI, DI
   492		ADCXQ CX, SI
   493		ADOXQ 48(R10), SI
   494		MOVQ  SI, 48(R10)
   495	
   496		MULXQ 56(R8), AX, CX
   497		ADCXQ DI, AX
   498		ADOXQ 56(R10), AX
   499		MOVQ  AX, 56(R10)
   500	
   501		ADCXQ R9, CX		// fold the pending CF into c (R9 == 0)
   502		ADOXQ R9, CX		// fold the pending OF into c
   503	
   504		ADDQ $64, R8		// advance x by 8 words
   505		ADDQ $64, R10		// advance z by 8 words
   506		ADDQ $8, BX		// i += 8
   507	
   508		CMPQ BX, R13
   509		JL adx_loop		// i < multiple-of-8 part of n
   510		MOVQ z+0(FP), R10	// reload the original bases: adx_short indexes them by i
   511		MOVQ x+24(FP), R8
   512		CMPQ BX, R11
   513		JL adx_short		// 1..7 words remain
   514		MOVQ CX, c+56(FP)	// return c
   515		RET
   516	
   517	adx_short:
   518		MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
   519		ADDQ CX, SI		// low word += c
   520		ADCQ $0, DI		// carry into the high word
   521		ADDQ SI, (R10)(BX*8)	// z[i] += low word
   522		ADCQ $0, DI		// carry into the high word
   523		MOVQ DI, CX		// c = high word
   524		ADDQ $1, BX		// i++
   525	
   526		CMPQ BX, R11
   527		JL adx_short		// i < n
   528	
   529		MOVQ CX, c+56(FP)	// return c
   530		RET
   531	
   532	
   533	
   534	// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
   535	TEXT ·divWVW(SB),NOSPLIT,$0	// z = (xn followed by x) / y, one word at a time from the top index down; r = final remainder
   536		MOVQ z+0(FP), R10	// R10 = base of z
   537		MOVQ xn+24(FP), DX	// r = xn
   538		MOVQ x+32(FP), R8	// R8 = base of x
   539		MOVQ y+56(FP), R9	// R9 = divisor y
   540		MOVQ z_len+8(FP), BX	// i = len(z)
   541		JMP E7
   542	
   543	L7:	MOVQ (R8)(BX*8), AX	// AX = x[i]; DX still holds the running remainder r
   544		DIVQ R9			// AX = (r:x[i]) / y, DX = r = (r:x[i]) % y; NOTE(review): faults if y == 0 or r >= y - presumably callers guarantee xn < y
   545		MOVQ AX, (R10)(BX*8)	// z[i] = quotient word
   546	
   547	E7:	SUBQ $1, BX		// i--
   548		JGE L7			// i >= 0
   549	
   550		MOVQ DX, r+64(FP)	// return r (equal to xn when len(z) == 0)
   551		RET

View as plain text