...

Text file src/pkg/math/big/arith_s390x.s

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build !math_big_pure_go,s390x
     6	
     7	#include "textflag.h"
     8	
     9	// This file provides fast assembly versions for the elementary
    10	// arithmetic operations on vectors implemented in arith.go.
    11	
    12	TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 // stores 1 in ret if the checks below pass, else 0; uses a 24-byte scratch frame
    13	        MOVD    $x-24(SP), R1     // R1 = address of the 24-byte scratch buffer
    14	        XC      $24, 0(R1), 0(R1) // clear the storage
    15	        MOVD    $2, R0            // R0 is the number of double words stored -1
    16	        WORD    $0xB2B01000       // STFLE 0(R1)
    17	        XOR     R0, R0            // reset the value of R0
    18	        MOVBZ   z-8(SP), R1       // R1 = byte at offset 16 of the buffer (first byte of the third doubleword)
    19	        AND     $0x40, R1         // keep only bit 0x40; presumably the vector-facility bit -- confirm against the STFLE facility list
    20	        BEQ     novector          // bit clear: report no vector support
    21	vectorinstalled:
    22	        // check if the vector instruction has been enabled
    23	        VLEIB   $0, $0xF, V16     // place the immediate 0xF in a byte element of V16
    24	        VLGVB   $0, V16, R1       // read a byte element of V16 back into R1
    25	        CMPBNE  R1, $0xF, novector // value did not round-trip: report no vector support
    26	        MOVB    $1, ret+0(FP) // have vx
    27	        RET
    28	novector:
    29	        MOVB    $0, ret+0(FP) // no vx
    30	        RET
    31	
    32	TEXT ·mulWW(SB),NOSPLIT,$0	// mulWW(x, y) -> (z1, z0); the two result words are stored from R10 and R11
    33		MOVD	x+0(FP), R3	// R3 = x
    34		MOVD	y+8(FP), R4	// R4 = y
    35		MULHDU	R3, R4	// NOTE(review): results are read from R10/R11 below, not R4 -- presumably MULHDU leaves both product halves there; confirm against the assembler
    36		MOVD	R10, z1+16(FP)	// z1 = R10
    37		MOVD	R11, z0+24(FP)	// z0 = R11
    38		RET
    39	
    40	// func divWW(x1, x0, y Word) (q, r Word)
    41	TEXT ·divWW(SB),NOSPLIT,$0
    42		MOVD	x1+0(FP), R10	// R10 = x1
    43		MOVD	x0+8(FP), R11	// R11 = x0
    44		MOVD	y+16(FP), R5	// R5 = y
    45		WORD	$0xb98700a5 // dlgr r10,r5 (hand-encoded; operates on R10, R11 and R5 loaded above)
    46		MOVD	R11, q+24(FP)	// q is taken from R11
    47		MOVD	R10, r+32(FP)	// r is taken from R10
    48		RET
    49	
    50	// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
    51	// func addVV(z, x, y []Word) (c Word)
    52	
    53	
    54	TEXT ·addVV(SB),NOSPLIT,$0	// entry point: jumps to whatever routine addvectorfacility currently points at
    55		MOVD	addvectorfacility+0x00(SB),R1	// R1 = current target (the DATA directive initialises it to addVV_check)
    56		BR	(R1)	// branch without linking; the target returns to addVV's caller
    57	
    58	TEXT ·addVV_check(SB),NOSPLIT, $0	// picks addVV_vec or addVV_novec from hasVX, records the choice in addvectorfacility, then runs it
    59		MOVB	·hasVX(SB), R1	// R1 = hasVX flag
    60		CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
    61		MOVD	$addvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
    62		MOVD	$·addVV_novec(SB), R2	// R2 = address of the non-vector implementation
    63		MOVD	R2, 0(R1)	// later addVV calls jump straight to addVV_novec
    64		//MOVD	$·addVV_novec(SB), 0(R1)
    65		BR	·addVV_novec(SB)	// handle the current call
    66	vectorimpl:
    67		MOVD	$addvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
    68		MOVD	$·addVV_vec(SB), R2	// R2 = address of the vector implementation
    69		MOVD	R2, 0(R1)	// later addVV calls jump straight to addVV_vec
    70		//MOVD	$·addVV_vec(SB), 0(R1)
    71		BR	·addVV_vec(SB)	// handle the current call
    72	
    73	GLOBL addvectorfacility+0x00(SB), NOPTR, $8	// 8-byte dispatch pointer read by addVV
    74	DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)	// initial target: addVV_check, which overwrites it with the chosen implementation
    75	
    76	TEXT ·addVV_vec(SB),NOSPLIT,$0	// vector implementation behind addVV: 16-word vector loop, then 4-word and 1-word scalar loops; c = final carry (0 or 1)
    77		MOVD	z_len+8(FP), R3	// R3 = n (number of words)
    78		MOVD	x+24(FP), R8	// R8 = x base
    79		MOVD	y+48(FP), R9	// R9 = y base
    80		MOVD	z+0(FP), R2	// R2 = z base
    81	
    82		MOVD	$0, R4		// c = 0
    83		MOVD	$0, R0		// make sure it's zero
    84		MOVD	$0, R10		// i = 0 (byte offset into x, y and z)
    85	
    86	
    87		// replace BLT with BR below to disable the unrolled loops
    88		SUB	$4, R3		// n -= 4
    89		BLT	v1		// if n < 0 goto v1 (fewer than 4 words)
    90		SUB     $12, R3                 // n -= 12 (16 in total)
    91	        BLT     A1                      // if n < 0 goto A1
    92	
    93		MOVD	R8, R5		// R5 = running x pointer for the vector loop
    94		MOVD	R9, R6		// R6 = running y pointer for the vector loop
    95		MOVD	R2, R7		// R7 = running z pointer for the vector loop
    96		// n >= 0
    97		// regular loop body unrolled 16x
    98		VZERO	V0			// c = 0
    99	UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4
   100		ADD	$64, R5
   101		VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   102		VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   103	
   104	
   105		VLM	0(R6), V9, V12  	// 64 bytes of y into V9..V12
   106		ADD	$64, R6
   107		VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
   108		VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
   109	
   110		VACCCQ	V1, V9, V0, V25		// V25 = carry out of V1 + V9 + carry-in V0
   111		VACQ	V1, V9, V0, V17		// V17 = V1 + V9 + carry-in V0
   112		VACCCQ	V2, V10, V25, V26	// the carry chains V25 -> V26 -> ... -> V31 -> V0
   113		VACQ	V2, V10, V25, V18
   114	
   115	
   116		VLM	0(R5), V5, V6		// next 32 bytes of x into V5..V6
   117		VLM	0(R6), V13, V14  	// next 32 bytes of y into V13..V14
   118		ADD	$32, R5
   119		ADD	$32, R6
   120	
   121		VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   122		VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   123		VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
   124		VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
   125	
   126		VACCCQ	V3, V11, V26, V27
   127		VACQ	V3, V11, V26, V19
   128		VACCCQ	V4, V12, V27, V28
   129		VACQ	V4, V12, V27, V20
   130	
   131		VLM	0(R5), V7, V8		// last 32 bytes of x for this iteration into V7..V8
   132		VLM	0(R6), V15, V16  	// last 32 bytes of y for this iteration into V15..V16
   133		ADD	$32, R5
   134		ADD	$32, R6
   135	
   136		VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   137		VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   138		VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
   139		VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
   140	
   141		VACCCQ	V5, V13, V28, V29
   142		VACQ	V5, V13, V28, V21
   143		VACCCQ	V6, V14, V29, V30
   144		VACQ	V6, V14, V29, V22
   145	
   146		VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   147		VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   148		VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
   149		VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
   150	
   151		VACCCQ	V7, V15, V30, V31
   152		VACQ	V7, V15, V30, V23
   153		VACCCQ	V8, V16, V31, V0	//V0 has carry-over
   154		VACQ	V8, V16, V31, V24
   155	
   156		VPDI	$0x4,V17,V17,V17	// flip the result doublewords back to memory order
   157		VPDI	$0x4,V18,V18,V18	// flip the result doublewords back to memory order
   158		VPDI	$0x4,V19,V19,V19	// flip the result doublewords back to memory order
   159		VPDI	$0x4,V20,V20,V20	// flip the result doublewords back to memory order
   160		VPDI	$0x4,V21,V21,V21	// flip the result doublewords back to memory order
   161		VPDI	$0x4,V22,V22,V22	// flip the result doublewords back to memory order
   162		VPDI	$0x4,V23,V23,V23	// flip the result doublewords back to memory order
   163		VPDI	$0x4,V24,V24,V24	// flip the result doublewords back to memory order
   164		VSTM	V17, V24, 0(R7)  	// 128-bytes into z
   165		ADD	$128, R7
   166		ADD	$128, R10	// i += 16 (128 bytes)
   167		SUB	$16,  R3	// n -= 16
   168		BGE	UU1		// if n >= 0 goto UU1
   169		VLGVG	$1, V0, R4	// put cf into R4
   170		NEG	R4, R4		// save cf as 0 or -1, the form the scalar loops restore from
   171	
   172	A1:	ADD	$12, R3		// n += 12 (undo the extra bias; n is now words remaining minus 4)
   173	
   174	
   175		// replace BLT with BR below to disable the 4x unrolled loop
   176		BLT	v1		// if n < 0 goto v1
   177	
   178	U1:	// n >= 0
   179		// regular loop body unrolled 4x
   180		MOVD	0(R8)(R10*1), R5
   181		MOVD	8(R8)(R10*1), R6
   182		MOVD	16(R8)(R10*1), R7
   183		MOVD	24(R8)(R10*1), R1
   184		ADDC	R4, R4		// restore CF
   185		MOVD	0(R9)(R10*1), R11
   186		ADDE	R11, R5
   187		MOVD	8(R9)(R10*1), R11
   188		ADDE	R11, R6
   189		MOVD	16(R9)(R10*1), R11
   190		ADDE	R11, R7
   191		MOVD	24(R9)(R10*1), R11
   192		ADDE	R11, R1
   193		MOVD	R0, R4
   194		ADDE	R4, R4		// save CF
   195		NEG	R4, R4		// R4 = 0 or -1
   196		MOVD	R5, 0(R2)(R10*1)
   197		MOVD	R6, 8(R2)(R10*1)
   198		MOVD	R7, 16(R2)(R10*1)
   199		MOVD	R1, 24(R2)(R10*1)
   200	
   201	
   202		ADD	$32, R10	// i += 4 (32 bytes)
   203		SUB	$4,  R3		// n -= 4
   204		BGE	U1		// if n >= 0 goto U1
   205	
   206	v1:	ADD	$4, R3		// n += 4
   207		BLE	E1		// if n <= 0 goto E1
   208	
   209	L1:	// n > 0
   210		ADDC	R4, R4		// restore CF
   211		MOVD	0(R8)(R10*1), R5
   212		MOVD	0(R9)(R10*1), R11
   213		ADDE	R11, R5
   214		MOVD	R5, 0(R2)(R10*1)
   215		MOVD	R0, R4
   216		ADDE	R4, R4		// save CF
   217		NEG 	R4, R4
   218	
   219		ADD	$8, R10		// i++
   220		SUB	$1, R3		// n--
   221		BGT	L1		// if n > 0 goto L1
   222	
   223	E1:	NEG	R4, R4		// turn the saved 0/-1 into 0/1
   224		MOVD	R4, c+72(FP)	// return c
   225		RET
   226	
   227	TEXT ·addVV_novec(SB),NOSPLIT,$0	// scalar implementation behind addVV: 4-word unrolled loop, then a 1-word loop; c = final carry (0 or 1)
   228	novec:
   229		MOVD	z_len+8(FP), R3	// R3 = n (number of words)
   230		MOVD	x+24(FP), R8	// R8 = x base
   231		MOVD	y+48(FP), R9	// R9 = y base
   232		MOVD	z+0(FP), R2	// R2 = z base
   233	
   234		MOVD	$0, R4		// c = 0
   235		MOVD	$0, R0		// make sure it's zero
   236		MOVD	$0, R10		// i = 0 (byte offset into x, y and z)
   237	
   238		// replace BLT with BR below to disable the unrolled loop
   239		SUB	$4, R3		// n -= 4
   240		BLT	v1n		// if n < 0 goto v1n
   241	U1n:	// n >= 0
   242		// regular loop body unrolled 4x
   243		MOVD	0(R8)(R10*1), R5
   244		MOVD	8(R8)(R10*1), R6
   245		MOVD	16(R8)(R10*1), R7
   246		MOVD	24(R8)(R10*1), R1
   247		ADDC	R4, R4		// restore CF
   248		MOVD	0(R9)(R10*1), R11
   249		ADDE	R11, R5
   250		MOVD	8(R9)(R10*1), R11
   251		ADDE	R11, R6
   252		MOVD	16(R9)(R10*1), R11
   253		ADDE	R11, R7
   254		MOVD	24(R9)(R10*1), R11
   255		ADDE	R11, R1
   256		MOVD	R0, R4
   257		ADDE	R4, R4		// save CF
   258		NEG	R4, R4		// R4 = 0 or -1
   259		MOVD	R5, 0(R2)(R10*1)
   260		MOVD	R6, 8(R2)(R10*1)
   261		MOVD	R7, 16(R2)(R10*1)
   262		MOVD	R1, 24(R2)(R10*1)
   263	
   264	
   265		ADD	$32, R10	// i += 4 (32 bytes)
   266		SUB	$4,  R3		// n -= 4
   267		BGE	U1n		// if n >= 0 goto U1n
   268	
   269	v1n:	ADD	$4, R3		// n += 4
   270		BLE	E1n		// if n <= 0 goto E1n
   271	
   272	L1n:	// n > 0
   273		ADDC	R4, R4		// restore CF
   274		MOVD	0(R8)(R10*1), R5
   275		MOVD	0(R9)(R10*1), R11
   276		ADDE	R11, R5
   277		MOVD	R5, 0(R2)(R10*1)
   278		MOVD	R0, R4
   279		ADDE	R4, R4		// save CF
   280		NEG 	R4, R4
   281	
   282		ADD	$8, R10		// i++
   283		SUB	$1, R3		// n--
   284		BGT L1n			// if n > 0 goto L1n
   285	
   286	E1n:	NEG	R4, R4		// turn the saved 0/-1 into 0/1
   287		MOVD	R4, c+72(FP)	// return c
   288		RET
   289	
   290	
   291	TEXT ·subVV(SB),NOSPLIT,$0	// entry point: jumps to whatever routine subvectorfacility currently points at
   292		MOVD	subvectorfacility+0x00(SB),R1	// R1 = current target (the DATA directive initialises it to subVV_check)
   293		BR	(R1)	// branch without linking; the target returns to subVV's caller
   294	
   295	TEXT ·subVV_check(SB),NOSPLIT,$0	// picks subVV_vec or subVV_novec from hasVX, records the choice in subvectorfacility, then runs it
   296		MOVB	·hasVX(SB), R1	// R1 = hasVX flag
   297		CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
   298		MOVD	$subvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
   299		MOVD	$·subVV_novec(SB), R2	// R2 = address of the non-vector implementation
   300		MOVD	R2, 0(R1)	// later subVV calls jump straight to subVV_novec
   301		//MOVD	$·subVV_novec(SB), 0(R1)
   302		BR	·subVV_novec(SB)	// handle the current call
   303	vectorimpl:
   304		MOVD	$subvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
   305		MOVD    $·subVV_vec(SB), R2	// R2 = address of the vector implementation
   306	        MOVD    R2, 0(R1)	// later subVV calls jump straight to subVV_vec
   307		//MOVD	$·subVV_vec(SB), 0(R1)
   308		BR	·subVV_vec(SB)	// handle the current call
   309	
   310	GLOBL subvectorfacility+0x00(SB), NOPTR, $8	// 8-byte dispatch pointer read by subVV
   311	DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)	// initial target: subVV_check, which overwrites it with the chosen implementation
   312	
   313	// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   314	// func subVV(z, x, y []Word) (c Word)
   315	// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
   316	TEXT ·subVV_vec(SB),NOSPLIT,$0	// vector implementation behind subVV: 16-word vector loop, then 4-word and 1-word scalar loops; c = final borrow (0 or 1)
   317		MOVD	z_len+8(FP), R3	// R3 = n (number of words)
   318		MOVD	x+24(FP), R8	// R8 = x base
   319		MOVD	y+48(FP), R9	// R9 = y base
   320		MOVD	z+0(FP), R2	// R2 = z base
   321		MOVD	$0, R4		// c = 0
   322		MOVD	$0, R0		// make sure it's zero
   323		MOVD	$0, R10		// i = 0 (byte offset into x, y and z)
   324	
   325		// replace BLT with BR below to disable the unrolled loops
   326		SUB	$4, R3		// n -= 4
   327		BLT	v1		// if n < 0 goto v1
   328		SUB     $12, R3         // n -= 12 (16 in total)
   329	        BLT     A1              // if n < 0 goto A1
   330	
   331		MOVD	R8, R5		// R5 = running x pointer for the vector loop
   332		MOVD	R9, R6		// R6 = running y pointer for the vector loop
   333		MOVD	R2, R7		// R7 = running z pointer for the vector loop
   334	
   335		// n >= 0
   336		// regular loop body unrolled 16x
   337		VZERO	V0		// cf = 0
   338		MOVD	$1, R4		// for 390 subtraction cf starts as 1 (no borrow)
   339		VLVGG	$1, R4, V0	//put carry into V0
   340	
   341	UU1:	VLM	0(R5), V1, V4		// 64 bytes of x into V1..V4
   342		ADD	$64, R5
   343		VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   344		VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   345	
   346	
   347		VLM	0(R6), V9, V12  	// 64 bytes of y into V9..V12
   348		ADD	$64, R6
   349		VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
   350		VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
   351	
   352		VSBCBIQ	V1, V9, V0, V25		// V25 = borrow indication out of V1 - V9 with borrow-in V0
   353		VSBIQ	V1, V9, V0, V17		// V17 = V1 - V9 with borrow-in V0
   354		VSBCBIQ	V2, V10, V25, V26	// the indication chains V25 -> V26 -> ... -> V31 -> V0
   355		VSBIQ	V2, V10, V25, V18
   356	
   357	
   358		VLM	0(R5), V5, V6		// next 32 bytes of x into V5..V6
   359		VLM	0(R6), V13, V14  	// next 32 bytes of y into V13..V14
   360		ADD	$32, R5
   361		ADD	$32, R6
   362	
   363		VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   364		VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   365		VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
   366		VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
   367	
   368		VSBCBIQ	V3, V11, V26, V27
   369		VSBIQ	V3, V11, V26, V19
   370		VSBCBIQ	V4, V12, V27, V28
   371		VSBIQ	V4, V12, V27, V20
   372	
   373		VLM	0(R5), V7, V8		// last 32 bytes of x for this iteration into V7..V8
   374		VLM	0(R6), V15, V16  	// last 32 bytes of y for this iteration into V15..V16
   375		ADD	$32, R5
   376		ADD	$32, R6
   377	
   378		VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   379		VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   380		VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
   381		VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
   382	
   383		VSBCBIQ	V5, V13, V28, V29
   384		VSBIQ	V5, V13, V28, V21
   385		VSBCBIQ	V6, V14, V29, V30
   386		VSBIQ	V6, V14, V29, V22
   387	
   388		VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   389		VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   390		VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
   391		VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
   392	
   393		VSBCBIQ	V7, V15, V30, V31
   394		VSBIQ	V7, V15, V30, V23
   395		VSBCBIQ	V8, V16, V31, V0	//V0 has carry-over
   396		VSBIQ	V8, V16, V31, V24
   397	
   398		VPDI	$0x4,V17,V17,V17	// flip the result doublewords back to memory order
   399		VPDI	$0x4,V18,V18,V18	// flip the result doublewords back to memory order
   400		VPDI	$0x4,V19,V19,V19	// flip the result doublewords back to memory order
   401		VPDI	$0x4,V20,V20,V20	// flip the result doublewords back to memory order
   402		VPDI	$0x4,V21,V21,V21	// flip the result doublewords back to memory order
   403		VPDI	$0x4,V22,V22,V22	// flip the result doublewords back to memory order
   404		VPDI	$0x4,V23,V23,V23	// flip the result doublewords back to memory order
   405		VPDI	$0x4,V24,V24,V24	// flip the result doublewords back to memory order
   406		VSTM	V17, V24, 0(R7)   // 128-bytes into z
   407		ADD	$128, R7
   408		ADD	$128, R10	// i += 16 (128 bytes)
   409		SUB	$16,  R3	// n -= 16
   410		BGE	UU1		// if n >= 0 goto UU1
   411		VLGVG	$1, V0, R4	// put cf into R4
   412		SUB	$1, R4		// save cf as cf-1: 0 when no borrow, -1 when borrow
   413	
   414	A1:	ADD	$12, R3		// n += 12 (undo the extra bias; n is now words remaining minus 4)
   415		BLT	v1		// if n < 0 goto v1
   416	
   417	U1:	// n >= 0
   418		// regular loop body unrolled 4x
   419		MOVD	0(R8)(R10*1), R5
   420		MOVD	8(R8)(R10*1), R6
   421		MOVD	16(R8)(R10*1), R7
   422		MOVD	24(R8)(R10*1), R1
   423		MOVD	R0, R11
   424		SUBC	R4, R11		// restore CF
   425		MOVD	0(R9)(R10*1), R11
   426		SUBE	R11, R5
   427		MOVD	8(R9)(R10*1), R11
   428		SUBE	R11, R6
   429		MOVD	16(R9)(R10*1), R11
   430		SUBE	R11, R7
   431		MOVD	24(R9)(R10*1), R11
   432		SUBE	R11, R1
   433		MOVD	R0, R4
   434		SUBE	R4, R4		// save CF
   435		MOVD	R5, 0(R2)(R10*1)
   436		MOVD	R6, 8(R2)(R10*1)
   437		MOVD	R7, 16(R2)(R10*1)
   438		MOVD	R1, 24(R2)(R10*1)
   439	
   440		ADD	$32, R10	// i += 4 (32 bytes)
   441		SUB	$4,  R3		// n -= 4
   442		BGE	U1		// if n >= 0 goto U1
   443	
   444	v1:	ADD	$4, R3		// n += 4
   445		BLE	E1		// if n <= 0 goto E1
   446	
   447	L1:	// n > 0
   448		MOVD	R0, R11
   449		SUBC	R4, R11		// restore CF
   450		MOVD	0(R8)(R10*1), R5
   451		MOVD	0(R9)(R10*1), R11
   452		SUBE	R11, R5
   453		MOVD	R5, 0(R2)(R10*1)
   454		MOVD	R0, R4
   455		SUBE	R4, R4		// save CF
   456	
   457		ADD	$8, R10		// i++
   458		SUB	$1, R3		// n--
   459		BGT	L1		// if n > 0 goto L1
   460	
   461	E1:	NEG	R4, R4		// turn the saved 0/-1 into 0/1
   462		MOVD	R4, c+72(FP)	// return c
   463		RET
   464	
   465	
   466	// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   467	// func subVV(z, x, y []Word) (c Word)
   468	// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
   469	TEXT ·subVV_novec(SB),NOSPLIT,$0	// scalar implementation behind subVV: 4-word unrolled loop, then a 1-word loop; c = final borrow (0 or 1)
   470		MOVD z_len+8(FP), R3	// R3 = n (number of words)
   471		MOVD x+24(FP), R8	// R8 = x base
   472		MOVD y+48(FP), R9	// R9 = y base
   473		MOVD z+0(FP), R2	// R2 = z base
   474	
   475		MOVD $0, R4		// c = 0
   476		MOVD $0, R0		// make sure it's zero
   477		MOVD $0, R10		// i = 0 (byte offset into x, y and z)
   478	
   479		// replace BLT with BR below to disable the unrolled loop
   480		SUB  $4, R3		// n -= 4
   481		BLT v1			// if n < 0 goto v1
   482	
   483	U1:	// n >= 0
   484		// regular loop body unrolled 4x
   485		MOVD 0(R8)(R10*1), R5
   486		MOVD 8(R8)(R10*1), R6
   487		MOVD 16(R8)(R10*1), R7
   488		MOVD 24(R8)(R10*1), R1
   489		MOVD R0, R11
   490		SUBC R4, R11		// restore CF
   491		MOVD 0(R9)(R10*1), R11
   492		SUBE R11, R5
   493		MOVD 8(R9)(R10*1), R11
   494		SUBE R11, R6
   495		MOVD 16(R9)(R10*1), R11
   496		SUBE R11, R7
   497		MOVD 24(R9)(R10*1), R11
   498		SUBE R11, R1
   499		MOVD R0, R4
   500		SUBE R4, R4		// save CF (R4 = 0 or -1)
   501		MOVD R5, 0(R2)(R10*1)
   502		MOVD R6, 8(R2)(R10*1)
   503		MOVD R7, 16(R2)(R10*1)
   504		MOVD R1, 24(R2)(R10*1)
   505	
   506	
   507		ADD  $32, R10		// i += 4 (32 bytes)
   508		SUB  $4,  R3		// n -= 4
   509		BGE  U1			// if n >= 0 goto U1
   510	
   511	v1:	ADD  $4, R3		// n += 4
   512		BLE E1			// if n <= 0 goto E1
   513	
   514	L1:	// n > 0
   515		MOVD R0, R11
   516		SUBC R4, R11		// restore CF
   517		MOVD 0(R8)(R10*1), R5
   518		MOVD 0(R9)(R10*1), R11
   519		SUBE R11, R5
   520		MOVD R5, 0(R2)(R10*1)
   521		MOVD R0, R4
   522		SUBE R4, R4		// save CF
   523	
   524		ADD  $8, R10		// i++
   525		SUB  $1, R3		// n--
   526		BGT L1			// if n > 0 goto L1
   527	
   528	E1:	NEG  R4, R4		// turn the saved 0/-1 into 0/1
   529		MOVD R4, c+72(FP)	// return c
   530		RET
   531	
   532	TEXT ·addVW(SB),NOSPLIT,$0	// entry point: jumps to whatever routine addwvectorfacility currently points at
   533		MOVD	addwvectorfacility+0x00(SB),R1	// R1 = current target (the DATA directive initialises it to addVW_check)
   534		BR	(R1)	// branch without linking; the target returns to addVW's caller
   535	
   536	TEXT ·addVW_check(SB),NOSPLIT,$0	// picks addVW_vec or addVW_novec from hasVX, records the choice in addwvectorfacility, then runs it
   537		MOVB	·hasVX(SB), R1	// R1 = hasVX flag
   538		CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
   539		MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
   540		MOVD    $·addVW_novec(SB), R2	// R2 = address of the non-vector implementation
   541	        MOVD    R2, 0(R1)	// later addVW calls jump straight to addVW_novec
   542		//MOVD	$·addVW_novec(SB), 0(R1)
   543		BR	·addVW_novec(SB)	// handle the current call
   544	vectorimpl:
   545		MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
   546		MOVD    $·addVW_vec(SB), R2	// R2 = address of the vector implementation
   547	        MOVD    R2, 0(R1)	// later addVW calls jump straight to addVW_vec
   548		//MOVD	$·addVW_vec(SB), 0(R1)
   549		BR	·addVW_vec(SB)	// handle the current call
   550	
   551	GLOBL addwvectorfacility+0x00(SB), NOPTR, $8	// 8-byte dispatch pointer read by addVW
   552	DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)	// initial target: addVW_check, which overwrites it with the chosen implementation
   553	
   554	
   555	// func addVW_vec(z, x []Word, y Word) (c Word)
   556	TEXT ·addVW_vec(SB),NOSPLIT,$0	// vector implementation behind addVW: adds the single word y into x, propagating the carry; c = final carry
   557		MOVD	z_len+8(FP), R3	// R3 = n (number of words)
   558		MOVD	x+24(FP), R8	// R8 = x base
   559		MOVD	y+48(FP), R4	// c = y
   560		MOVD	z+0(FP), R2	// R2 = z base
   561	
   562		MOVD	$0, R0		// make sure it's zero
   563		MOVD	$0, R10		// i = 0 (byte offset into x and z)
   564		MOVD	R8, R5		// R5 = running x pointer for the vector loop
   565		MOVD	R2, R7		// R7 = running z pointer for the vector loop
   566	
   567		// replace BLT with BR below to disable the unrolled loops
   568		SUB	$4, R3			// n -= 4
   569		BLT	v10			// if n < 0 goto v10
   570		SUB	$12, R3			// n -= 12 (16 in total)
   571		BLT	A10			// if n < 0 goto A10
   572	
   573		// n >= 0
   574		// regular loop body unrolled 16x
   575	
   576		VZERO	V0			// prepare V0 to be final carry register
   577		VZERO	V9			// to ensure upper half is zero
   578		VLVGG	$1, R4, V9		// V9 = y in doubleword element 1
   579	UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
   580		ADD	$64, R5
   581		VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   582		VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   583	
   584	
   585		VACCCQ	V1, V9, V0, V25		// V25 = carry out of V1 + V9 + carry-in V0
   586		VACQ	V1, V9, V0, V17		// V17 = V1 + V9 + carry-in V0
   587		VZERO	V9			// y is added only once; from here on only the carry propagates
   588		VACCCQ	V2, V9, V25, V26
   589		VACQ	V2, V9, V25, V18
   590	
   591	
   592		VLM	0(R5), V5, V6		// 32-bytes into V5..V6
   593		ADD	$32, R5
   594	
   595		VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   596		VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   597	
   598		VACCCQ	V3, V9, V26, V27
   599		VACQ	V3, V9, V26, V19
   600		VACCCQ	V4, V9, V27, V28
   601		VACQ	V4, V9, V27, V20
   602	
   603		VLM	0(R5), V7, V8		// 32-bytes into V7..V8
   604		ADD	$32, R5
   605	
   606		VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   607		VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   608	
   609		VACCCQ	V5, V9, V28, V29
   610		VACQ	V5, V9, V28, V21
   611		VACCCQ	V6, V9, V29, V30
   612		VACQ	V6, V9, V29, V22
   613	
   614		VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   615		VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   616	
   617		VACCCQ	V7, V9, V30, V31
   618		VACQ	V7, V9, V30, V23
   619		VACCCQ	V8, V9, V31, V0	//V0 has carry-over
   620		VACQ	V8, V9, V31, V24
   621	
   622		VPDI	$0x4,V17,V17,V17	// flip the result doublewords back to memory order
   623		VPDI	$0x4,V18,V18,V18	// flip the result doublewords back to memory order
   624		VPDI	$0x4,V19,V19,V19	// flip the result doublewords back to memory order
   625		VPDI	$0x4,V20,V20,V20	// flip the result doublewords back to memory order
   626		VPDI	$0x4,V21,V21,V21	// flip the result doublewords back to memory order
   627		VPDI	$0x4,V22,V22,V22	// flip the result doublewords back to memory order
   628		VPDI	$0x4,V23,V23,V23	// flip the result doublewords back to memory order
   629		VPDI	$0x4,V24,V24,V24	// flip the result doublewords back to memory order
   630		VSTM	V17, V24, 0(R7)   	// 128-bytes into z
   631		ADD	$128, R7
   632		ADD	$128, R10		// i += 16 (128 bytes)
   633		SUB	$16,  R3		// n -= 16
   634		BGE	UU1		// if n >= 0 goto UU1
   635		VLGVG	$1, V0, R4	// put cf into R4 in case we branch to v10
   636	
   637	A10:	ADD	$12, R3		// n += 12 (undo the extra bias; n is now words remaining minus 4)
   638	
   639	
   640		// replace BLT with BR below to disable the 4x unrolled loop
   641	
   642		BLT	v10		// if n < 0 goto v10
   643	
   644	
   645	U4:	// n >= 0
   646		// regular loop body unrolled 4x
   647		MOVD 0(R8)(R10*1), R5
   648		MOVD 8(R8)(R10*1), R6
   649		MOVD 16(R8)(R10*1), R7
   650		MOVD 24(R8)(R10*1), R1
   651		ADDC R4, R5		// add c (y or the previous carry) to the first word
   652		ADDE R0, R6		// propagate the carry through the next three words
   653		ADDE R0, R7
   654		ADDE R0, R1
   655		ADDE R0, R0		// R0 = 0 + 0 + CF
   656		MOVD R0, R4		// save CF
   657		SUB  R0, R0		// restore R0 = 0
   658		MOVD R5, 0(R2)(R10*1)
   659		MOVD R6, 8(R2)(R10*1)
   660		MOVD R7, 16(R2)(R10*1)
   661		MOVD R1, 24(R2)(R10*1)
   662	
   663		ADD $32, R10		// i += 4 -> i +=32
   664		SUB $4, R3		// n -= 4
   665		BGE U4			// if n >= 0 goto U4
   666	
   667	v10:	ADD $4, R3		// n += 4
   668		BLE E10			// if n <= 0 goto E10
   669	
   670	
   671	L4:	// n > 0
   672		MOVD	0(R8)(R10*1), R5
   673		ADDC	R4, R5		// add c to the current word
   674		ADDE	R0, R0		// R0 = 0 + 0 + CF
   675		MOVD	R0, R4		// save CF
   676		SUB 	R0, R0		// restore R0 = 0
   677		MOVD	R5, 0(R2)(R10*1)
   678	
   679		ADD	$8, R10		// i++
   680		SUB	$1, R3		// n--
   681		BGT	L4		// if n > 0 goto L4
   682	
   683	E10:	MOVD	R4, c+56(FP)	// return c
   684	
   685		RET
   686	
   687	
   688	TEXT ·addVW_novec(SB),NOSPLIT,$0	// scalar implementation behind addVW: adds the single word y into x, propagating the carry; c = final carry
   689	//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
   690		MOVD z_len+8(FP), R3	// R3 = n (number of words)
   691		MOVD x+24(FP), R8	// R8 = x base
   692		MOVD y+48(FP), R4	// c = y
   693		MOVD z+0(FP), R2	// R2 = z base
   694		MOVD $0, R0		// make sure it's 0
   695		MOVD $0, R10		// i = 0 (byte offset into x and z)
   696	
   697		// replace BLT with BR below to disable the unrolled loop
   698		SUB $4, R3		// n -= 4
   699		BLT v4			// if n < 0 (fewer than 4 words) goto v4
   700	
   701	U4:	// n >= 0
   702		// regular loop body unrolled 4x
   703		MOVD 0(R8)(R10*1), R5
   704		MOVD 8(R8)(R10*1), R6
   705		MOVD 16(R8)(R10*1), R7
   706		MOVD 24(R8)(R10*1), R1
   707		ADDC R4, R5		// add c (y or the previous carry) to the first word
   708		ADDE R0, R6		// propagate the carry through the next three words
   709		ADDE R0, R7
   710		ADDE R0, R1
   711		ADDE R0, R0		// R0 = 0 + 0 + CF
   712		MOVD R0, R4		// save CF
   713		SUB  R0, R0		// restore R0 = 0
   714		MOVD R5, 0(R2)(R10*1)
   715		MOVD R6, 8(R2)(R10*1)
   716		MOVD R7, 16(R2)(R10*1)
   717		MOVD R1, 24(R2)(R10*1)
   718	
   719		ADD $32, R10		// i += 4 -> i +=32
   720		SUB $4, R3		// n -= 4
   721		BGE U4			// if n >= 0 goto U4
   722	
   723	v4:	ADD $4, R3		// n += 4
   724		BLE E4			// if n <= 0 goto E4
   725	
   726	L4:	// n > 0
   727		MOVD 0(R8)(R10*1), R5
   728		ADDC R4, R5		// add c to the current word
   729		ADDE R0, R0		// R0 = 0 + 0 + CF
   730		MOVD R0, R4		// save CF
   731		SUB  R0, R0		// restore R0 = 0
   732		MOVD R5, 0(R2)(R10*1)
   733	
   734		ADD  $8, R10		// i++
   735		SUB  $1, R3		// n--
   736		BGT L4			// if n > 0 goto L4
   737	
   738	E4:	MOVD R4, c+56(FP)	// return c
   739	
   740		RET
   741	
   742	TEXT ·subVW(SB),NOSPLIT,$0	// entry point: jumps to whatever routine subwvectorfacility currently points at
   743		MOVD	subwvectorfacility+0x00(SB),R1	// R1 = current target (the DATA directive initialises it to subVW_check)
   744		BR	(R1)	// branch without linking; the target returns to subVW's caller
   745	
   746	TEXT ·subVW_check(SB),NOSPLIT,$0	// picks subVW_vec or subVW_novec from hasVX, records the choice in subwvectorfacility, then runs it
   747		MOVB	·hasVX(SB), R1	// R1 = hasVX flag
   748		CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
   749		MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
   750		MOVD    $·subVW_novec(SB), R2	// R2 = address of the non-vector implementation
   751	        MOVD    R2, 0(R1)	// later subVW calls jump straight to subVW_novec
   752		//MOVD	$·subVW_novec(SB), 0(R1)
   753		BR	·subVW_novec(SB)	// handle the current call
   754	vectorimpl:
   755		MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch pointer
   756		MOVD    $·subVW_vec(SB), R2	// R2 = address of the vector implementation
   757	        MOVD    R2, 0(R1)	// later subVW calls jump straight to subVW_vec
   758		//MOVD	$·subVW_vec(SB), 0(R1)
   759		BR	·subVW_vec(SB)	// handle the current call
   760	
   761	GLOBL subwvectorfacility+0x00(SB), NOPTR, $8	// 8-byte dispatch pointer read by subVW
   762	DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)	// initial target: subVW_check, which overwrites it with the chosen implementation
   763	
   764	// func subVW(z, x []Word, y Word) (c Word)
   765	TEXT ·subVW_vec(SB),NOSPLIT,$0	// vector implementation behind subVW: subtracts the single word y from x, propagating the borrow; c = final borrow
   766		MOVD	z_len+8(FP), R3	// R3 = n (number of words)
   767		MOVD	x+24(FP), R8	// R8 = x base
   768		MOVD	y+48(FP), R4	// c = y
   769		MOVD	z+0(FP), R2	// R2 = z base
   770	
   771		MOVD	$0, R0		// make sure it's zero
   772		MOVD	$0, R10		// i = 0 (byte offset into x and z)
   773		MOVD	R8, R5		// R5 = running x pointer for the vector loop
   774		MOVD	R2, R7		// R7 = running z pointer for the vector loop
   775	
   776		// replace BLT with BR below to disable the unrolled loops
   777		SUB	$4, R3			// n -= 4
   778		BLT	v11			// if n < 0 goto v11
   779		SUB	$12, R3			// n -= 12 (16 in total)
   780		BLT	A11			// if n < 0 goto A11
   781	
   782		VZERO	V0
   783		MOVD	$1, R6			// prepare V0 to be final carry register
   784		VLVGG	$1, R6, V0		// borrow is initially "no borrow"
   785		VZERO	V9			// to ensure upper half is zero
   786		VLVGG	$1, R4, V9		// V9 = y in doubleword element 1
   787	
   788		// n >= 0
   789		// regular loop body unrolled 16x
   790	
   791	
   792	UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
   793		ADD	$64, R5
   794		VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   795		VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   796	
   797	
   798		VSBCBIQ	V1, V9, V0, V25		// V25 = borrow indication out of V1 - V9 with borrow-in V0
   799		VSBIQ	V1, V9, V0, V17		// V17 = V1 - V9 with borrow-in V0
   800		VZERO	V9			// y is subtracted only once; from here on only the borrow propagates
   801		VSBCBIQ	V2, V9, V25, V26
   802		VSBIQ	V2, V9, V25, V18
   803	
   804		VLM	0(R5), V5, V6		// 32-bytes into V5..V6
   805		ADD	$32, R5
   806	
   807		VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   808		VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   809	
   810	
   811		VSBCBIQ	V3, V9, V26, V27
   812		VSBIQ	V3, V9, V26, V19
   813		VSBCBIQ	V4, V9, V27, V28
   814		VSBIQ	V4, V9, V27, V20
   815	
   816		VLM	0(R5), V7, V8		// 32-bytes into V7..V8
   817		ADD	$32, R5
   818	
   819		VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   820		VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   821	
   822		VSBCBIQ	V5, V9, V28, V29
   823		VSBIQ	V5, V9, V28, V21
   824		VSBCBIQ	V6, V9, V29, V30
   825		VSBIQ	V6, V9, V29, V22
   826	
   827		VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   828		VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   829	
   830		VSBCBIQ	V7, V9, V30, V31
   831		VSBIQ	V7, V9, V30, V23
   832		VSBCBIQ	V8, V9, V31, V0	// V0 has carry-over
   833		VSBIQ	V8, V9, V31, V24
   834	
   835		VPDI	$0x4,V17,V17,V17	// flip the result doublewords back to memory order
   836		VPDI	$0x4,V18,V18,V18	// flip the result doublewords back to memory order
   837		VPDI	$0x4,V19,V19,V19	// flip the result doublewords back to memory order
   838		VPDI	$0x4,V20,V20,V20	// flip the result doublewords back to memory order
   839		VPDI	$0x4,V21,V21,V21	// flip the result doublewords back to memory order
   840		VPDI	$0x4,V22,V22,V22	// flip the result doublewords back to memory order
   841		VPDI	$0x4,V23,V23,V23	// flip the result doublewords back to memory order
   842		VPDI	$0x4,V24,V24,V24	// flip the result doublewords back to memory order
   843		VSTM	V17, V24, 0(R7)   	// 128-bytes into z
   844		ADD	$128, R7
   845		ADD	$128, R10		// i += 16 (128 bytes)
   846		SUB	$16,  R3		// n -= 16
   847		BGE	UU1			// if n >= 0 goto UU1
   848		VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11
   849		SUB	$1, R4			// save cf
   850		NEG	R4, R4			// R4 = 1 - cf: the borrow (0 or 1) to subtract from the next word
   851	A11:	ADD	$12, R3			// n += 12 (undo the extra bias; n is now words remaining minus 4)
   852	
   853		BLT	v11			// if n < 0 goto v11
   854	
   855		// n >= 0
   856		// regular loop body unrolled 4x
   857	
   858	U4:	// n >= 0
   859		// regular loop body unrolled 4x
   860		MOVD 0(R8)(R10*1), R5
   861		MOVD 8(R8)(R10*1), R6
   862		MOVD 16(R8)(R10*1), R7
   863		MOVD 24(R8)(R10*1), R1
   864		SUBC R4, R5 //SLGR  -> SUBC
   865		SUBE R0, R6 //SLBGR -> SUBE
   866		SUBE R0, R7
   867		SUBE R0, R1
   868		SUBE R4, R4		// save CF
   869		NEG  R4, R4		// R4 = borrow as 0 or 1
   870		MOVD R5, 0(R2)(R10*1)
   871		MOVD R6, 8(R2)(R10*1)
   872		MOVD R7, 16(R2)(R10*1)
   873		MOVD R1, 24(R2)(R10*1)
   874	
   875		ADD $32, R10		// i += 4 -> i +=32
   876		SUB $4, R3		// n -= 4
   877		BGE U4			// if n >= 0 goto U4
   878	
   879	v11:	ADD $4, R3		// n += 4
   880		BLE E11			// if n <= 0 goto E11
   881	
   882	L4:	// n > 0
   883	
   884		MOVD	0(R8)(R10*1), R5
   885		SUBC	R4, R5		// subtract c from the current word
   886		SUBE	R4, R4		// save CF
   887		NEG	R4, R4		// R4 = borrow as 0 or 1
   888		MOVD	R5, 0(R2)(R10*1)
   889	
   890		ADD	$8, R10		// i++
   891		SUB	$1, R3		// n--
   892		BGT	L4		// if n > 0 goto L4
   893	
   894	E11:	MOVD	R4, c+56(FP)	// return c
   895	
   896		RET
   897	
   898	//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
   899	// func subVW(z, x []Word, y Word) (c Word)
   900	// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
   901	TEXT ·subVW_novec(SB),NOSPLIT,$0	// scalar implementation behind subVW: subtracts the single word y from x, propagating the borrow; c = final borrow
   902		MOVD z_len+8(FP), R3	// R3 = n (number of words)
   903		MOVD x+24(FP), R8	// R8 = x base
   904		MOVD y+48(FP), R4	// c = y
   905		MOVD z+0(FP), R2	// R2 = z base
   906		MOVD $0, R0		// make sure it's 0
   907		MOVD $0, R10		// i = 0 (byte offset into x and z)
   908	
   909		// replace BLT with BR below to disable the unrolled loop
   910		SUB $4, R3		// n -= 4
   911		BLT v4			// if n < 0 (fewer than 4 words) goto v4
   912	
   913	U4:	// n >= 0
   914		// regular loop body unrolled 4x
   915		MOVD 0(R8)(R10*1), R5
   916		MOVD 8(R8)(R10*1), R6
   917		MOVD 16(R8)(R10*1), R7
   918		MOVD 24(R8)(R10*1), R1
   919		SUBC R4, R5 //SLGR  -> SUBC
   920		SUBE R0, R6 //SLBGR -> SUBE
   921		SUBE R0, R7
   922		SUBE R0, R1
   923		SUBE R4, R4		// save CF
   924		NEG  R4, R4		// R4 = borrow as 0 or 1
   925		MOVD R5, 0(R2)(R10*1)
   926		MOVD R6, 8(R2)(R10*1)
   927		MOVD R7, 16(R2)(R10*1)
   928		MOVD R1, 24(R2)(R10*1)
   929	
   930		ADD $32, R10		// i += 4 -> i +=32
   931		SUB $4, R3		// n -= 4
   932		BGE U4			// if n >= 0 goto U4
   933	
   934	v4:	ADD $4, R3		// n += 4
   935		BLE E4			// if n <= 0 goto E4
   936	
   937	L4:	// n > 0
   938		MOVD 0(R8)(R10*1), R5
   939		SUBC R4, R5		// subtract c from the current word
   940		SUBE R4, R4		// save CF
   941		NEG  R4, R4		// R4 = borrow as 0 or 1
   942		MOVD R5, 0(R2)(R10*1)
   943	
   944		ADD  $8, R10		// i++
   945		SUB  $1, R3		// n--
   946		BGT L4			// if n > 0 goto L4
   947	
   948	E4:	MOVD R4, c+56(FP)	// return c
   949	
   950		RET
   951	
   952	// func shlVU(z, x []Word, s uint) (c Word)
   953	TEXT ·shlVU(SB),NOSPLIT,$0	// z = x << s, walking from the top word down; c = bits shifted out of x[n-1]; s == 0 and s == 64 have dedicated paths
   954		MOVD	z_len+8(FP), R5	// R5 = n
   955		MOVD	$0, R0
   956		SUB	$1, R5             // n--
   957		BLT	X8b                // n < 0        (n <= 0)
   958	
   959		// n > 0
   960		MOVD	s+48(FP), R4	// R4 = s
   961		CMPBEQ	R0, R4, Z80	   //handle 0 case beq
   962		MOVD	$64, R6
   963		CMPBEQ	R6, R4, Z864	   //handle 64 case beq
   964		MOVD	z+0(FP), R2	// R2 = z base
   965		MOVD	x+24(FP), R8	// R8 = x base
   966		SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word
   967		SUB	R4, R6, R7	// R7 = 64 - s (written ŝ below)
   968		MOVD	(R8)(R5*1), R10    // w1 = x[n-1]
   969		SRD	R7, R10, R3	// R3 = w1 >> ŝ
   970		MOVD	R3, c+56(FP)	// c = x[n-1] >> ŝ
   971	
   972		MOVD	$0, R1             // R1 = 0 (not read on this path; R5 is the index)
   973		BR	E8
   974	
   975		// i > 0
   976	L8:	MOVD	R10, R3             // w = w1
   977		MOVD	-8(R8)(R5*1), R10   // w1 = x[i-1]
   978	
   979		SLD	R4,  R3             // w<<s | w1>>ŝ
   980		SRD	R7, R10, R6
   981		OR 	R6, R3
   982		MOVD	R3, (R2)(R5*1)      // z[i] = w<<s | w1>>ŝ
   983		SUB	$8, R5              // i--
   984	
   985	E8:	CMPBGT	R5, R0, L8	    // loop while i > 0
   986	
   987		// i == 0
   988	X8a:	SLD	R4, R10             // w1<<s
   989		MOVD	R10, (R2)           // z[0] = w1<<s
   990		RET
   991	
   992	X8b:	MOVD	R0, c+56(FP)	// empty input: c = 0
   993		RET
   994	
   995	Z80:	MOVD	z+0(FP), R2	// s == 0: copy x to z, low word first, and return c = 0
   996		MOVD	x+24(FP), R8
   997		SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word
   998	
   999		MOVD	(R8), R10	// R10 = x[0]
  1000		MOVD	$0, R3
  1001		MOVD	R3, c+56(FP)	// c = 0
  1002	
  1003		MOVD	$0, R1             // i = 0
  1004		BR	E8Z
  1005	
  1006		// i < n-1
  1007	L8Z:	MOVD	R10, R3	// R3 = x[i]
  1008		MOVD	8(R8)(R1*1), R10	// R10 = x[i+1], read before z[i] is written
  1009	
  1010		MOVD	R3, (R2)(R1*1)	// z[i] = x[i]
  1011		ADD 	$8, R1	// i++
  1012	
  1013	E8Z:	CMPBLT	R1, R5, L8Z	// loop while i < n-1
  1014	
  1015		// i >= n-1
  1016		MOVD	R10, (R2)(R5*1)	// z[n-1] = x[n-1]
  1017		RET
  1018	
  1019	Z864:	MOVD	z+0(FP), R2	// s == 64: move every word up one slot
  1020		MOVD	x+24(FP), R8
  1021		SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word
  1022		MOVD	(R8)(R5*1), R3     // w1 = x[n-1]
  1023		MOVD	R3, c+56(FP)       // c = x[n-1]
  1024	
  1025		BR	E864
  1026	
  1027		// i > 0
  1028	L864:	MOVD	-8(R8)(R5*1), R3	// R3 = x[i-1]
  1029	
  1030		MOVD	R3, (R2)(R5*1)     // z[i] = x[i-1]
  1031		SUB	$8, R5             // i--
  1032	
  1033	E864:	CMPBGT	R5, R0, L864       // loop while i > 0
  1034	
  1035		MOVD	R0, (R2)           // z[0] = 0
  1036		RET
  1037	
  1038	
  1039	// shrVU sets z = x >> s word by word, from x[0] upward, and returns in c the bits shifted out of x[0], left-aligned as x[0] << (64-s). Registers: R2 = &z[0], R8 = &x[0], R4 = s, R7 = 64-s, R5 = (n-1)*8, R1 = i*8, R10 = w1, R3 = w, R6 = temp, R0 = 0.
  1040	// func shrVU(z, x []Word, s uint) (c Word)
  1041	TEXT ·shrVU(SB),NOSPLIT,$0
  1042		MOVD	z_len+8(FP), R5	// R5 = n = len(z)
  1043		MOVD	$0, R0	// R0 = 0, the zero constant used below
  1044		SUB	$1, R5             // R5 = n-1
  1045		BLT	X9b                // n <= 0: nothing to shift
  1046	
  1047		// n > 0: s == 0 and s == 64 have dedicated copy paths, everything else takes the general path
  1048		MOVD	s+48(FP), R4
  1049		CMPBEQ	R0, R4, ZB0	// s == 0: plain copy
  1050		MOVD	$64, R6
  1051		CMPBEQ 	R6, R4, ZB64	// s == 64: move every word down by one
  1052		MOVD	z+0(FP), R2
  1053		MOVD	x+24(FP), R8
  1054		SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
  1055		SUB	R4, R6, R7	// R7 = 64 - s
  1056		MOVD	(R8), R10	// w1 = x[0]
  1057		SLD	R7, R10, R3	// R3 = w1 << (64-s)
  1058		MOVD	R3, c+56(FP)	// c = bits shifted out of x[0]
  1059	
  1060		MOVD	$0, R1		// R1 = i*8 = 0
  1061		BR 	E9
  1062	
  1063		// i < n-1
  1064	L9:	MOVD	R10, R3		// w = w1
  1065		MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
  1066	
  1067		SRD	R4,  R3		// w >> s
  1068		SLD	R7, R10, R6	// w1 << (64-s)
  1069		OR	R6, R3
  1070		MOVD	R3, (R2)(R1*1)	// z[i] = w>>s | w1<<(64-s)
  1071		ADD	$8, R1		// i++ (R1 advances by one word)
  1072	
  1073	E9:	CMPBLT	R1, R5, L9	// i < n-1
  1074	
  1075		// i >= n-1
  1076	X9a:	SRD	R4, R10		// w1>>s
  1077		MOVD	R10, (R2)(R5*1)	// z[n-1] = w1>>s
  1078		RET
  1079	
  1080	X9b:	MOVD	R0, c+56(FP)	// empty z: c = 0
  1081		RET
  1082	
  1083	ZB0:	MOVD	z+0(FP), R2
  1084		MOVD	x+24(FP), R8
  1085		SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
  1086		// s == 0: copy x to z in ascending order; nothing is shifted out
  1087		MOVD	(R8), R10	// w1 = x[0]
  1088		MOVD	$0, R3		// R3 = 0
  1089		MOVD	R3, c+56(FP)	// c = 0
  1090	
  1091		MOVD	$0, R1		// R1 = i*8 = 0
  1092		BR	E9Z
  1093	
  1094		// i < n-1
  1095	L9Z:	MOVD	R10, R3		// w = w1
  1096		MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
  1097	
  1098		MOVD	R3, (R2)(R1*1)	// z[i] = w
  1099		ADD	$8, R1		// i++ (R1 advances by one word)
  1100	
  1101	E9Z:	CMPBLT	R1, R5, L9Z	// i < n-1
  1102	
  1103		// i >= n-1
  1104		MOVD	R10, (R2)(R5*1)	// z[n-1] = w1
  1105		RET
  1106	
  1107	ZB64:	MOVD	z+0(FP), R2
  1108		MOVD	x+24(FP), R8
  1109		SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
  1110		MOVD	(R8), R3	// R3 = x[0]
  1111		MOVD	R3, c+56(FP)	// c = x[0], the whole bottom word is shifted out
  1112	
  1113		MOVD	$0, R1		// R1 = i*8 = 0
  1114		BR	E964
  1115	
  1116		// i < n-1
  1117	L964:	MOVD	8(R8)(R1*1), R3	// R3 = x[i+1]
  1118	
  1119		MOVD	R3, (R2)(R1*1)	// z[i] = x[i+1]
  1120		ADD	$8, R1		// i++ (R1 advances by one word)
  1121	
  1122	E964:	CMPBLT	R1, R5, L964	// i < n-1
  1123	
  1124		// i >= n-1
  1125		MOVD	$0, R10            // top word becomes zero
  1126		MOVD	R10, (R2)(R5*1)    // z[n-1] = 0
  1127		RET
  1128	
  1129	// mulAddVWW stores in z[i] the low word of x[i]*y + c for i = 0..len(z)-1, starting with c = r and carrying the high word into the next step; the last carry is returned in c. Registers: R2 = &z[0], R8 = &x[0], R9 = y, R4 = c, R5 = n, R1 = i*8, R7 = i, R6 = high product word, R11 = low product word, R0 = 0.
  1130	// func mulAddVWW(z, x []Word, y, r Word) (c Word)
  1131	TEXT ·mulAddVWW(SB),NOSPLIT,$0
  1132		MOVD	z+0(FP), R2
  1133		MOVD	x+24(FP), R8
  1134		MOVD	y+48(FP), R9
  1135		MOVD	r+56(FP), R4	// c = r
  1136		MOVD	z_len+8(FP), R5	// R5 = n = len(z)
  1137		MOVD	$0, R1		// i*8 = 0 (byte offset into x and z)
  1138		MOVD	$0, R7		// i = 0
  1139		MOVD	$0, R0		// make sure it's zero
  1140		BR	E5
  1141	
  1142	L5:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
  1143		MULHDU	R9, R6	// R6 = high word of x[i]*y; the low word is then read from R11, the same convention mulWW uses (NOTE(review): implicit assembler behavior, confirm before changing register use)
  1144		ADDC	R4, R11 	// low += c, sets the carry
  1145		ADDE	R0, R6	// high += carry
  1146		MOVD	R11, (R2)(R1*1)	// z[i] = low
  1147		MOVD	R6, R4	// c = high
  1148		ADD	$8, R1		// i*8 += 8
  1149		ADD	$1, R7		// i++
  1150	
  1151	E5:	CMPBLT	R7, R5, L5	// i < n
  1152	
  1153		MOVD	R4, c+64(FP)	// return c
  1154		RET
  1155	
  1156	// func addMulVVW(z, x []Word, y Word) (c Word)
  1157	// addMulVVW adds x[i]*y plus the running carry to z[i] for i = 0..len(z)-1 and returns the final carry word in c; words are handled two per pass (A6) with a one-word loop (L6) for what remains. Registers: R2 = &z[0], R8 = &x[0], R9 = y, R4 = c, R5 = n, R12 = n rounded down to even, R1 = i*8, R7 = i, R6 = high product word, R11 = low product word, R10 = z[i], R0 = 0.
  1158	TEXT ·addMulVVW(SB),NOSPLIT,$0
  1159		MOVD	z+0(FP), R2
  1160		MOVD	x+24(FP), R8
  1161		MOVD	y+48(FP), R9
  1162		MOVD	z_len+8(FP), R5	// R5 = n = len(z)
  1163	
  1164		MOVD	$0, R1		// i*8 = 0
  1165		MOVD	$0, R7		// i = 0
  1166		MOVD	$0, R0		// make sure it's zero
  1167		MOVD	$0, R4		// c = 0
  1168	
  1169		MOVD	R5, R12
  1170		AND	$-2, R12	// R12 = n with the lowest bit cleared
  1171		CMPBGE	R5, $2, A6	// at least two words: start with the two-per-pass loop
  1172		BR	E6
  1173	
  1174	A6:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
  1175		MULHDU	R9, R6	// R6 = high word of x[i]*y; low word is read from R11 below (same convention as mulWW)
  1176		MOVD	(R2)(R1*1), R10	// R10 = z[i]; loaded after MULHDU, which mulWW shows also writes R10
  1177		ADDC	R10, R11	// low += z[i], sets the carry
  1178		ADDE	R0, R6	// high += carry
  1179		ADDC	R4, R11	// low += c, sets the carry
  1180		ADDE	R0, R6	// high += carry
  1181		MOVD	R6, R4	// c = high
  1182		MOVD	R11, (R2)(R1*1)	// z[i] = low
  1183	
  1184		MOVD	(8)(R8)(R1*1), R6	// R6 = x[i+1]
  1185		MULHDU	R9, R6	// R6 = high word of x[i+1]*y, low word in R11
  1186		MOVD	(8)(R2)(R1*1), R10	// R10 = z[i+1]
  1187		ADDC	R10, R11	// low += z[i+1], sets the carry
  1188		ADDE	R0, R6	// high += carry
  1189		ADDC	R4, R11	// low += c, sets the carry
  1190		ADDE	R0, R6	// high += carry
  1191		MOVD	R6, R4	// c = high
  1192		MOVD	R11, (8)(R2)(R1*1)	// z[i+1] = low
  1193	
  1194		ADD	$16, R1		// i*8 += 16
  1195		ADD	$2, R7		// i += 2
  1196	
  1197		CMPBLT	R7, R12, A6	// i < R12: another pair remains
  1198		BR	E6
  1199	
  1200	L6:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
  1201		MULHDU	R9, R6	// R6 = high word of x[i]*y, low word in R11
  1202		MOVD	(R2)(R1*1), R10	// R10 = z[i]
  1203		ADDC	R10, R11	// low += z[i], sets the carry
  1204		ADDE	R0, R6	// high += carry
  1205		ADDC	R4, R11	// low += c, sets the carry
  1206		ADDE	R0, R6	// high += carry
  1207		MOVD	R6, R4	// c = high
  1208		MOVD	R11, (R2)(R1*1)	// z[i] = low
  1209	
  1210		ADD	$8, R1		// i*8 += 8
  1211		ADD	$1, R7		// i++
  1212	
  1213	E6:	CMPBLT	R7, R5, L6	// i < n
  1214	
  1215		MOVD	R4, c+56(FP)	// return c
  1216		RET
  1217	
  1218	// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
  1219	// divWVW divides the words of x by y from index len(z)-1 down to 0, using the previous remainder (initially xn) as the high word of each two-word dividend; each quotient word is stored in z[i] and the final remainder is returned in r. Registers: R2 = &z[0], R8 = &x[0], R9 = y, R10 = r, R11 = x[i] then quotient, R7 = i, R1 = i*8, R0 = 0.
  1220	TEXT ·divWVW(SB),NOSPLIT,$0
  1221		MOVD	z+0(FP), R2
  1222		MOVD	xn+24(FP), R10	// r = xn
  1223		MOVD	x+32(FP), R8
  1224		MOVD	y+56(FP), R9
  1225		MOVD	z_len+8(FP), R7	// i = len(z)
  1226		SLD	$3, R7, R1		// i*8
  1227		MOVD	$0, R0		// make sure it's zero
  1228		BR	E7
  1229	
  1230	L7:	MOVD	(R8)(R1*1), R11	// R11 = x[i], low word of the dividend r:x[i]
  1231		WORD	$0xB98700A9	// DLGR R10,R9: divide R10:R11 by R9; the code then takes the quotient from R11 and the remainder from R10, as divWW does
  1232		MOVD	R11, (R2)(R1*1)	// z[i] = quotient
  1233	
  1234	E7:	SUB	$1, R7		// i--
  1235		SUB	$8, R1	// i*8 -= 8; the BGE below follows this SUB
  1236		BGE	L7		// i >= 0
  1237	
  1238		MOVD	R10, r+64(FP)	// return the last remainder
  1239		RET

View as plain text