Text file src/hash/crc32/crc32_s390x.s

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	// Vector registers V9..V14, in constant-pool order, holding the CRC-32 constants
     8	
     9	#define CONST_PERM_LE2BE        V9
    10	#define CONST_R2R1              V10
    11	#define CONST_R4R3              V11
    12	#define CONST_R5                V12
    13	#define CONST_RU_POLY           V13
    14	#define CONST_CRC_POLY          V14
    15	
    16	
    17	// The CRC-32 constant block contains reduction constants to fold and
    18	// process particular chunks of the input data stream in parallel.
    19	//
    20	// Note that the constant definitions below are extended in order to compute
    21	// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
    22	// The leftmost doubleword can be 0 (as for R5, u' and P'(x) below) to prevent
    23	// contribution to the result or can be 1 to perform an XOR without the need
    24	// for a separate VECTOR EXCLUSIVE OR instruction.
    25	//
    26	// The polynomials used are bit-reflected:
    27	//
    28	//            IEEE: P'(x) = 0x0edb88320
    29	//      Castagnoli: P'(x) = 0x082f63b78
    30	
    31	
    32	// IEEE polynomial constants: six 16-byte vectors, loaded into V9..V14 by vectorizedBody
    33	DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask (CONST_PERM_LE2BE): byte indices 15..8
    34	DATA    ·crclecons+8(SB)/8,  $0x0706050403020100       // LE-to-BE mask, continued: byte indices 7..0
    35	DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2 (CONST_R2R1, leftmost doubleword)
    36	DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1 (CONST_R2R1, rightmost doubleword)
    37	DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4 (CONST_R4R3, leftmost doubleword)
    38	DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3 (CONST_R4R3, rightmost doubleword)
    39	DATA    ·crclecons+48(SB)/8, $0x0000000000000000       // zero (CONST_R5, leftmost doubleword)
    40	DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
    41	DATA    ·crclecons+64(SB)/8, $0x0000000000000000       // zero (CONST_RU_POLY, leftmost doubleword)
    42	DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
    43	DATA    ·crclecons+80(SB)/8, $0x0000000000000000       // zero (CONST_CRC_POLY, leftmost doubleword)
    44	DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
    45	
    46	GLOBL    ·crclecons(SB),RODATA, $144 // only the first 96 bytes are initialized by the DATA lines above
    47	
    48	// Castagnoli polynomial constants: same layout as the IEEE table, loaded into V9..V14
    49	DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask (CONST_PERM_LE2BE): byte indices 15..8
    50	DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100      // LE-to-BE mask, continued: byte indices 7..0
    51	DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2 (CONST_R2R1, leftmost doubleword)
    52	DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1 (CONST_R2R1, rightmost doubleword)
    53	DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4 (CONST_R4R3, leftmost doubleword)
    54	DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3 (CONST_R4R3, rightmost doubleword)
    55	DATA    ·crcclecons+48(SB)/8, $0x0000000000000000      // zero (CONST_R5, leftmost doubleword)
    56	DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
    57	DATA    ·crcclecons+64(SB)/8, $0x0000000000000000      // zero (CONST_RU_POLY, leftmost doubleword)
    58	DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
    59	DATA    ·crcclecons+80(SB)/8, $0x0000000000000000      // zero (CONST_CRC_POLY, leftmost doubleword)
    60	DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
    61	
    62	GLOBL   ·crcclecons(SB),RODATA, $144 // only the first 96 bytes are initialized by the DATA lines above
    63	
    64	// func hasVectorFacility() bool
    65	TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 // true only if facility bit 129 is set and a vector register round-trip works
    66		MOVD    $x-24(SP), R1     // R1 = address of the 24-byte buffer in this frame
    67		XC      $24, 0(R1), 0(R1) // clear the storage
    68		MOVD    $2, R0            // R0 is the number of double words stored -1
    69		WORD    $0xB2B01000       // STFLE 0(R1)
    70		XOR     R0, R0            // R0 = 0 again after its use as the STFLE count
    71		MOVBZ   z-8(SP), R1       // byte 16 of the stored facility list (bits 128-135)
    72		AND     $0x40, R1         // keep facility bit 129 (vector facility)
    73		BEQ     novector          // bit clear: report false
    74	vectorinstalled:
    75		// check if the vector instruction has been enabled
    76		VLEIB   $0, $0xF, V16     // write 0xF into byte element 0 of V16
    77		VLGVB   $0, V16, R1       // read byte element 0 back into R1
    78		CMPBNE  R1, $0xF, novector // value did not round-trip: report false
    79		MOVB    $1, ret+0(FP) // have vx
    80		RET
    81	novector:
    82		MOVB    $0, ret+0(FP) // no vx
    83		RET
    84	
    85	
    86	// The CRC-32 function(s) use these calling conventions:
    87	//
    88	// Parameters:
    89	//
    90	//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
    91	//      R3:    Input buffer pointer, performance might be improved if the
    92	//             buffer is on a doubleword boundary.
    93	//      R4:    Length of the buffer, must be 64 bytes or greater; only whole 16-byte chunks are consumed.
    94	//
    95	// Register usage:
    96	//
    97	//      R5:     CRC-32 constant pool base pointer.
    98	//      V0:     Initial CRC value and intermediate constants and results.
    99	//      V1..V4: Data for CRC computation.
   100	//      V5..V8: Next data chunks that are fetched from the input buffer.
   101	//
   102	//      V9..V14: CRC-32 constants.
   103	
   104	// func vectorizedIEEE(crc uint32, p []byte) uint32
   105	TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
   106		MOVD    $·crclecons(SB), R5  // R5 = base of the IEEE constant pool
   107		MOVD    p_len+16(FP), R4     // R4 = len(p)
   108		MOVD    p+8(FP), R3          // R3 = address of the first byte of p
   109		MOVWZ   crc+0(FP), R2        // R2 = crc, zero-extended from 32 bits
   110	
   111		BR      vectorizedBody<>(SB) // the shared body stores the result and returns
   112	
   113	// func vectorizedCastagnoli(crc uint32, p []byte) uint32
   114	TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
   115		// R5: base of the Castagnoli constant pool consumed by the shared body
   116		MOVD    $·crcclecons(SB), R5
   117	
   118		MOVD    p_len+16(FP), R4  // R4 = len(p)
   119		MOVD    p+8(FP), R3       // R3 = address of the first byte of p
   120		MOVWZ   crc+0(FP), R2     // R2 = crc, zero-extended from 32 bits
   121		BR      vectorizedBody<>(SB)
   122	
   123	TEXT vectorizedBody<>(SB),NOSPLIT,$0 // shared CRC core: R2 = crc, R3 = buffer, R4 = length, R5 = constant pool
   124		XOR     $0xffffffff, R2 // NOTW R2
   125		VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load the six 16-byte constants into V9..V14
   126	
   127		// Load the initial CRC value into the rightmost word of V0
   128		VZERO   V0
   129		VLVGF   $3, R2, V0
   130	
   131		// Crash if the input size is less than 64-bytes.
   132		CMP     R4, $64
   133		BLT     crash
   134	
   135		// Load a 64-byte data chunk and XOR with CRC
   136		VLM     0(R3), V1, V4    // 64-bytes into V1..V4
   137	
   138		// Byte-reverse each 16-byte chunk; the CRC is computed in the bit-reflected domain
   139		VPERM   V1, V1, CONST_PERM_LE2BE, V1
   140		VPERM   V2, V2, CONST_PERM_LE2BE, V2
   141		VPERM   V3, V3, CONST_PERM_LE2BE, V3
   142		VPERM   V4, V4, CONST_PERM_LE2BE, V4
   143	
   144		VX      V0, V1, V1     // V1 ^= CRC
   145		ADD     $64, R3        // BUF = BUF + 64
   146		ADD     $(-64), R4     // LEN = LEN - 64
   147	
   148		// Check remaining buffer size and jump to proper folding method
   149		CMP     R4, $64
   150		BLT     less_than_64bytes
   151	
   152	fold_64bytes_loop:
   153		// Load the next 64-byte data chunk into V5 to V8
   154		VLM     0(R3), V5, V8
   155		VPERM   V5, V5, CONST_PERM_LE2BE, V5
   156		VPERM   V6, V6, CONST_PERM_LE2BE, V6
   157		VPERM   V7, V7, CONST_PERM_LE2BE, V7
   158		VPERM   V8, V8, CONST_PERM_LE2BE, V8
   159	
   160	
   161		// Perform a GF(2) multiplication of the doublewords in V1 with
   162		// the reduction constants in CONST_R2R1.  The intermediate result is
   163		// then folded (accumulated) with the next data chunk in V5 and
   164		// stored in V1.  Repeat this step for the register contents
   165		// in V2, V3, and V4 respectively.
   166	
   167		VGFMAG  CONST_R2R1, V1, V5, V1
   168		VGFMAG  CONST_R2R1, V2, V6, V2
   169		VGFMAG  CONST_R2R1, V3, V7, V3
   170		VGFMAG  CONST_R2R1, V4, V8 ,V4
   171	
   172		// Adjust buffer pointer and length for next loop
   173		ADD     $64, R3                  // BUF = BUF + 64
   174		ADD     $(-64), R4               // LEN = LEN - 64
   175	
   176		CMP     R4, $64
   177		BGE     fold_64bytes_loop
   178	
   179	less_than_64bytes:
   180		// Fold V1 to V4 into a single 128-bit value in V1
   181		VGFMAG  CONST_R4R3, V1, V2, V1
   182		VGFMAG  CONST_R4R3, V1, V3, V1
   183		VGFMAG  CONST_R4R3, V1, V4, V1
   184	
   185		// Check whether to continue with 16-byte folding
   186		CMP R4, $16
   187		BLT final_fold
   188	
   189	fold_16bytes_loop:
   190		VL      0(R3), V2               // Load next data chunk
   191		VPERM   V2, V2, CONST_PERM_LE2BE, V2
   192	
   193		VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
   194	
   195		// Adjust buffer pointer and size for folding next data chunk
   196		ADD     $16, R3
   197		ADD     $-16, R4
   198	
   199		// Process remaining data chunks
   200		CMP     R4 ,$16
   201		BGE     fold_16bytes_loop
   202		// Fewer than 16 bytes remain here; nothing below reads from the buffer again
   203	final_fold:
   204		VLEIB   $7, $0x40, V9         // Shift count for VSRLB: 8 bytes (V9 no longer needed as permute mask)
   205		VSRLB   V9, CONST_R4R3, V0    // V0 = CONST_R4R3 shifted right: R4 moves to the rightmost doubleword
   206		VLEIG   $0, $1, V0            // Leftmost doubleword of V0 = 1
   207	
   208		VGFMG   V0, V1, V1            // V1 = (leftmost doubleword of V1 * 1) XOR (rightmost doubleword of V1 * R4)
   209	
   210		VLEIB   $7, $0x20, V9         // Shift by words
   211		VSRLB   V9, V1, V2            // Store remaining bits in V2
   212		VUPLLF  V1, V1                // Split rightmost doubleword
   213		VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
   214	
   215	
   216		// The input values to the Barrett reduction are the degree-63 polynomial
   217		// in V1 (R(x)), degree-32 generator polynomial, and the reduction
   218		// constant u.  The Barrett reduction result is the CRC value of R(x) mod
   219		// P(x).
   220		//
   221		// The Barrett reduction algorithm is defined as:
   222		//
   223		//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
   224		//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
   225		//    3. C(x)  = R(x) XOR T2(x) mod x^32
   226		//
   227		// Note: VUPLLF zero-extends the two rightmost words of its source into
   228		// doublewords.  The leftmost doubleword of CONST_RU_POLY and of
   229		// CONST_CRC_POLY is zero, so the product with the leftmost doubleword
   230		// does not contribute to the intermediate results.
   231	
   232	
   233		// T1(x) = floor( R(x) / x^32 ) GF2MUL u
   234		VUPLLF  V1, V2
   235		VGFMG   CONST_RU_POLY, V2, V2
   236	
   237	
   238		// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
   239		// T1(x) in V2 and XOR the intermediate result, T2(x), with the value in V1.
   240		// The final result is read from word element 2 of V2 (see done below).
   241	
   242		VUPLLF  V2 , V2
   243		VGFMAG  CONST_CRC_POLY, V2, V1, V2
   244	
   245	done:
   246		VLGVF   $2, V2, R2      // R2 = word element 2 of V2
   247		XOR     $0xffffffff, R2 // NOTW R2
   248		MOVWZ   R2, ret + 32(FP) // result slot of the Go function that branched here
   249		RET
   250	
   251	crash:
   252		MOVD    $0, (R0) // input size is less than 64-bytes; store through R0 to fault (assumes R0 == 0)
View as plain text