...

Text file src/hash/crc32/crc32_amd64.s

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
     8	//
     9	// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	// Register roles: AX = CRC state, SI = data cursor,
	// CX = bytes remaining, BX = alignment scratch.
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX         // BX = SI mod 8 (misalignment)
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	// For BX in 1..7, (BX-1) XOR 7 == 8-BX, computed without a SUB from 8.
	SUBQ $1, BX
	XORQ $7, BX

	// Consume 1, 2 and/or 4 bytes as dictated by the bits of BX,
	// keeping SI and CX in sync at every step.
	BTQ $0, BX          // bit 0 set => consume 1 byte
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX          // bit 1 set => consume 2 bytes
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX          // bit 2 set => consume 4 bytes
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// CX < 8 here, so its low three bits fully describe the tail.
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	// ret is at +32: 4-byte crc (padded to 8) + 24-byte slice header.
	MOVL AX, ret+32(FP)
	RET
    88	
    89	// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
    90	// bytes from each buffer.
    91	//
    92	// func castagnoliSSE42Triple(
    93	//     crc1, crc2, crc3 uint32,
    94	//     a, b, c []byte,
    95	//     rounds uint32,
    96	// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	// Register roles: AX/CX/DX = the three CRC states,
	// R8/R9/R10 = the three data cursors, R11 = rounds counter.
	// The three CRC32Q streams are interleaved so their dependency
	// chains are independent and can overlap in the pipeline.
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	// Each slice header is 24 bytes, hence the +24 stride in offsets.
	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	// One round consumes 24 bytes (3 x 8) from each buffer.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
   132	
   133	// CRC32 polynomial data
   134	//
   135	// These constants are lifted from the
   136	// Linux kernel, since they avoid the costly
   137	// PSHUFB 16 byte reversal proposed in the
   138	// original Intel paper.
// r2r1: pair of folding multipliers used by the 64-bytes-at-a-time
// main loop of ieeeCLMUL (low/high qword selected via the PCLMULQDQ
// immediate $0 / $0x11).
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
// r4r3: pair of folding multipliers for the 16-byte folds that
// collapse X2..X4 into X1 and consume the remaining 16-byte chunks.
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
// rupoly: Barrett-reduction constants for the final 64->32 bit
// reduction (presumably u' and the IEEE polynomial P(x)' in the
// bit-reflected domain of the Intel paper — confirm against the paper).
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
// r5: multiplier for the 64->32 bit fold preceding Barrett reduction.
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
   151	
   152	// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
   153	// len(p) must be at least 64, and must be a multiple of 16.
   154	
   155	// func ieeeCLMUL(crc uint32, p []byte) uint32
   156	TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
   157		MOVL   crc+0(FP), X0             // Initial CRC value
   158		MOVQ   p+8(FP), SI  	         // data pointer
   159		MOVQ   p_len+16(FP), CX          // len(p)
   160	
   161		MOVOU  (SI), X1
   162		MOVOU  16(SI), X2
   163		MOVOU  32(SI), X3
   164		MOVOU  48(SI), X4
   165		PXOR   X0, X1
   166		ADDQ   $64, SI                  // buf+=64
   167		SUBQ   $64, CX                  // len-=64
   168		CMPQ   CX, $64                  // Less than 64 bytes left
   169		JB     remain64
   170	
   171		MOVOA  r2r1<>+0(SB), X0
   172	loopback64:
   173		MOVOA  X1, X5
   174		MOVOA  X2, X6
   175		MOVOA  X3, X7
   176		MOVOA  X4, X8
   177	
   178		PCLMULQDQ $0, X0, X1
   179		PCLMULQDQ $0, X0, X2
   180		PCLMULQDQ $0, X0, X3
   181		PCLMULQDQ $0, X0, X4
   182	
   183		/* Load next early */
   184		MOVOU    (SI), X11
   185		MOVOU    16(SI), X12
   186		MOVOU    32(SI), X13
   187		MOVOU    48(SI), X14
   188	
   189		PCLMULQDQ $0x11, X0, X5
   190		PCLMULQDQ $0x11, X0, X6
   191		PCLMULQDQ $0x11, X0, X7
   192		PCLMULQDQ $0x11, X0, X8
   193	
   194		PXOR     X5, X1
   195		PXOR     X6, X2
   196		PXOR     X7, X3
   197		PXOR     X8, X4
   198	
   199		PXOR     X11, X1
   200		PXOR     X12, X2
   201		PXOR     X13, X3
   202		PXOR     X14, X4
   203	
   204		ADDQ    $0x40, DI
   205		ADDQ    $64, SI      // buf+=64
   206		SUBQ    $64, CX      // len-=64
   207		CMPQ    CX, $64      // Less than 64 bytes left?
   208		JGE     loopback64
   209	
   210		/* Fold result into a single register (X1) */
   211	remain64:
   212		MOVOA       r4r3<>+0(SB), X0
   213	
   214		MOVOA       X1, X5
   215		PCLMULQDQ   $0, X0, X1
   216		PCLMULQDQ   $0x11, X0, X5
   217		PXOR        X5, X1
   218		PXOR        X2, X1
   219	
   220		MOVOA       X1, X5
   221		PCLMULQDQ   $0, X0, X1
   222		PCLMULQDQ   $0x11, X0, X5
   223		PXOR        X5, X1
   224		PXOR        X3, X1
   225	
   226		MOVOA       X1, X5
   227		PCLMULQDQ   $0, X0, X1
   228		PCLMULQDQ   $0x11, X0, X5
   229		PXOR        X5, X1
   230		PXOR        X4, X1
   231	
   232		/* If there is less than 16 bytes left we are done */
   233		CMPQ        CX, $16
   234		JB          finish
   235	
   236		/* Encode 16 bytes */
   237	remain16:
   238		MOVOU       (SI), X10
   239		MOVOA       X1, X5
   240		PCLMULQDQ   $0, X0, X1
   241		PCLMULQDQ   $0x11, X0, X5
   242		PXOR        X5, X1
   243		PXOR        X10, X1
   244		SUBQ        $16, CX
   245		ADDQ        $16, SI
   246		CMPQ        CX, $16
   247		JGE         remain16
   248	
   249	finish:
   250		/* Fold final result into 32 bits and return it */
   251		PCMPEQB     X3, X3
   252		PCLMULQDQ   $1, X1, X0
   253		PSRLDQ      $8, X1
   254		PXOR        X0, X1
   255	
   256		MOVOA       X1, X2
   257		MOVQ        r5<>+0(SB), X0
   258	
   259		/* Creates 32 bit mask. Note that we don't care about upper half. */
   260		PSRLQ       $32, X3
   261	
   262		PSRLDQ      $4, X2
   263		PAND        X3, X1
   264		PCLMULQDQ   $0, X0, X1
   265		PXOR        X2, X1
   266	
   267		MOVOA       rupoly<>+0(SB), X0
   268	
   269		MOVOA       X1, X2
   270		PAND        X3, X1
   271		PCLMULQDQ   $0x10, X0, X1
   272		PAND        X3, X1
   273		PCLMULQDQ   $0, X0, X1
   274		PXOR        X2, X1
   275	
   276		PEXTRD	$1, X1, AX
   277		MOVL        AX, ret+32(FP)
   278	
   279		RET

View as plain text