...

Text file src/internal/bytealg/count_amd64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "textflag.h"
     7	
     8	TEXT ·Count(SB),NOSPLIT,$0-40
     9		CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
    10		JEQ	2(PC)
    11		JMP	·countGeneric(SB)
    12		MOVQ	b_base+0(FP), SI
    13		MOVQ	b_len+8(FP), BX
    14		MOVB	c+24(FP), AL
    15		LEAQ	ret+32(FP), R8
    16		JMP	countbody<>(SB)
    17	
    18	TEXT ·CountString(SB),NOSPLIT,$0-32
    19		CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
    20		JEQ	2(PC)
    21		JMP	·countGenericString(SB)
    22		MOVQ	s_base+0(FP), SI
    23		MOVQ	s_len+8(FP), BX
    24		MOVB	c+16(FP), AL
    25		LEAQ	ret+24(FP), R8
    26		JMP	countbody<>(SB)
    27	
    28	// input:
    29	//   SI: data
    30	//   BX: data len
    31	//   AL: byte sought
    32	//   R8: address to put result
    33	// This function requires the POPCNT instruction.
    34	TEXT countbody<>(SB),NOSPLIT,$0
    35		// Shuffle X0 around so that each byte contains
    36		// the character we're looking for.
    37		MOVD AX, X0
    38		PUNPCKLBW X0, X0
    39		PUNPCKLBW X0, X0
    40		PSHUFL $0, X0, X0
    41	
    42		CMPQ BX, $16
    43		JLT small
    44	
    45		MOVQ $0, R12 // Accumulator
    46	
    47		MOVQ SI, DI
    48	
    49		CMPQ BX, $32
    50		JA avx2
    51	sse:
    52		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
    53		JMP	sseloopentry
    54	
    55	sseloop:
    56		// Move the next 16-byte chunk of the data into X1.
    57		MOVOU	(DI), X1
    58		// Compare bytes in X0 to X1.
    59		PCMPEQB	X0, X1
    60		// Take the top bit of each byte in X1 and put the result in DX.
    61		PMOVMSKB X1, DX
    62		// Count number of matching bytes
    63		POPCNTL DX, DX
    64		// Accumulate into R12
    65		ADDQ DX, R12
    66		// Advance to next block.
    67		ADDQ	$16, DI
    68	sseloopentry:
    69		CMPQ	DI, AX
    70		JBE	sseloop
    71	
    72		// Get the number of bytes to consider in the last 16 bytes
    73		ANDQ $15, BX
    74		JZ end
    75	
    76		// Create mask to ignore overlap between previous 16 byte block
    77		// and the next.
    78		MOVQ $16,CX
    79		SUBQ BX, CX
    80		MOVQ $0xFFFF, R10
    81		SARQ CL, R10
    82		SALQ CL, R10
    83	
    84		// Process the last 16-byte chunk. This chunk may overlap with the
    85		// chunks we've already searched so we need to mask part of it.
    86		MOVOU	(AX), X1
    87		PCMPEQB	X0, X1
    88		PMOVMSKB X1, DX
    89		// Apply mask
    90		ANDQ R10, DX
    91		POPCNTL DX, DX
    92		ADDQ DX, R12
    93	end:
    94		MOVQ R12, (R8)
    95		RET
    96	
    97	// handle for lengths < 16
    98	small:
    99		TESTQ	BX, BX
   100		JEQ	endzero
   101	
   102		// Check if we'll load across a page boundary.
   103		LEAQ	16(SI), AX
   104		TESTW	$0xff0, AX
   105		JEQ	endofpage
   106	
   107		// We must ignore high bytes as they aren't part of our slice.
   108		// Create mask.
   109		MOVB BX, CX
   110		MOVQ $1, R10
   111		SALQ CL, R10
   112		SUBQ $1, R10
   113	
   114		// Load data
   115		MOVOU	(SI), X1
   116		// Compare target byte with each byte in data.
   117		PCMPEQB	X0, X1
   118		// Move result bits to integer register.
   119		PMOVMSKB X1, DX
   120		// Apply mask
   121		ANDQ R10, DX
   122		POPCNTL DX, DX
   123		// Directly return DX, we don't need to accumulate
   124		// since we have <16 bytes.
   125		MOVQ	DX, (R8)
   126		RET
   127	endzero:
   128		MOVQ $0, (R8)
   129		RET
   130	
   131	endofpage:
   132		// We must ignore low bytes as they aren't part of our slice.
   133		MOVQ $16,CX
   134		SUBQ BX, CX
   135		MOVQ $0xFFFF, R10
   136		SARQ CL, R10
   137		SALQ CL, R10
   138	
   139		// Load data into the high end of X1.
   140		MOVOU	-16(SI)(BX*1), X1
   141		// Compare target byte with each byte in data.
   142		PCMPEQB	X0, X1
   143		// Move result bits to integer register.
   144		PMOVMSKB X1, DX
   145		// Apply mask
   146		ANDQ R10, DX
   147		// Directly return DX, we don't need to accumulate
   148		// since we have <16 bytes.
   149		POPCNTL DX, DX
   150		MOVQ	DX, (R8)
   151		RET
   152	
   153	avx2:
   154		CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   155		JNE sse
   156		MOVD AX, X0
   157		LEAQ -32(SI)(BX*1), R11
   158		VPBROADCASTB  X0, Y1
   159	avx2_loop:
   160		VMOVDQU (DI), Y2
   161		VPCMPEQB Y1, Y2, Y3
   162		VPMOVMSKB Y3, DX
   163		POPCNTL DX, DX
   164		ADDQ DX, R12
   165		ADDQ $32, DI
   166		CMPQ DI, R11
   167		JLE avx2_loop
   168	
   169		// If last block is already processed,
   170		// skip to the end.
   171		CMPQ DI, R11
   172		JEQ endavx
   173	
   174		// Load address of the last 32 bytes.
   175		// There is an overlap with the previous block.
   176		MOVQ R11, DI
   177		VMOVDQU (DI), Y2
   178		VPCMPEQB Y1, Y2, Y3
   179		VPMOVMSKB Y3, DX
   180		// Exit AVX mode.
   181		VZEROUPPER
   182	
   183		// Create mask to ignore overlap between previous 32 byte block
   184		// and the next.
   185		ANDQ $31, BX
   186		MOVQ $32,CX
   187		SUBQ BX, CX
   188		MOVQ $0xFFFFFFFF, R10
   189		SARQ CL, R10
   190		SALQ CL, R10
   191		// Apply mask
   192		ANDQ R10, DX
   193		POPCNTL DX, DX
   194		ADDQ DX, R12
   195		MOVQ R12, (R8)
   196		RET
   197	endavx:
   198		// Exit AVX mode.
   199		VZEROUPPER
   200		MOVQ R12, (R8)
   201		RET

View as plain text