...

Text file src/internal/bytealg/indexbyte_amd64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "textflag.h"
     7	
     8	TEXT	·IndexByte(SB), NOSPLIT, $0-40	// Byte-slice entry point: no local frame, 40 bytes of arguments + result. Presumably func IndexByte(b []byte, c byte) int, declared in Go elsewhere - confirm.
     9		MOVQ b_base+0(FP), SI	// SI = address of the first byte of b
    10		MOVQ b_len+8(FP), BX	// BX = length of b
    11		MOVB c+24(FP), AL	// AL = byte sought (only the low byte of AX is written)
    12		LEAQ ret+32(FP), R8	// R8 = address of the result slot; the shared body stores the answer through it
    13		JMP  indexbytebody<>(SB)	// Tail jump: the body's RET returns straight to our caller
    14	
    15	TEXT	·IndexByteString(SB), NOSPLIT, $0-32	// String entry point: no local frame, 32 bytes of arguments + result. Presumably func IndexByteString(s string, c byte) int, declared in Go elsewhere - confirm.
    16		MOVQ s_base+0(FP), SI	// SI = address of the first byte of s
    17		MOVQ s_len+8(FP), BX	// BX = length of s
    18		MOVB c+16(FP), AL	// AL = byte sought (only the low byte of AX is written)
    19		LEAQ ret+24(FP), R8	// R8 = address of the result slot; the shared body stores the answer through it
    20		JMP  indexbytebody<>(SB)	// Tail jump: the body's RET returns straight to our caller
    21	
    22	// indexbytebody is the shared search routine that IndexByte and IndexByteString jump to. Input:
    23	//   SI: address of the first byte of the data
    24	//   BX: data length in bytes
    25	//   AL: byte sought
    26	//   R8: address to put result (index of the first matching byte, or -1 if there is none)
    27	TEXT	indexbytebody<>(SB), NOSPLIT, $0
    28		// Shuffle X0 around so that each byte contains
    29		// the character we're looking for.
    30		MOVD AX, X0	// Low byte of X0 = AL.
    31		PUNPCKLBW X0, X0	// Interleave X0 with itself: bytes 0-1 now hold the byte sought.
    32		PUNPCKLBW X0, X0	// Again: bytes 0-3 now hold the byte sought.
    33		PSHUFL $0, X0, X0	// Copy dword 0 into all four dwords: all 16 bytes hold the byte sought.
    34	
    35		CMPQ BX, $16
    36		JLT small	// Fewer than 16 bytes: a full 16-byte load would read past the data, use the guarded path.
    37	
    38		MOVQ SI, DI	// DI = cursor over the data; SI keeps the start for the final index computation.
    39	
    40		CMPQ BX, $32
    41		JA avx2	// More than 32 bytes: try the 32-bytes-per-step AVX2 loop.
    42	sse:	// Reached with 16 <= len <= 32, or with any len > 32 when AVX2 is not available.
    43		LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
    44		JMP	sseloopentry	// Test the loop condition before the first load.
    45	
    46	sseloop:
    47		// Move the next 16-byte chunk of the data into X1.
    48		MOVOU	(DI), X1
    49		// Compare bytes in X0 to X1.
    50		PCMPEQB	X0, X1
    51		// Take the top bit of each byte in X1 and put the result in DX.
    52		PMOVMSKB X1, DX
    53		// Find first set bit, if any.
    54		BSFL	DX, DX
    55		JNZ	ssesuccess	// BSF leaves ZF clear when its source was nonzero, i.e. some byte matched.
    56		// Advance to next block.
    57		ADDQ	$16, DI
    58	sseloopentry:
    59		CMPQ	DI, AX
    60		JB	sseloop	// Unsigned: keep looping while the cursor is below the last 16-byte chunk.
    61	
    62		// Search the last 16-byte chunk. This chunk may overlap with the
    63		// chunks we've already searched, but that's ok.
    64		MOVQ	AX, DI	// ssesuccess expects DI to be the address the chunk was loaded from.
    65		MOVOU	(AX), X1
    66		PCMPEQB	X0, X1
    67		PMOVMSKB X1, DX
    68		BSFL	DX, DX
    69		JNZ	ssesuccess
    70	
    71	failure:	// Shared not-found exit for the SSE and small paths.
    72		MOVQ $-1, (R8)	// Result = -1.
    73		RET
    74	
    75	// We've found a chunk containing the byte.
    76	// The chunk was loaded from DI.
    77	// The index of the matching byte in the chunk is DX.
    78	// The start of the data is SI.
    79	ssesuccess:
    80		SUBQ SI, DI	// Compute offset of chunk within data.
    81		ADDQ DX, DI	// Add offset of byte within chunk.
    82		MOVQ DI, (R8)	// Store the index of the first match.
    83		RET
    84	
    85	// handle for lengths < 16
    86	small:
    87		TESTQ	BX, BX
    88		JEQ	failure	// Empty input: nothing can match.
    89	
    90		// Check if we'll load across a page boundary.
    91		LEAQ	16(SI), AX
    92		TESTW	$0xff0, AX	// Bits 4-11 of SI+16 are all zero only when SI lies in the last 16 bytes of a 4096-byte-aligned region.
    93		JEQ	endofpage	// In that case load the 16 bytes that END at the data's end instead.
    94	
    95		MOVOU	(SI), X1 // Load 16 bytes starting at the data; bytes at index >= BX are not part of the data.
    96		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
    97		PMOVMSKB X1, DX	// Move result bits to integer register.
    98		BSFL	DX, DX	// Find first set bit.
    99		JZ	failure	// No set bit, failure.
   100		CMPL	DX, BX
   101		JAE	failure	// Match is past end of data.
   102		MOVQ	DX, (R8)	// Chunk starts at SI, so the bit index is the result.
   103		RET
   104	
   105	endofpage:
   106		MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
   107		PCMPEQB	X0, X1	// Compare target byte with each byte in data.
   108		PMOVMSKB X1, DX	// Move result bits to integer register.
   109		MOVL	BX, CX	// Variable shift count goes in CX.
   110		SHLL	CX, DX	// Shift left by len: the data's mask bits move from [16-len, 15] to [16, 15+len].
   111		SHRL	$16, DX	// Shift desired bits down to bottom of register.
   112		BSFL	DX, DX	// Find first set bit.
   113		JZ	failure	// No set bit, failure.
   114		MOVQ	DX, (R8)	// Bits for bytes before the data were shifted out, so the bit index is the result.
   115		RET
   116	
   117	avx2:	// Reached only with len > 32 and DI == SI.
   118		CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
   119		JNE sse	// HasAVX2 flag byte is not 1: fall back to the 16-byte SSE loop.
   120		MOVD AX, X0	// Reload X0 from AX; AL still holds the byte sought on this path.
   121		LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
   122		VPBROADCASTB  X0, Y1	// Y1 = byte sought replicated into all 32 bytes.
   123	avx2_loop:
   124		VMOVDQU (DI), Y2	// Load the next 32-byte chunk.
   125		VPCMPEQB Y1, Y2, Y3	// Y3 = per-byte equality mask for this chunk.
   126		VPTEST Y3, Y3	// ZF is set only when Y3 is all zero.
   127		JNZ avx2success	// Some byte in this chunk matched.
   128		ADDQ $32, DI
   129		CMPQ DI, R11
   130		JLT avx2_loop	// NOTE(review): signed compare on addresses, whereas the SSE loop uses unsigned JB; the two agree only while addresses have the top bit clear - confirm.
   131		MOVQ R11, DI	// Search the last 32-byte chunk; it may overlap chunks already searched.
   132		VMOVDQU (DI), Y2
   133		VPCMPEQB Y1, Y2, Y3
   134		VPTEST Y3, Y3
   135		JNZ avx2success
   136		VZEROUPPER	// Zero the upper halves of the YMM registers before returning.
   137		MOVQ $-1, (R8)	// Not found: result = -1.
   138		RET
   139	
   140	avx2success:	// Y3 holds the equality mask of the chunk that was loaded from DI.
   141		VPMOVMSKB Y3, DX	// DX = one mask bit per byte of the 32-byte chunk.
   142		BSFL DX, DX	// DX = index of the first matching byte within the chunk.
   143		SUBQ SI, DI	// DI = offset of the chunk within the data.
   144		ADDQ DI, DX	// DX = index of the first match within the data.
   145		MOVQ DX, (R8)
   146		VZEROUPPER	// Zero the upper halves of the YMM registers before returning.
   147		RET

View as plain text