...

Text file src/pkg/internal/bytealg/index_amd64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "textflag.h"
     7	
     8	TEXT ·Index(SB),NOSPLIT,$0-56 // entry whose arguments a and b each take 24 bytes (base, len, a third word not read here); result slot at ret+48(FP)
     9		MOVQ a_base+0(FP), DI // DI = start of the text being searched
    10		MOVQ a_len+8(FP), DX // DX = length of the text
    11		MOVQ b_base+24(FP), R8 // R8 = start of the pattern; R8 rather than BP, so the caller's frame pointer is left intact
    12		MOVQ b_len+32(FP), AX // AX = length of the pattern
    13		MOVQ DI, R10 // R10 = start of the text, subtracted from the match address at success
    14		LEAQ ret+48(FP), R11 // R11 = address of the result slot
    15		JMP  indexbody<>(SB) // tail jump: indexbody stores the result through R11 and executes RET
    16	
    17	TEXT ·IndexString(SB),NOSPLIT,$0-40 // same as Index, but a and b each take 16 bytes (base, len); result slot at ret+32(FP)
    18		MOVQ a_base+0(FP), DI // DI = start of the text being searched
    19		MOVQ a_len+8(FP), DX // DX = length of the text
    20		MOVQ b_base+16(FP), R8 // R8 = start of the pattern; R8 rather than BP, so the caller's frame pointer is left intact
    21		MOVQ b_len+24(FP), AX // AX = length of the pattern
    22		MOVQ DI, R10 // R10 = start of the text, subtracted from the match address at success
    23		LEAQ ret+32(FP), R11 // R11 = address of the result slot
    24		JMP  indexbody<>(SB) // tail jump: indexbody stores the result through R11 and executes RET
    25	
    26	// AX: length of the string that we are searching for
    27	// DX: length of the string in which we are searching
    28	// DI: pointer to the string in which we are searching (R10 must hold a copy of this start address)
    29	// R8: pointer to the string that we are searching for (deliberately not BP, which is the frame pointer)
    30	// R11: address where the return value is stored (match offset, or -1 when there is no match)
    31	// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
    32	TEXT indexbody<>(SB),NOSPLIT,$0
    33		CMPQ AX, DX
    34		JA fail // pattern longer than text: report -1
    35		CMPQ DX, $16
    36		JAE sse42 // text has at least 16 bytes: consider the PCMPESTRI path first
    37	no_sse42:
    38		CMPQ AX, $2
    39		JA   _3_or_more
    40		MOVW (R8), R8 // R8 = first two pattern bytes (this path is taken for every AX <= 2)
    41		LEAQ -1(DI)(DX*1), DX // DX = end of text - 1, one past the last start a 2-byte window can have
    42	loop2:
    43		MOVW (DI), SI
    44		CMPW SI,R8
    45		JZ success
    46		ADDQ $1,DI
    47		CMPQ DI,DX
    48		JB loop2
    49		JMP fail
    50	_3_or_more:
    51		CMPQ AX, $3
    52		JA   _4_or_more
    53		MOVW 1(R8), BX // BX = pattern bytes 1-2; must be read before R8 is overwritten below
    54		MOVW (R8), R8 // R8 = pattern bytes 0-1
    55		LEAQ -2(DI)(DX*1), DX // DX = one past the last start a 3-byte window can have
    56	loop3:
    57		MOVW (DI), SI
    58		CMPW SI,R8
    59		JZ   partial_success3
    60		ADDQ $1,DI
    61		CMPQ DI,DX
    62		JB loop3
    63		JMP fail
    64	partial_success3: // bytes 0-1 matched; now compare bytes 1-2
    65		MOVW 1(DI), SI
    66		CMPW SI,BX
    67		JZ success
    68		ADDQ $1,DI
    69		CMPQ DI,DX
    70		JB loop3
    71		JMP fail
    72	_4_or_more:
    73		CMPQ AX, $4
    74		JA   _5_or_more
    75		MOVL (R8), R8 // R8 = the four pattern bytes
    76		LEAQ -3(DI)(DX*1), DX // DX = one past the last start a 4-byte window can have
    77	loop4:
    78		MOVL (DI), SI
    79		CMPL SI,R8
    80		JZ   success
    81		ADDQ $1,DI
    82		CMPQ DI,DX
    83		JB loop4
    84		JMP fail
    85	_5_or_more:
    86		CMPQ AX, $7
    87		JA   _8_or_more
    88		LEAQ 1(DI)(DX*1), DX
    89		SUBQ AX, DX // DX = end of text - AX + 1, one past the last possible start
    90		MOVL -4(R8)(AX*1), BX // BX = last four pattern bytes (overlaps the first four, since AX < 8 here)
    91		MOVL (R8), R8 // R8 = first four pattern bytes
    92	loop5to7:
    93		MOVL (DI), SI
    94		CMPL SI,R8
    95		JZ   partial_success5to7
    96		ADDQ $1,DI
    97		CMPQ DI,DX
    98		JB loop5to7
    99		JMP fail
   100	partial_success5to7: // first four bytes matched; now compare the last four
   101		MOVL -4(AX)(DI*1), SI // last four bytes of the AX-byte window starting at DI
   102		CMPL SI,BX
   103		JZ success
   104		ADDQ $1,DI
   105		CMPQ DI,DX
   106		JB loop5to7
   107		JMP fail
   108	_8_or_more:
   109		CMPQ AX, $8
   110		JA   _9_or_more
   111		MOVQ (R8), R8 // R8 = the eight pattern bytes
   112		LEAQ -7(DI)(DX*1), DX // DX = one past the last start an 8-byte window can have
   113	loop8:
   114		MOVQ (DI), SI
   115		CMPQ SI,R8
   116		JZ   success
   117		ADDQ $1,DI
   118		CMPQ DI,DX
   119		JB loop8
   120		JMP fail
   121	_9_or_more:
   122		CMPQ AX, $15
   123		JA   _16_or_more
   124		LEAQ 1(DI)(DX*1), DX
   125		SUBQ AX, DX // DX = end of text - AX + 1, one past the last possible start
   126		MOVQ -8(R8)(AX*1), BX // BX = last eight pattern bytes (overlaps the first eight, since AX < 16 here)
   127		MOVQ (R8), R8 // R8 = first eight pattern bytes
   128	loop9to15:
   129		MOVQ (DI), SI
   130		CMPQ SI,R8
   131		JZ   partial_success9to15
   132		ADDQ $1,DI
   133		CMPQ DI,DX
   134		JB loop9to15
   135		JMP fail
   136	partial_success9to15: // first eight bytes matched; now compare the last eight
   137		MOVQ -8(AX)(DI*1), SI // last eight bytes of the AX-byte window starting at DI
   138		CMPQ SI,BX
   139		JZ success
   140		ADDQ $1,DI
   141		CMPQ DI,DX
   142		JB loop9to15
   143		JMP fail
   144	_16_or_more:
   145		CMPQ AX, $16
   146		JA   _17_or_more
   147		MOVOU (R8), X1 // X1 = the sixteen pattern bytes
   148		LEAQ -15(DI)(DX*1), DX // DX = one past the last start a 16-byte window can have
   149	loop16:
   150		MOVOU (DI), X2
   151		PCMPEQB X1, X2 // per-byte equality mask left in X2
   152		PMOVMSKB X2, SI // SI = one bit per byte of X2
   153		CMPQ  SI, $0xffff // all sixteen bits set means all sixteen bytes are equal
   154		JE   success
   155		ADDQ $1,DI
   156		CMPQ DI,DX
   157		JB loop16
   158		JMP fail
   159	_17_or_more:
   160		CMPQ AX, $31
   161		JA   _32_or_more
   162		LEAQ 1(DI)(DX*1), DX
   163		SUBQ AX, DX // DX = end of text - AX + 1, one past the last possible start
   164		MOVOU -16(R8)(AX*1), X0 // X0 = last sixteen pattern bytes (overlaps X1, since AX < 32 here)
   165		MOVOU (R8), X1 // X1 = first sixteen pattern bytes
   166	loop17to31:
   167		MOVOU (DI), X2
   168		PCMPEQB X1,X2
   169		PMOVMSKB X2, SI
   170		CMPQ  SI, $0xffff
   171		JE   partial_success17to31
   172		ADDQ $1,DI
   173		CMPQ DI,DX
   174		JB loop17to31
   175		JMP fail
   176	partial_success17to31: // first sixteen bytes matched; now compare the last sixteen
   177		MOVOU -16(AX)(DI*1), X3 // last sixteen bytes of the AX-byte window starting at DI
   178		PCMPEQB X0, X3
   179		PMOVMSKB X3, SI
   180		CMPQ  SI, $0xffff
   181		JE success
   182		ADDQ $1,DI
   183		CMPQ DI,DX
   184		JB loop17to31
   185		JMP fail
   186	// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
   187	// So no need to check cpuid
   188	_32_or_more:
   189		CMPQ AX, $32
   190		JA   _33_to_63
   191		VMOVDQU (R8), Y1 // Y1 = the thirty-two pattern bytes
   192		LEAQ -31(DI)(DX*1), DX // DX = one past the last start a 32-byte window can have
   193	loop32:
   194		VMOVDQU (DI), Y2
   195		VPCMPEQB Y1, Y2, Y3
   196		VPMOVMSKB Y3, SI
   197		CMPL  SI, $0xffffffff // all thirty-two bits set means all thirty-two bytes are equal
   198		JE   success_avx2
   199		ADDQ $1,DI
   200		CMPQ DI,DX
   201		JB loop32
   202		JMP fail_avx2
   203	_33_to_63:
   204		LEAQ 1(DI)(DX*1), DX
   205		SUBQ AX, DX // DX = end of text - AX + 1, one past the last possible start
   206		VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last thirty-two pattern bytes (overlaps Y1 when AX < 64)
   207		VMOVDQU (R8), Y1 // Y1 = first thirty-two pattern bytes
   208	loop33to63:
   209		VMOVDQU (DI), Y2
   210		VPCMPEQB Y1, Y2, Y3
   211		VPMOVMSKB Y3, SI
   212		CMPL  SI, $0xffffffff
   213		JE   partial_success33to63
   214		ADDQ $1,DI
   215		CMPQ DI,DX
   216		JB loop33to63
   217		JMP fail_avx2
   218	partial_success33to63: // first thirty-two bytes matched; now compare the last thirty-two
   219		VMOVDQU -32(AX)(DI*1), Y3 // last thirty-two bytes of the AX-byte window starting at DI
   220		VPCMPEQB Y0, Y3, Y4
   221		VPMOVMSKB Y4, SI
   222		CMPL  SI, $0xffffffff
   223		JE success_avx2
   224		ADDQ $1,DI
   225		CMPQ DI,DX
   226		JB loop33to63
   227	fail_avx2: // also reached by falling through when the loop above runs out of candidates
   228		VZEROUPPER // clear the upper YMM halves before leaving the AVX2 code
   229	fail:
   230		MOVQ $-1, (R11) // no match: store -1 in the result slot
   231		RET
   232	success_avx2:
   233		VZEROUPPER // clear the upper YMM halves before leaving the AVX2 code
   234		JMP success
   235	sse42:
   236		CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
   237		JNE no_sse42 // SSE4.2 flag not set: use the plain compare loops
   238		CMPQ AX, $12
   239		// PCMPESTRI is slower than normal compare,
   240		// so using it makes sense only if we advance 4+ bytes per compare
   241		// This value was determined experimentally and is the ~same
   242		// on Nehalem (first with SSE42) and Haswell.
   243		JAE _9_or_more
   244		LEAQ 16(R8), SI
   245		TESTW $0xff0, SI // bits 4-11 of R8+16 all zero: the 16-byte pattern load below could cross a 4K boundary
   246		JEQ no_sse42 // in that case stay on the loops that read only AX pattern bytes
   247		MOVOU (R8), X1 // X1 = pattern plus trailing bytes; PCMPESTRI only considers the first AX of them
   248		LEAQ -15(DI)(DX*1), SI // SI = end of text - 15, one past the last address where a 16-byte load fits
   249		MOVQ $16, R9
   250		SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
   251	loop_sse42:
   252		// 0x0c means: unsigned byte compare (bits 0,1 are 00)
   253		// for equality (bits 2,3 are 11)
   254		// result is not masked or inverted (bits 4,5 are 00)
   255		// and corresponds to first matching byte (bit 6 is 0)
   256		PCMPESTRI $0x0c, (DI), X1
   257		// CX == 16 means no match,
   258		// CX > R9 means partial match at the end of the string,
   259		// otherwise sep is at offset CX from X1 start
   260		CMPQ CX, R9
   261		JBE sse42_success
   262		ADDQ R9, DI
   263		CMPQ DI, SI
   264		JB loop_sse42
   265		PCMPESTRI $0x0c, -1(SI), X1 // final probe over the last 16 bytes of the text
   266		CMPQ CX, R9
   267		JA fail
   268		LEAQ -1(SI), DI // DI = start of that last 16-byte window
   269	sse42_success:
   270		ADDQ CX, DI // DI = address of the match inside the probed window
   271	success:
   272		SUBQ R10, DI // turn the match address into an offset from the start of the text
   273		MOVQ DI, (R11) // store the offset in the result slot
   274		RET

View as plain text