...

Text file src/pkg/internal/bytealg/equal_amd64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "textflag.h"
     7	
     8	// memequal(a, b unsafe.Pointer, size uintptr) bool: reports whether the size bytes at a match the size bytes at b.
     9	TEXT runtime·memequal(SB),NOSPLIT,$0-25
    10		MOVQ	b+8(FP), DI
    11		MOVQ	a+0(FP), SI
    12		CMPQ	DI, SI	// same address: equal without reading any memory
    13		JEQ	same_ptr
    14		LEAQ	ret+24(FP), AX	// AX = address of the result byte, written by memeqbody<>
    15		MOVQ	size+16(FP), BX	// BX = number of bytes to compare
    16		JMP	memeqbody<>(SB)	// tail jump: memeqbody<> stores the result and returns to our caller
    17	same_ptr:
    18		MOVB	$1, ret+24(FP)
    19		RET
    20	
    21	// memequal_varlen(a, b unsafe.Pointer) bool: like memequal, but the byte count is read from 8(DX) instead of an argument.
    22	TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
    23		MOVQ	b+8(FP), DI
    24		MOVQ	a+0(FP), SI
    25		CMPQ	DI, SI	// same address: equal without reading any memory
    26		JEQ	same_ptr
    27		LEAQ	ret+16(FP), AX	// AX = address of the result byte, written by memeqbody<>
    28		MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
    29		JMP	memeqbody<>(SB)	// tail jump: memeqbody<> stores the result and returns to our caller
    30	same_ptr:
    31		MOVB	$1, ret+16(FP)
    32		RET
    33	
    34	// memeqbody compares BX bytes at SI (a) with BX bytes at DI (b) and stores 1 (equal) or 0 (different) in the byte at AX.
    35	// In: a in SI, b in DI, count in BX, address of result byte in AX.
    36	// Strategy by count: 0-7 -> small (one masked 8-byte load per side), 8-63 -> bigloop (8 bytes per step), >= 64 -> hugeloop / hugeloop_avx2 (64 bytes per step).
    37	// Clobbers SI, DI, BX, CX, DX, flags, and X0-X7 (xmm path) or Y0-Y6 (ymm path).
    38	TEXT memeqbody<>(SB),NOSPLIT,$0-0
    39		CMPQ	BX, $8
    40		JB	small	// count < 8
    41		CMPQ	BX, $64
    42		JB	bigloop	// 8 <= count < 64
    43		CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
    44		JE	hugeloop_avx2	// HasAVX2 byte is 1: take the ymm loop
    45	
    46		// 64 bytes at a time using xmm registers
    47	hugeloop:
    48		CMPQ	BX, $64
    49		JB	bigloop	// fewer than 64 bytes left: finish in the 8-byte loop
    50		MOVOU	(SI), X0
    51		MOVOU	(DI), X1
    52		MOVOU	16(SI), X2
    53		MOVOU	16(DI), X3
    54		MOVOU	32(SI), X4
    55		MOVOU	32(DI), X5
    56		MOVOU	48(SI), X6
    57		MOVOU	48(DI), X7
    58		PCMPEQB	X1, X0	// per-byte compare of each a/b pair; results land in X0, X2, X4, X6
    59		PCMPEQB	X3, X2
    60		PCMPEQB	X5, X4
    61		PCMPEQB	X7, X6
    62		PAND	X2, X0	// fold the four compare masks into X0
    63		PAND	X6, X4
    64		PAND	X4, X0
    65		PMOVMSKB X0, DX	// DX = one bit per byte of the folded mask (16 bits)
    66		ADDQ	$64, SI	// advance before CMPL: ADDQ/SUBQ overwrite the flags JEQ needs
    67		ADDQ	$64, DI
    68		SUBQ	$64, BX
    69		CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes matched
    70		JEQ	hugeloop
    71		MOVB	$0, (AX)	// mismatch found
    72		RET
    73	
    74		// 64 bytes at a time using ymm registers
    75	hugeloop_avx2:
    76		CMPQ	BX, $64
    77		JB	bigloop_avx2	// fewer than 64 bytes left: leave the AVX2 loop
    78		VMOVDQU	(SI), Y0
    79		VMOVDQU	(DI), Y1
    80		VMOVDQU	32(SI), Y2
    81		VMOVDQU	32(DI), Y3
    82		VPCMPEQB	Y1, Y0, Y4	// Y4 = per-byte compare of bytes 0-31
    83		VPCMPEQB	Y2, Y3, Y5	// Y5 = per-byte compare of bytes 32-63
    84		VPAND	Y4, Y5, Y6	// Y6 = both halves folded together
    85		VPMOVMSKB Y6, DX	// DX = one bit per byte of Y6 (32 bits)
    86		ADDQ	$64, SI	// advance before CMPL: ADDQ/SUBQ overwrite the flags JEQ needs
    87		ADDQ	$64, DI
    88		SUBQ	$64, BX
    89		CMPL	DX, $0xffffffff	// all 32 bits set <=> all 64 bytes matched
    90		JEQ	hugeloop_avx2
    91		VZEROUPPER	// zero the upper ymm halves before leaving AVX code
    92		MOVB	$0, (AX)	// mismatch found
    93		RET
    94	
    95	bigloop_avx2:
    96		VZEROUPPER	// zero the upper ymm halves, then fall through to the 8-byte loop
    97	
    98		// 8 bytes at a time using 64-bit register
    99	bigloop:
   100		CMPQ	BX, $8
   101		JBE	leftover	// 8 or fewer bytes left: one final 8-byte compare
   102		MOVQ	(SI), CX
   103		MOVQ	(DI), DX
   104		ADDQ	$8, SI	// advance before CMPQ: ADDQ/SUBQ overwrite the flags JEQ needs
   105		ADDQ	$8, DI
   106		SUBQ	$8, BX
   107		CMPQ	CX, DX
   108		JEQ	bigloop
   109		MOVB	$0, (AX)	// mismatch found
   110		RET
   111	
   112		// remaining 0-8 bytes. Only reached when the original count was >= 8, so the 8 bytes ending at SI+BX and DI+BX lie inside the buffers; they may overlap bytes already compared.
   113	leftover:
   114		MOVQ	-8(SI)(BX*1), CX	// last 8 bytes of a
   115		MOVQ	-8(DI)(BX*1), DX	// last 8 bytes of b
   116		CMPQ	CX, DX
   117		SETEQ	(AX)	// result = 1 if the final words match, else 0
   118		RET
   119	
   120	small:
   121		CMPQ	BX, $0
   122		JEQ	equal	// count == 0: ZF is set by the CMPQ, so SETEQ at equal stores 1
   123	
   124		LEAQ	0(BX*8), CX	// CX = 8*count = number of wanted bits
   125		NEGQ	CX	// CX = -8*count; 64-bit shifts use only the low 6 bits, i.e. 64 - 8*count
   126	
   127		CMPB	SI, $0xf8	// test the low byte of address a
   128		JA	si_high
   129	
   130		// load at SI won't cross a page boundary. The wanted bytes are the low count bytes; the rest is discarded by SHLQ below.
   131		MOVQ	(SI), SI
   132		JMP	si_finish
   133	si_high:
   134		// low address byte is 0xf9-0xff. Load the 8 bytes that end at SI+BX instead, then shift the wanted bytes down to the low end.
   135		MOVQ	-8(SI)(BX*1), SI	// starts before a, but stays within the same 256-byte-aligned region
   136		SHRQ	CX, SI	// wanted bytes now occupy the low count bytes, upper bytes are zero
   137	si_finish:
   138	
   139		// same for DI.
   140		CMPB	DI, $0xf8	// test the low byte of address b
   141		JA	di_high
   142		MOVQ	(DI), DI
   143		JMP	di_finish
   144	di_high:
   145		MOVQ	-8(DI)(BX*1), DI
   146		SHRQ	CX, DI
   147	di_finish:
   148	
   149		SUBQ	SI, DI	// low count bytes of the difference are zero iff the wanted bytes match
   150		SHLQ	CX, DI	// shift the unwanted upper bytes out; ZF = 1 iff the wanted bytes match
   151	equal:
   152		SETEQ	(AX)	// ZF comes from CMPQ BX, $0 (empty input) or from SHLQ above
   153		RET
   154	

View as plain text