Text file src/pkg/internal/bytealg/equal_amd64.s
1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Stores true in the result slot when a and b are the same pointer.
// Otherwise it hands a, b and size to memeqbody, which compares the
// bytes and writes the result through the address passed in AX.
TEXT runtime·memequal(SB),NOSPLIT,$0-25
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JNE	compare
	// Same address on both sides: equal without reading any memory.
	MOVB	$1, ret+24(FP)
	RET
compare:
	// Register contract of memeqbody: SI = a, DI = b, BX = byte count,
	// AX = address of the result byte.
	MOVQ	size+16(FP), BX
	LEAQ	ret+24(FP), AX
	JMP	memeqbody<>(SB)
20
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Variant whose length is not an argument: the byte count is read from
// offset 8 of the closure addressed by DX. Stores true in the result
// slot when a and b are the same pointer; otherwise memeqbody compares
// the bytes and writes the result through the address passed in AX.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
	MOVQ	a+0(FP), SI
	MOVQ	b+8(FP), DI
	CMPQ	SI, DI
	JNE	compare
	// Same address on both sides: equal without reading any memory.
	MOVB	$1, ret+16(FP)
	RET
compare:
	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
	LEAQ	ret+16(FP), AX    // memeqbody writes the bool through AX
	JMP	memeqbody<>(SB)
33
34 // a in SI (clobbered: advanced by the loops, overwritten with loaded data in small)
35 // b in DI (clobbered the same way as SI)
36 // count in BX (bytes still to compare; decremented by the loops)
37 // address of result byte in AX (receives 1 if all count bytes match, 0 otherwise)
38 TEXT memeqbody<>(SB),NOSPLIT,$0-0 // shared compare body; also uses CX, DX and X0-X7 or Y0-Y6 as scratch
39 CMPQ BX, $8
40 JB small // fewer than 8 bytes: one masked 8-byte compare
41 CMPQ BX, $64
42 JB bigloop // 8..63 bytes: 8 bytes per iteration
43 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 // HasAVX2 flag byte inside internal/cpu's X86 variable
44 JE hugeloop_avx2 // flag is 1: use the ymm loop; otherwise fall through to the xmm loop
45
46 // 64 bytes at a time using xmm registers
47 hugeloop:
48 CMPQ BX, $64
49 JB bigloop // fewer than 64 bytes left: finish in the 8-byte loop
50 MOVOU (SI), X0 // unaligned 16-byte loads: a[0:64] -> X0,X2,X4,X6 and b[0:64] -> X1,X3,X5,X7
51 MOVOU (DI), X1
52 MOVOU 16(SI), X2
53 MOVOU 16(DI), X3
54 MOVOU 32(SI), X4
55 MOVOU 32(DI), X5
56 MOVOU 48(SI), X6
57 MOVOU 48(DI), X7
58 PCMPEQB X1, X0 // bytewise compare: each result byte is 0xff where equal, 0x00 where different
59 PCMPEQB X3, X2
60 PCMPEQB X5, X4
61 PCMPEQB X7, X6
62 PAND X2, X0 // AND the four compare results together into X0
63 PAND X6, X4
64 PAND X4, X0
65 PMOVMSKB X0, DX // collect the top bit of each of the 16 bytes into DX
66 ADDQ $64, SI // step past this chunk before testing the mask
67 ADDQ $64, DI
68 SUBQ $64, BX
69 CMPL DX, $0xffff // all 16 mask bits set means all 64 bytes matched
70 JEQ hugeloop
71 MOVB $0, (AX) // mismatch: result = false
72 RET
73
74 // 64 bytes at a time using ymm registers
75 hugeloop_avx2:
76 CMPQ BX, $64
77 JB bigloop_avx2 // fewer than 64 bytes left: leave the AVX loop through VZEROUPPER
78 VMOVDQU (SI), Y0 // unaligned 32-byte loads: a[0:64] -> Y0,Y2 and b[0:64] -> Y1,Y3
79 VMOVDQU (DI), Y1
80 VMOVDQU 32(SI), Y2
81 VMOVDQU 32(DI), Y3
82 VPCMPEQB Y1, Y0, Y4 // Y4 = bytewise equality mask for bytes 0..31
83 VPCMPEQB Y2, Y3, Y5 // Y5 = bytewise equality mask for bytes 32..63
84 VPAND Y4, Y5, Y6 // combine both halves
85 VPMOVMSKB Y6, DX // collect the top bit of each of the 32 bytes into DX
86 ADDQ $64, SI // step past this chunk before testing the mask
87 ADDQ $64, DI
88 SUBQ $64, BX
89 CMPL DX, $0xffffffff // all 32 mask bits set means all 64 bytes matched
90 JEQ hugeloop_avx2
91 VZEROUPPER // clear the upper ymm halves before returning to non-AVX code
92 MOVB $0, (AX) // mismatch: result = false
93 RET
94
95 bigloop_avx2:
96 VZEROUPPER // clear the upper ymm halves, then fall through into bigloop
97
98 // 8 bytes at a time using 64-bit register
99 bigloop:
100 CMPQ BX, $8
101 JBE leftover // 8 or fewer bytes left: one final 8-byte compare
102 MOVQ (SI), CX
103 MOVQ (DI), DX
104 ADDQ $8, SI
105 ADDQ $8, DI
106 SUBQ $8, BX
107 CMPQ CX, DX
108 JEQ bigloop
109 MOVB $0, (AX) // mismatch: result = false
110 RET
111
112 // remaining 0-8 bytes; this path is only reached when the original count was at least 8
113 leftover:
114 MOVQ -8(SI)(BX*1), CX // the 8 bytes ending at SI+BX; when BX < 8 this re-reads bytes that already compared equal
115 MOVQ -8(DI)(BX*1), DX
116 CMPQ CX, DX
117 SETEQ (AX) // result = 1 if the final 8 bytes match, else 0
118 RET
119
120 small:
121 CMPQ BX, $0
122 JEQ equal // count == 0: ZF is set here, so SETEQ at equal stores 1
123
124 LEAQ 0(BX*8), CX // CX = 8*count, the number of bits to compare
125 NEGQ CX // CX = -8*count; 64-bit shifts use only the low 6 bits of the count, so this shifts by 64-8*count
126
127 CMPB SI, $0xf8 // looks only at the low byte of the address
128 JA si_high // low byte 0xf9..0xff: an 8-byte load starting here would run past the next 256-byte boundary
129
130 // load at SI won't cross a page boundary.
131 MOVQ (SI), SI // low count bytes are wanted; the bytes above them are discarded by SHLQ below
132 JMP si_finish
133 si_high:
134 // address ends in 11111xxx. Load up to bytes we want, move to correct position.
135 MOVQ -8(SI)(BX*1), SI // the 8 bytes ending at a+count
136 SHRQ CX, SI // shift the wanted bytes down to the low end; the bytes above become zero
137 si_finish:
138
139 // same for DI.
140 CMPB DI, $0xf8
141 JA di_high
142 MOVQ (DI), DI
143 JMP di_finish
144 di_high:
145 MOVQ -8(DI)(BX*1), DI
146 SHRQ CX, DI
147 di_finish:
148
149 SUBQ SI, DI // low 8*count bits of the difference are zero iff the wanted bytes match
150 SHLQ CX, DI // shift out the unwanted high bytes; ZF is set iff the wanted bytes matched
151 equal:
152 SETEQ (AX) // result = ZF, from SHLQ above or from CMPQ BX, $0 when count was zero
153 RET
154
View as plain text