// Source file: src/internal/bytealg/compare_amd64.s
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

8 TEXT ·Compare(SB),NOSPLIT,$0-56
9 MOVQ a_base+0(FP), SI
10 MOVQ a_len+8(FP), BX
11 MOVQ b_base+24(FP), DI
12 MOVQ b_len+32(FP), DX
13 LEAQ ret+48(FP), R9
14 JMP cmpbody<>(SB)
16 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
17 MOVQ a_base+0(FP), SI
18 MOVQ a_len+8(FP), BX
19 MOVQ b_base+16(FP), DI
20 MOVQ b_len+24(FP), DX
21 LEAQ ret+32(FP), R9
22 JMP cmpbody<>(SB)
24 // input:
25 // SI = a
26 // DI = b
27 // BX = alen
28 // DX = blen
29 // R9 = address of output word (stores -1/0/1 here)
30 TEXT cmpbody<>(SB),NOSPLIT,$0-0
31 CMPQ SI, DI
32 JEQ allsame
33 CMPQ BX, DX
34 MOVQ DX, R8
35 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
36 CMPQ R8, $8
37 JB small
38
39 CMPQ R8, $63
40 JBE loop
41 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
42 JEQ big_loop_avx2
43 JMP big_loop
44 loop:
45 CMPQ R8, $16
46 JBE _0through16
47 MOVOU (SI), X0
48 MOVOU (DI), X1
49 PCMPEQB X0, X1
50 PMOVMSKB X1, AX
51 XORQ $0xffff, AX // convert EQ to NE
52 JNE diff16 // branch if at least one byte is not equal
53 ADDQ $16, SI
54 ADDQ $16, DI
55 SUBQ $16, R8
56 JMP loop
57
58 diff64:
59 ADDQ $48, SI
60 ADDQ $48, DI
61 JMP diff16
62 diff48:
63 ADDQ $32, SI
64 ADDQ $32, DI
65 JMP diff16
66 diff32:
67 ADDQ $16, SI
68 ADDQ $16, DI
69 // AX = bit mask of differences
70 diff16:
71 BSFQ AX, BX // index of first byte that differs
72 XORQ AX, AX
73 MOVB (SI)(BX*1), CX
74 CMPB CX, (DI)(BX*1)
75 SETHI AX
76 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
77 MOVQ AX, (R9)
78 RET
79
80 // 0 through 16 bytes left, alen>=8, blen>=8
81 _0through16:
82 CMPQ R8, $8
83 JBE _0through8
84 MOVQ (SI), AX
85 MOVQ (DI), CX
86 CMPQ AX, CX
87 JNE diff8
88 _0through8:
89 MOVQ -8(SI)(R8*1), AX
90 MOVQ -8(DI)(R8*1), CX
91 CMPQ AX, CX
92 JEQ allsame
93
94 // AX and CX contain parts of a and b that differ.
95 diff8:
96 BSWAPQ AX // reverse order of bytes
97 BSWAPQ CX
98 XORQ AX, CX
99 BSRQ CX, CX // index of highest bit difference
100 SHRQ CX, AX // move a's bit to bottom
101 ANDQ $1, AX // mask bit
102 LEAQ -1(AX*2), AX // 1/0 => +1/-1
103 MOVQ AX, (R9)
104 RET
105
106 // 0-7 bytes in common
107 small:
108 LEAQ (R8*8), CX // bytes left -> bits left
109 NEGQ CX // - bits lift (== 64 - bits left mod 64)
110 JEQ allsame
111
112 // load bytes of a into high bytes of AX
113 CMPB SI, $0xf8
114 JA si_high
115 MOVQ (SI), SI
116 JMP si_finish
117 si_high:
118 MOVQ -8(SI)(R8*1), SI
119 SHRQ CX, SI
120 si_finish:
121 SHLQ CX, SI
122
123 // load bytes of b in to high bytes of BX
124 CMPB DI, $0xf8
125 JA di_high
126 MOVQ (DI), DI
127 JMP di_finish
128 di_high:
129 MOVQ -8(DI)(R8*1), DI
130 SHRQ CX, DI
131 di_finish:
132 SHLQ CX, DI
133
134 BSWAPQ SI // reverse order of bytes
135 BSWAPQ DI
136 XORQ SI, DI // find bit differences
137 JEQ allsame
138 BSRQ DI, CX // index of highest bit difference
139 SHRQ CX, SI // move a's bit to bottom
140 ANDQ $1, SI // mask bit
141 LEAQ -1(SI*2), AX // 1/0 => +1/-1
142 MOVQ AX, (R9)
143 RET
144
145 allsame:
146 XORQ AX, AX
147 XORQ CX, CX
148 CMPQ BX, DX
149 SETGT AX // 1 if alen > blen
150 SETEQ CX // 1 if alen == blen
151 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
152 MOVQ AX, (R9)
153 RET
154
155 // this works for >= 64 bytes of data.
156 big_loop:
157 MOVOU (SI), X0
158 MOVOU (DI), X1
159 PCMPEQB X0, X1
160 PMOVMSKB X1, AX
161 XORQ $0xffff, AX
162 JNE diff16
163
164 MOVOU 16(SI), X0
165 MOVOU 16(DI), X1
166 PCMPEQB X0, X1
167 PMOVMSKB X1, AX
168 XORQ $0xffff, AX
169 JNE diff32
170
171 MOVOU 32(SI), X0
172 MOVOU 32(DI), X1
173 PCMPEQB X0, X1
174 PMOVMSKB X1, AX
175 XORQ $0xffff, AX
176 JNE diff48
177
178 MOVOU 48(SI), X0
179 MOVOU 48(DI), X1
180 PCMPEQB X0, X1
181 PMOVMSKB X1, AX
182 XORQ $0xffff, AX
183 JNE diff64
184
185 ADDQ $64, SI
186 ADDQ $64, DI
187 SUBQ $64, R8
188 CMPQ R8, $64
189 JBE loop
190 JMP big_loop
191
192 // Compare 64-bytes per loop iteration.
193 // Loop is unrolled and uses AVX2.
194 big_loop_avx2:
195 VMOVDQU (SI), Y2
196 VMOVDQU (DI), Y3
197 VMOVDQU 32(SI), Y4
198 VMOVDQU 32(DI), Y5
199 VPCMPEQB Y2, Y3, Y0
200 VPMOVMSKB Y0, AX
201 XORL $0xffffffff, AX
202 JNE diff32_avx2
203 VPCMPEQB Y4, Y5, Y6
204 VPMOVMSKB Y6, AX
205 XORL $0xffffffff, AX
206 JNE diff64_avx2
207
208 ADDQ $64, SI
209 ADDQ $64, DI
210 SUBQ $64, R8
211 CMPQ R8, $64
212 JB big_loop_avx2_exit
213 JMP big_loop_avx2
214
215 // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
216 diff32_avx2:
217 VZEROUPPER
218 JMP diff16
219
220 // Same as diff32_avx2, but for last 32 bytes.
221 diff64_avx2:
222 VZEROUPPER
223 JMP diff48
224
225 // For <64 bytes remainder jump to normal loop.
226 big_loop_avx2_exit:
227 VZEROUPPER
228 JMP loop
// (end of file)