// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Zeroes n bytes starting at ptr. Must not be used on memory containing
// heap pointers (no write barriers are issued).
//
// Register use (Go/Plan 9 asm, amd64):
//	DI = ptr  — current write position; advanced only by the wide loops
//	BX = n    — bytes remaining
//	AX = 0    — scalar zero source for the small cases
//	X0 = 0    — 16-byte zero source (SSE paths); Y0 = 0 for the AVX2 paths
// The tail cases below overlap their stores (e.g. (DI) and -1(DI)(BX*1))
// so each size range is handled branch-free after dispatch.
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI
	MOVQ	n+8(FP), BX
	XORQ	AX, AX

	// MOVOU seems always faster than REP STOSQ.
tail:
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	PXOR	X0, X0
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE path: clear 256 bytes per iteration, then re-dispatch the
	// remainder (< 256 bytes) through the tail cases.
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail

loop_preheader_avx2:
	VPXOR	Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ	BX, $0x2000000
	JAE	loop_preheader_avx2_huge
loop_avx2:
	// AVX2 path: 128 bytes per iteration; the final four overlapping
	// stores below handle the last (<=128-byte) remainder.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET
loop_preheader_avx2_huge:
	// Align to 32 byte boundary: clear the first (possibly unaligned)
	// 32 bytes, round DI up, and fold the skipped bytes back into BX.
	VMOVDQU	Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	// Non-temporal stores (cache-bypassing) for very large clears;
	// VMOVNTDQ requires the 32-byte alignment established above.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET