// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
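	//
	// For orientation only, the size dispatch below corresponds roughly to the
	// following Go sketch. sizeClass is a hypothetical helper written just for
	// this comment; it is not part of the runtime:
	//
	//	func sizeClass(n uintptr) string {
	//		switch {
	//		case n == 0:
	//			return "move_0"
	//		case n <= 2:
	//			return "move_1or2"
	//		case n == 3:
	//			return "move_3"
	//		case n == 4:
	//			return "move_4"
	//		case n <= 7:
	//			return "move_5through7"
	//		case n == 8:
	//			return "move_8" // separate case: pointer-sized writes must not tear
	//		case n <= 16:
	//			return "move_9through16"
	//		case n <= 32:
	//			return "move_17through32"
	//		case n <= 64:
	//			return "move_33through64"
	//		case n <= 128:
	//			return "move_65through128"
	//		case n <= 256:
	//			return "move_129through256"
	//		default:
	//			return "large (AVX or REP paths below)"
	//		}
	//	}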
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail
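
	// A Go sketch of the choice made above, for illustration only; hasERMS is
	// a hypothetical stand-in for the internal/cpu ERMS feature bit read by
	// the CMPB above:
	//
	//	// useREPMOVSB reports whether the forward path uses REP MOVSB.
	//	// With enhanced REP MOVSB/STOSB, byte-granular MOVSB handles
	//	// misaligned copies well; when source and destination are both
	//	// 8-byte aligned, the REP MOVSQ path (fwdBy8) is used instead.
	//	func useREPMOVSB(hasERMS bool, to, from uintptr) bool {
	//		return hasERMS && (to|from)&7 != 0
	//	}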

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail
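
	// The overlap test above, as a Go sketch (illustrative only; the real
	// check operates on the raw SI/DI/BX registers):
	//
	//	// needBackward reports whether the copy must run from high
	//	// addresses to low: only when the source begins at or below the
	//	// destination and the two n-byte regions actually overlap.
	//	func needBackward(to, from, n uintptr) bool {
	//		return from <= to && from+n > to
	//	}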

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first is for non-overlapping memory regions; it uses forward copying.
	// The second is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
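	// A Go sketch of the unsigned-distance trick above (illustrative only):
	//
	//	// overlapsForward reports whether a plain forward copy of n bytes
	//	// would overwrite source bytes before they are read. When to < from,
	//	// the unsigned subtraction wraps around to a huge value that is
	//	// never below n, so only the dangerous layout (to slightly above
	//	// from) takes the backward path.
	//	func overlapsForward(to, from, n uintptr) bool {
	//		return to-from < n
	//	}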

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it can be important to satisfy the processor's pipeline
	// requirements, because the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop, so the steps below are
	// slightly interleaved. There is a cleaner implementation of the same
	// algorithm for bigger sizes, where the cost of the unaligned parts is
	// negligible; see it after the gobble_big_data_fwd label.
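	//
	// A byte-slice sketch of the same scheme (illustrative only; the real
	// code keeps the head and tail in vector registers rather than in
	// temporary slices, and aligns the destination address, not a slice index):
	//
	//	// copyHeadBodyTail assumes len(dst) == len(src) >= 256 and that the
	//	// regions do not overlap. It mirrors the order used below: save the
	//	// tail and head first, copy the body, then write the head and tail
	//	// last.
	//	func copyHeadBodyTail(dst, src []byte) {
	//		n := len(src)
	//		head := append([]byte(nil), src[:32]...)
	//		tail := append([]byte(nil), src[n-128:]...)
	//		copy(dst[32:n-128], src[32:n-128]) // body
	//		copy(dst[:32], head)               // head
	//		copy(dst[n-128:], tail)            // tail
	//	}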
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we use negative offsets to reach back into it.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value now that the unaligned part is prepared for copying.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// Now save the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copying of the body follows.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of the algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch distances were chosen empirically.
	// The prefetch approach follows section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT stores don't follow the normal write-ordering rules, so we need an
	// SFENCE here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET
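
	// The non-temporal threshold used in this file, as a Go sketch
	// (illustrative only; the constant comes from the CMPQ $0x100000 checks
	// above and in copy_backward below):
	//
	//	// useNonTemporal reports whether the copy is large enough that
	//	// streaming the destination past the cache (VMOVNTDQ + SFENCE)
	//	// is preferred over ordinary cached stores.
	//	func useNonTemporal(n uintptr) bool {
	//		return n >= 1<<20 // 0x100000 bytes; the backward path uses a strict >
	//	}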

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward one.
	// First we load the unaligned tail at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
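
	// Why copying from high addresses to low is safe when the regions overlap
	// with the source below the destination, as a Go sketch (illustrative
	// only; the real loops move 128 bytes per iteration, not one):
	//
	//	// backwardCopy writes the highest byte first, so when from < to
	//	// every source byte is read before the growing destination region
	//	// reaches it.
	//	func backwardCopy(dst, src []byte) {
	//		for i := len(src) - 1; i >= 0; i-- {
	//			dst[i] = src[i]
	//		}
	//	}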

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ Y0, -0x20(DI)
	VMOVNTDQ Y1, -0x40(DI)
	VMOVNTDQ Y2, -0x60(DI)
	VMOVNTDQ Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET