// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse. Not worth doing.
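	//
	// For orientation only, the size dispatch below corresponds roughly to the
	// following Go sketch. sizeClass is a hypothetical helper written just for
	// this comment; it is not part of the runtime:
	//
	//	func sizeClass(n uintptr) string {
	//		switch {
	//		case n == 0:
	//			return "move_0"
	//		case n <= 2:
	//			return "move_1or2"
	//		case n == 3:
	//			return "move_3"
	//		case n == 4:
	//			return "move_4"
	//		case n <= 7:
	//			return "move_5through7"
	//		case n == 8:
	//			return "move_8" // separate case: pointer-sized writes must not tear
	//		case n <= 16:
	//			return "move_9through16"
	//		case n <= 32:
	//			return "move_17through32"
	//		case n <= 64:
	//			return "move_33through64"
	//		case n <= 128:
	//			return "move_65through128"
	//		case n <= 256:
	//			return "move_129through256"
	//		default:
	//			return "large (AVX or REP paths below)"
	//		}
	//	}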
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail
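
	// A Go sketch of the choice made above, for illustration only; hasERMS is
	// a hypothetical stand-in for the internal/cpu ERMS feature bit read by
	// the CMPB above:
	//
	//	// useREPMOVSB reports whether the forward path uses REP MOVSB.
	//	// With enhanced REP MOVSB/STOSB, byte-granular MOVSB handles
	//	// misaligned copies well; when source and destination are both
	//	// 8-byte aligned, the REP MOVSQ path (fwdBy8) is used instead.
	//	func useREPMOVSB(hasERMS bool, to, from uintptr) bool {
	//		return hasERMS && (to|from)&7 != 0
	//	}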

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail
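
	// The overlap test above, as a Go sketch (illustrative only; the real
	// check operates on the raw SI/DI/BX registers):
	//
	//	// needBackward reports whether the copy must run from high
	//	// addresses to low: only when the source begins at or below the
	//	// destination and the two n-byte regions actually overlap.
	//	func needBackward(to, from, n uintptr) bool {
	//		return from <= to && from+n > to
	//	}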

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first is for non-overlapping memory regions; it uses forward copying.
	// The second is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
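	// A Go sketch of the unsigned-distance trick above (illustrative only):
	//
	//	// overlapsForward reports whether a plain forward copy of n bytes
	//	// would overwrite source bytes before they are read. When to < from,
	//	// the unsigned subtraction wraps around to a huge value that is
	//	// never below n, so only the dangerous layout (to slightly above
	//	// from) takes the backward path.
	//	func overlapsForward(to, from, n uintptr) bool {
	//		return to-from < n
	//	}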

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it can be important to satisfy the processor's pipeline
	// requirements, because the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop, so the steps below are
	// slightly interleaved. There is a cleaner implementation of the same
	// algorithm for bigger sizes, where the cost of the unaligned parts is
	// negligible; see it after the gobble_big_data_fwd label.
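	//
	// A byte-slice sketch of the same scheme (illustrative only; the real
	// code keeps the head and tail in vector registers rather than in
	// temporary slices, and aligns the destination address, not a slice index):
	//
	//	// copyHeadBodyTail assumes len(dst) == len(src) >= 256 and that the
	//	// regions do not overlap. It mirrors the order used below: save the
	//	// tail and head first, copy the body, then write the head and tail
	//	// last.
	//	func copyHeadBodyTail(dst, src []byte) {
	//		n := len(src)
	//		head := append([]byte(nil), src[:32]...)
	//		tail := append([]byte(nil), src[n-128:]...)
	//		copy(dst[32:n-128], src[32:n-128]) // body
	//		copy(dst[:32], head)               // head
	//		copy(dst[n-128:], tail)            // tail
	//	}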
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we use negative offsets to reach back into it.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value now that the unaligned part is prepared for copying.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// Now save the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copying of the body follows.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of the algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch distances were chosen empirically.
	// The prefetch approach follows section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT stores don't follow the normal write-ordering rules, so we need an
	// SFENCE here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET
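
	// The non-temporal threshold used in this file, as a Go sketch
	// (illustrative only; the constant comes from the CMPQ $0x100000 checks
	// above and in copy_backward below):
	//
	//	// useNonTemporal reports whether the copy is large enough that
	//	// streaming the destination past the cache (VMOVNTDQ + SFENCE)
	//	// is preferred over ordinary cached stores.
	//	func useNonTemporal(n uintptr) bool {
	//		return n >= 1<<20 // 0x100000 bytes; the backward path uses a strict >
	//	}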

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward one.
	// First we load the unaligned tail at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
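
	// Why copying from high addresses to low is safe when the regions overlap
	// with the source below the destination, as a Go sketch (illustrative
	// only; the real loops move 128 bytes per iteration, not one):
	//
	//	// backwardCopy writes the highest byte first, so when from < to
	//	// every source byte is read before the growing destination region
	//	// reaches it.
	//	func backwardCopy(dst, src []byte) {
	//		for i := len(src) - 1; i >= 0; i-- {
	//			dst[i] = src[i]
	//		}
	//	}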

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ Y0, -0x20(DI)
	VMOVNTDQ Y1, -0x40(DI)
	VMOVNTDQ Y2, -0x60(DI)
	VMOVNTDQ Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET