Text file src/runtime/memmove_386.s

     1	// Inferno's libkern/memmove-386.s
     2	// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
     3	//
     4	//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     5	//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6	//         Portions Copyright 2009 The Go Authors. All rights reserved.
     7	//
     8	// Permission is hereby granted, free of charge, to any person obtaining a copy
     9	// of this software and associated documentation files (the "Software"), to deal
    10	// in the Software without restriction, including without limitation the rights
    11	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12	// copies of the Software, and to permit persons to whom the Software is
    13	// furnished to do so, subject to the following conditions:
    14	//
    15	// The above copyright notice and this permission notice shall be included in
    16	// all copies or substantial portions of the Software.
    17	//
    18	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24	// THE SOFTWARE.
    25	
    26	// +build !plan9
    27	
    28	#include "go_asm.h"
    29	#include "textflag.h"
    30	
    31	// func memmove(to, from unsafe.Pointer, n uintptr)
      	//
      	// Copies n bytes from "from" to "to". The regions may overlap: the
      	// straight-line cases load every byte into registers before storing
      	// anything, and the REP-based path copies back to front when the
      	// destination starts inside the source.
      	//
      	// Register use after the argument loads:
      	//	DI = destination pointer (to)
      	//	SI = source pointer (from)
      	//	BX = bytes still to copy
      	//	AX, CX, DX, BP and X0-X7 are scratch.
      	//
      	// NOTE(review): the forward REP paths never execute CLD themselves, so
      	// they appear to rely on the direction flag being clear on entry --
      	// confirm against the Go calling convention.
    32	TEXT runtime·memmove(SB), NOSPLIT, $0-12
    33		MOVL	to+0(FP), DI	// DI = destination pointer
    34		MOVL	from+4(FP), SI	// SI = source pointer
    35		MOVL	n+8(FP), BX	// BX = byte count
    36	
    37		// REP instructions have a high startup cost, so we handle small sizes
    38		// with some straightline code. The REP MOVSL instruction is really fast
    39		// for large sizes. The cutover is approximately 1K.  We implement up to
    40		// 128 because that is the maximum SSE register load (loading all data
    41		// into registers lets us ignore copy direction).
    42	tail:
      		// Dispatch on the number of bytes left in BX. The REP MOVSL loops
      		// below jump back here with BX reduced to 0-3 so that the leftover
      		// bytes are finished by the same straight-line code.
    43		// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
    44		TESTL	BX, BX
    45		JEQ	move_0	// n == 0
    46		CMPL	BX, $2
    47		JBE	move_1or2	// n is 1 or 2 (0 was handled above)
    48		CMPL	BX, $4
    49		JB	move_3	// n == 3
    50		JE	move_4	// n == 4; uses the flags of the same CMPL
    51		CMPL	BX, $8
    52		JBE	move_5through8
    53		CMPL	BX, $16
    54		JBE	move_9through16
    55		CMPB	internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1	// the cases above 16 bytes use X registers
    56		JNE	nosse2
    57		CMPL	BX, $32
    58		JBE	move_17through32
    59		CMPL	BX, $64
    60		JBE	move_33through64
    61		CMPL	BX, $128
    62		JBE	move_65through128
    63	
    64	nosse2:
      		// Reached when n > 128, or when n > 16 and the SSE2 flag byte is not 1.
    65	/*
    66	 * check and set for backwards:
      	 * when from <= to the destination may start inside the source,
      	 * so go and check for overlap before picking a direction
    67	 */
    68		CMPL	SI, DI
    69		JLS	back
    70	
    71	/*
    72	 * forward copy loop
    73	 */
    74	forward:
    75		// If REP MOVSB isn't fast, don't use it
    76		CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
    77		JNE	fwdBy4
    78	
    79		// Check alignment: if both pointers are 4-byte aligned, copy by 4-byte words instead
    80		MOVL	SI, AX
    81		ORL	DI, AX	// low two bits are set if either pointer is misaligned
    82		TESTL	$3, AX
    83		JEQ	fwdBy4
    84	
    85		// Do 1 byte at a time
    86		MOVL	BX, CX	// REP count = n bytes
    87		REP;	MOVSB
    88		RET
    89	
    90	fwdBy4:
    91		// Do 4 bytes at a time
    92		MOVL	BX, CX
    93		SHRL	$2, CX	// CX = n / 4, the REP MOVSL count
    94		ANDL	$3, BX	// BX = n % 4, bytes left over afterwards
    95		REP;	MOVSL
    96		JMP	tail	// finish the 0-3 leftover bytes
    97	
    98	/*
    99	 * check overlap
   100	 */
   101	back:
   102		MOVL	SI, CX
   103		ADDL	BX, CX	// CX = from + n, one past the end of the source
   104		CMPL	CX, DI
   105		JLS	forward	// source ends at or before the destination: no overlap, copy forward
   106	/*
   107	 * whole thing backwards has
   108	 * adjusted addresses
   109	 */
   110	
   111		ADDL	BX, DI	// DI = to + n
   112		ADDL	BX, SI	// SI = from + n
   113		STD	// set the direction flag: REP MOVSL now walks downwards
   114	
   115	/*
   116	 * copy
   117	 */
   118		MOVL	BX, CX
   119		SHRL	$2, CX	// CX = n / 4, the REP MOVSL count
   120		ANDL	$3, BX	// BX = n % 4, bytes left over afterwards
   121	
   122		SUBL	$4, DI	// point both registers at the last 4-byte word of their region
   123		SUBL	$4, SI
   124		REP;	MOVSL
   125	
   126		CLD	// clear the direction flag again
   127		ADDL	$4, DI	// undo the -4 bias: DI/SI = lowest address copied so far
   128		ADDL	$4, SI
   129		SUBL	BX, DI	// step back over the n % 4 uncopied leading bytes: DI = to, SI = from
   130		SUBL	BX, SI
   131		JMP	tail	// finish the 0-3 leading bytes
   132	
   133	move_1or2:	// first and last byte (the same byte when n == 1), both loaded before either store
   134		MOVB	(SI), AX
   135		MOVB	-1(SI)(BX*1), CX
   136		MOVB	AX, (DI)
   137		MOVB	CX, -1(DI)(BX*1)
   138		RET
   139	move_0:	// nothing to copy
   140		RET
   141	move_3:	// a 2-byte word at offset 0 plus the byte at offset 2
   142		MOVW	(SI), AX
   143		MOVB	2(SI), CX
   144		MOVW	AX, (DI)
   145		MOVB	CX, 2(DI)
   146		RET
   147	move_4:
   148		// We need a separate case for 4 to make sure we write pointers atomically.
   149		MOVL	(SI), AX
   150		MOVL	AX, (DI)
   151		RET
   152	move_5through8:	// first 4 and last 4 bytes; the two words overlap when n < 8
   153		MOVL	(SI), AX
   154		MOVL	-4(SI)(BX*1), CX
   155		MOVL	AX, (DI)
   156		MOVL	CX, -4(DI)(BX*1)
   157		RET
   158	move_9through16:	// first 8 and last 8 bytes as four 4-byte words; they overlap when n < 16
   159		MOVL	(SI), AX
   160		MOVL	4(SI), CX
   161		MOVL	-8(SI)(BX*1), DX
   162		MOVL	-4(SI)(BX*1), BP
   163		MOVL	AX, (DI)
   164		MOVL	CX, 4(DI)
   165		MOVL	DX, -8(DI)(BX*1)
   166		MOVL	BP, -4(DI)(BX*1)
   167		RET
   168	move_17through32:	// first 16 and last 16 bytes through X0/X1; they overlap when n < 32
   169		MOVOU	(SI), X0
   170		MOVOU	-16(SI)(BX*1), X1
   171		MOVOU	X0, (DI)
   172		MOVOU	X1, -16(DI)(BX*1)
   173		RET
   174	move_33through64:	// first 32 and last 32 bytes through X0-X3; they overlap when n < 64
   175		MOVOU	(SI), X0
   176		MOVOU	16(SI), X1
   177		MOVOU	-32(SI)(BX*1), X2
   178		MOVOU	-16(SI)(BX*1), X3
   179		MOVOU	X0, (DI)
   180		MOVOU	X1, 16(DI)
   181		MOVOU	X2, -32(DI)(BX*1)
   182		MOVOU	X3, -16(DI)(BX*1)
   183		RET
   184	move_65through128:	// first 64 and last 64 bytes through X0-X7; they overlap when n < 128
   185		MOVOU	(SI), X0
   186		MOVOU	16(SI), X1
   187		MOVOU	32(SI), X2
   188		MOVOU	48(SI), X3
   189		MOVOU	-64(SI)(BX*1), X4
   190		MOVOU	-48(SI)(BX*1), X5
   191		MOVOU	-32(SI)(BX*1), X6
   192		MOVOU	-16(SI)(BX*1), X7
   193		MOVOU	X0, (DI)
   194		MOVOU	X1, 16(DI)
   195		MOVOU	X2, 32(DI)
   196		MOVOU	X3, 48(DI)
   197		MOVOU	X4, -64(DI)(BX*1)
   198		MOVOU	X5, -48(DI)(BX*1)
   199		MOVOU	X6, -32(DI)(BX*1)
   200		MOVOU	X7, -16(DI)(BX*1)
   201		RET
View as plain text