...

Text file src/runtime/memmove_ppc64x.s

     1	// Copyright 2014 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build ppc64 ppc64le
     6	
     7	#include "textflag.h"
     8	
     9	// func memmove(to, from unsafe.Pointer, n uintptr)
    10	TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
    11		MOVD	to+0(FP), R3
    12		MOVD	from+8(FP), R4
    13		MOVD	n+16(FP), R5
    14	
    15		// Determine if there are doublewords to
    16		// copy so a more efficient move can be done
    17	check:
    18		ANDCC	$7, R5, R7	// R7: bytes to copy
    19		SRD	$3, R5, R6	// R6: double words to copy
    20		CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
    21	
    22		// Determine overlap by subtracting dest - src and comparing against the
    23		// length.  The catches the cases where src and dest are in different types
    24		// of storage such as stack and static to avoid doing backward move when not
    25		// necessary.
    26	
    27		SUB	R4, R3, R8	// dest - src
    28		CMPU	R8, R5, CR2	// < len?
    29		BC	12, 8, backward // BLT CR2 backward
    30	
    31		// Copying forward if no overlap.
    32	
    33		BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
    34		SRDCC	$2,R6,R8		// 32 byte chunks?
    35		BNE	forward32setup		//
    36		MOVD	R6,CTR			// R6 = number of double words
    37	
    38		// Move double words
    39	
    40	forward8:
    41		MOVD    0(R4), R8		// double word
    42		ADD     $8,R4
    43		MOVD    R8, 0(R3)		//
    44		ADD     $8,R3
    45		BC      16, 0, forward8
    46		BR	noforwardlarge		// handle remainder
    47	
    48		// Prepare for moves of 32 bytes at a time.
    49	
    50	forward32setup:
    51		DCBTST	(R3)			// prepare data cache
    52		DCBT	(R4)
    53		MOVD	R8, CTR			// double work count
    54		MOVD	$16, R8
    55	
    56	forward32:
    57		LXVD2X	(R4+R0), VS32		// load 16 bytes
    58		LXVD2X	(R4+R8), VS33
    59		ADD	$32, R4
    60		STXVD2X	VS32, (R3+R0)		// store 16 bytes
    61		STXVD2X	VS33, (R3+R8)
    62		ADD	$32,R3			// bump up for next set
    63		BC	16, 0, forward32	// continue
    64		RLDCLCC	$61,R5,$3,R6		// remaining doublewords
    65		BEQ	noforwardlarge
    66		MOVD	R6,CTR			// set up the CTR
    67		BR	forward8
    68	
    69	noforwardlarge:
    70		CMP	R7,$0			// any remaining bytes
    71		BC	4, 1, LR		// ble lr
    72	
    73	forwardtail:
    74		MOVD	R7, CTR			// move tail bytes
    75	
    76	forwardtailloop:
    77		MOVBZ	0(R4), R8		// move single bytes
    78		ADD	$1,R4
    79		MOVBZ	R8, 0(R3)
    80		ADD	$1,R3
    81		BC	16, 0, forwardtailloop
    82		RET
    83	
    84	backward:
    85		// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
    86		// R3 and R4 are advanced to the end of the destination/source buffers
    87		// respectively and moved back as we copy.
    88	
    89		ADD	R5, R4, R4		// end of source
    90		ADD	R3, R5, R3		// end of dest
    91	
    92		BEQ	nobackwardtail		// earlier condition
    93	
    94		MOVD	R7, CTR			// bytes to move
    95	
    96	backwardtailloop:
    97		MOVBZ 	-1(R4), R8		// point to last byte
    98		SUB	$1,R4
    99		MOVBZ 	R8, -1(R3)
   100		SUB	$1,R3
   101		BC	16, 0, backwardtailloop // bndz
   102	
   103	nobackwardtail:
   104		BC	4, 5, LR		// ble CR1 lr
   105	
   106	backwardlarge:
   107		MOVD	R6, CTR
   108		SUB	R3, R4, R9		// Use vsx if moving
   109		CMP	R9, $32			// at least 32 byte chunks
   110		BLT	backwardlargeloop	// and distance >= 32
   111		SRDCC	$2,R6,R8		// 32 byte chunks
   112		BNE	backward32setup
   113	
   114	backwardlargeloop:
   115		MOVD 	-8(R4), R8
   116		SUB	$8,R4
   117		MOVD 	R8, -8(R3)
   118		SUB	$8,R3
   119		BC	16, 0, backwardlargeloop // bndz
   120		RET
   121	
   122	backward32setup:
   123		MOVD	R8, CTR			// set up loop ctr
   124		MOVD	$16, R8			// 32 bytes at at time
   125	
   126	backward32loop:
   127		SUB	$32, R4
   128		SUB	$32, R3
   129		LXVD2X	(R4+R0), VS32           // load 16 bytes
   130		LXVD2X	(R4+R8), VS33
   131		STXVD2X	VS32, (R3+R0)           // store 16 bytes
   132		STXVD2X	VS33, (R3+R8)
   133		BC      16, 0, backward32loop   // bndz
   134		BC	4, 5, LR		// ble CR1 lr
   135		MOVD	R6, CTR
   136		BR	backwardlargeloop

View as plain text