...

Text file src/internal/bytealg/indexbyte_ppc64x.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ppc64 ppc64le

#include "go_asm.h"
#include "textflag.h"

TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	b_base+0(FP), R3	// R3 = byte array pointer
	MOVD	b_len+8(FP), R4		// R4 = length
	MOVBZ	c+24(FP), R5		// R5 = byte
	MOVD	$ret+32(FP), R14	// R14 = &ret
	BR	indexbytebody<>(SB)

TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
	MOVD	s_base+0(FP), R3  // R3 = string
	MOVD	s_len+8(FP), R4	  // R4 = length
	MOVBZ	c+16(FP), R5	  // R5 = byte
	MOVD	$ret+24(FP), R14  // R14 = &ret
	BR	indexbytebody<>(SB)

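// Both wrappers feed the same body: a slice header is pointer, length,
// capacity (so c sits at +24 and ret at +32), while a string header is
// just pointer and length (c at +16, ret at +24). What they compute is,
// at the Go level, simply the following (a reference sketch of the
// semantics, not the actual implementation):
//
//	func IndexByte(b []byte, c byte) int {
//		for i, x := range b {
//			if x == c {
//				return i
//			}
//		}
//		return -1 // mirrors the notfound path below
//	}
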
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3,R17		// Save base address for calculating the index later.
	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
	RLDIMI	$8,R5,$48,R5	// Replicate the byte across the register.
	ADD	R4,R3,R7	// Last acceptable address in R7.
	DCBT	(R8)		// Prepare cache line.

	RLDIMI	$16,R5,$32,R5
	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
	MOVD	$-1,R9
	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
	RLDIMI	$32,R5,$0,R5
	MOVD	R7,R10		// Save last acceptable address in R10 for later.
	ADD	$-1,R7,R7
#ifdef GOARCH_ppc64le
	SLD	R6,R9,R9	// Prepare mask for Little Endian
#else
	SRD	R6,R9,R9	// Same for Big Endian
#endif
	BLE	small_string	// Jump to the small string case if it's ≤32 bytes.

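// The three RLDIMIs build the same value as multiplying the byte by
// 0x0101010101010101: bits 48-55, then 32-47, then 0-31 are filled with
// copies of c. The shifted all-ones mask discards the bytes that sit
// below b_base in the first, backward-aligned doubleword. A rough Go
// equivalent of this setup (variable names are illustrative only):
//
//	const ones = 0x0101010101010101
//	pattern := uint64(c) * ones  // what R5 holds after the RLDIMIs
//	pad := (base & 7) * 8        // bit offset of b_base in its doubleword (R6)
//	mask := ^uint64(0) << pad    // little-endian; use >> pad on big-endian
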
	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
	// in V0, V1 and V10, then branch to the preloop.
	ANDCC	$63,R3,R11
	BEQ	CR0,qw_align
	RLDICL	$0,R3,$61,R11	// R11 = b_base & 7 (byte offset within the first doubleword).

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base
	RLDICL	$0,R7,$61,R6	// Byte offset of the last valid byte within its doubleword.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4
	ADD	R4,R11,R4	// Account for the bytes consumed before b_base.

	// Check for quadword alignment
	ANDCC	$15,R8,R11
	BEQ	CR0,qw_align

	// Not aligned, so handle the next doubleword
	MOVD	0(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR7
	BNE	CR7,done
	ADD	$8,R8,R8
	ADD	$-8,R4,R4

	// Either quadword or 64-byte aligned at this point. We can use LVX.
qw_align:

	// Set up auxiliary data for the vectorized algorithm.
	VSPLTISB  $0,V0		// Replicate 0 across V0
	VSPLTISB  $3,V10	// Use V10 as control for VBPERMQ
	MTVRD	  R5,V1
	LVSL	  (R0+R0),V11
	VSLB	  V11,V10,V10
	VSPLTB	  $7,V1,V1	// Replicate byte across V1
	CMPU	  R4, $64	// If len ≤ 64, don't use the vectorized loop
	BLE	  tail

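// LVSL with a zero index yields the vector {0,1,...,15}; shifting each
// element left by 3 turns it into {0,8,...,120}, the bit index of the
// first bit of every byte. Used as the VBPERMQ control, that gathers one
// bit per byte of a compare result into a 16-bit mask, i.e. a "movemask"
// operation. Its effect, sketched in Go (an illustration of the result,
// not of the instruction's exact bit numbering):
//
//	func movemask(v [16]byte) (m uint16) {
//		for i, b := range v {
//			m |= uint16(b>>7) << i // one bit per 0x00/0xFF byte
//		}
//		return m
//	}
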
	// We will load 4 quadwords per iteration in the loop, so check for
	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
	ANDCC	  $63,R8,R11
	BEQ	  CR0,preloop

	// Not 64-byte aligned. Load one quadword at a time until aligned.
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	ADD	    $-16,R4,R4

	ANDCC	    $63,R8,R11
	BEQ	    CR0,preloop
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	ADD	    $-16,R4,R4

	ANDCC	    $63,R8,R11
	BEQ	    CR0,preloop
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6		// Check for byte in V4
	BNE	    CR6,found_qw_align
	ADD	    $-16,R4,R4
	ADD	    $16,R8,R8

	// 64-byte aligned. Prepare for the main loop.
preloop:
	CMPU	R4,$64
	BLE	tail	      // If len ≤ 64, don't use the vectorized loop

	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
	// per loop iteration. The last acceptable address is in R10, so our
	// loop counter starts at (R10-R8)/64.
	SUB	R8,R10,R6
	SRD	$6,R6,R9      // Loop counter in R9
	MOVD	R9,CTR

	ADD	$-64,R8,R8   // Adjust index for loop entry
	MOVD	$16,R11      // Load offsets for the vector loads
	MOVD	$32,R9
	MOVD	$48,R7

	// Main loop: we will load 64 bytes per iteration.
loop:
	ADD	    $64,R8,R8	      // Fuse addi+lvx for performance
	LVX	    (R8+R0),V2	      // Load 4 16-byte vectors
	LVX	    (R8+R11),V3
	VCMPEQUB    V1,V2,V6	      // Look for byte in each vector
	VCMPEQUB    V1,V3,V7

	LVX	    (R8+R9),V4
	LVX	    (R8+R7),V5
	VCMPEQUB    V1,V4,V8
	VCMPEQUB    V1,V5,V9

	VOR	    V6,V7,V11	      // Compress the result in a single vector
	VOR	    V8,V9,V12
	VOR	    V11,V12,V13
	VCMPEQUBCC  V0,V13,V14	      // Check for byte
	BGE	    CR6,found
	BC	    16,0,loop	      // bdnz loop

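// The shape of the loop above, sketched in Go (load16 and anyEqual are
// hypothetical helpers standing in for LVX and VCMPEQUB):
//
//	for n := (last - cur) / 64; n > 0; n-- { // the count loaded into CTR
//		v0, v1, v2, v3 := load16(cur), load16(cur+16), load16(cur+32), load16(cur+48)
//		if anyEqual(v0) || anyEqual(v1) || anyEqual(v2) || anyEqual(v3) {
//			break // the VORs fold this test into one VCMPEQUBCC per 64 bytes
//		}
//		cur += 64
//	}
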
	// Handle the trailing bytes, or R4 ≤ 64.
	RLDICL	$0,R6,$58,R4	// R4 = leftover tail bytes (R6 mod 64).
	ADD	$64,R8,R8
tail:
	CMPU	    R4,$0
	BEQ	    notfound
	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align
	ADD	    $16,R8,R8
	CMPU	    R4,$16,CR6
	BLE	    CR6,notfound
	ADD	    $-16,R4,R4

	LVX	    (R8+R0),V4
	VCMPEQUBCC  V1,V4,V6
	BNE	    CR6,found_qw_align

notfound:
	MOVD	$-1,R3
	MOVD	R3,(R14)
	RET

found:
	// We will now compress the results into a single doubleword,
	// so it can be moved to a GPR for the final index calculation.

	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
	// first bit of each byte into bits 48-63.
	VBPERMQ	  V6,V10,V6
	VBPERMQ	  V7,V10,V7
	VBPERMQ	  V8,V10,V8
	VBPERMQ	  V9,V10,V9

	// Shift each 16-bit component into its correct position for
	// merging into a single doubleword.
#ifdef GOARCH_ppc64le
	VSLDOI	  $2,V7,V7,V7
	VSLDOI	  $4,V8,V8,V8
	VSLDOI	  $6,V9,V9,V9
#else
	VSLDOI	  $6,V6,V6,V6
	VSLDOI	  $4,V7,V7,V7
	VSLDOI	  $2,V8,V8,V8
#endif

	// Merge V6-V9 into a single doubleword and move to a GPR.
	VOR	V6,V7,V11
	VOR	V8,V9,V4
	VOR	V4,V11,V4
	MFVRD	V4,R3

#ifdef GOARCH_ppc64le
	ADD	  $-1,R3,R11
	ANDN	  R3,R11,R11
	POPCNTD	  R11,R11	// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	ADD	R8,R11,R3	// Calculate byte address

return:
	SUB	R17,R3
	MOVD	R3,(R14)
	RET

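// POPCNTD of (x-1) &^ x counts the trailing zeros of x, which is why the
// ADD/ANDN pair precedes it: the targeted ISA level has no dedicated
// count-trailing-zeros instruction (CNTTZD only arrived in ISA 3.0).
// After the VSLDOI shifts, the four 16-bit masks occupy disjoint fields,
// so the merged doubleword carries one bit per byte of the 64-byte block.
// The index computation, little-endian case, sketched in Go (m0-m3 stand
// for the four VBPERMQ results):
//
//	mask := uint64(m0) | uint64(m1)<<16 | uint64(m2)<<32 | uint64(m3)<<48
//	ret := cur + bits.TrailingZeros64(mask) - base // the SUB R17,R3 at return
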
found_qw_align:
	// Use the same algorithm as above. Compress the result into
	// a single doubleword and move it to a GPR for the final
	// calculation.
	VBPERMQ	  V6,V10,V6

#ifdef GOARCH_ppc64le
	MFVRD	  V6,R3
	ADD	  $-1,R3,R11
	ANDN	  R3,R11,R11
	POPCNTD	  R11,R11
#else
	VSLDOI	  $6,V6,V6,V6
	MFVRD	  V6,R3
	CNTLZD	  R3,R11
#endif
	ADD	  R8,R11,R3
	CMPU	  R11,R4
	BLT	  return
	BR	  notfound

done:
	// At this point, R3 has 0xFF in the same position as the byte we are
	// looking for in the doubleword. Use that to calculate the exact index
	// of the byte.
#ifdef GOARCH_ppc64le
	ADD	$-1,R3,R11
	ANDN	R3,R11,R11
	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
#else
	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
#endif
	CMPU	R8,R7		// Check if we are at the last doubleword.
	SRD	$3,R11		// Convert the zero count to a byte offset.
	ADD	R11,R8,R3
	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
	BNE	return
	BLE	CR7,return
	BR	notfound

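// Sketch of the index computation at done, little-endian case, where the
// CMPB result has 0xFF in each matching byte (names are illustrative):
//
//	byteOff := bits.TrailingZeros64(cmpbResult) >> 3 // the SRD $3 above
//	if atLastDoubleword && byteOff > lastValidOff {  // the CMPU R11,R6,CR7 check
//		return -1                                // "match" is past the length
//	}
//	return cur + byteOff - base
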
small_string:
	// We unroll this loop for better performance.
	CMPU	R4,$0		// Check for length=0
	BEQ	notfound

	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
	CMPB	R12,R5,R3	// Check for a match.
	AND	R9,R3,R3	// Mask bytes below s_base.
	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
	RLDICL	$0,R7,$61,R6	// Byte offset of the last valid byte within its doubleword.
	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
	CMPU	R8,R7
	BNE	CR7,done
	BEQ	notfound	// Hit length.

	MOVDU	8(R8),R12	// Load the next doubleword, updating R8.
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	CMPU	R8,R7
	BNE	CR6,done
	BEQ	notfound

	MOVDU	8(R8),R12
	CMPB	R12,R5,R3
	CMPU	R3,$0,CR6
	BNE	CR6,done
	BR	notfound

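// Go-level sketch of the small-string strategy above: scan whole aligned
// doublewords with CMPB, masking the bytes that precede b_base in the
// first one, and let the done block reject a "match" that falls past the
// last valid byte (cmpb, headMask and firstMatch are illustrative
// helpers):
//
//	if m := cmpb(load(p), pattern) & headMask; m != 0 {
//		return firstMatch(p, m) // may point past the end: done checks R6
//	}
//	for p != lastDW {
//		p += 8 // MOVDU folds this bump into the load
//		if m := cmpb(load(p), pattern); m != 0 {
//			return firstMatch(p, m)
//		}
//	}
//	return -1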
