Text file src/internal/bytealg/indexbyte_arm64.s
     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "textflag.h"
     6	
     7	TEXT ·IndexByte(SB),NOSPLIT,$0-40
     8		MOVD	b_base+0(FP), R0
     9		MOVD	b_len+8(FP), R2
    10		MOVBU	c+24(FP), R1
    11		MOVD	$ret+32(FP), R8
    12		B	indexbytebody<>(SB)
    13	
    14	TEXT ·IndexByteString(SB),NOSPLIT,$0-32
    15		MOVD	s_base+0(FP), R0
    16		MOVD	s_len+8(FP), R2
    17		MOVBU	c+16(FP), R1
    18		MOVD	$ret+24(FP), R8
    19		B	indexbytebody<>(SB)
    20	
    21	// input:
    22	//   R0: data
    23	//   R1: byte to search
    24	//   R2: data len
    25	//   R8: address to put result
    26	TEXT indexbytebody<>(SB),NOSPLIT,$0
    27		// Core algorithm:
    28		// For each 32-byte chunk we calculate a 64-bit syndrome value,
    29		// with two bits per byte. For each tuple, bit 0 is set if the
    30		// relevant byte matched the requested character and bit 1 is
    31		// not used (faster than using a 32bit syndrome). Since the bits
    32		// in the syndrome reflect exactly the order in which things occur
    33		// in the original string, counting trailing zeros allows to
    34		// identify exactly which byte has matched.
    35	
    36		CBZ	R2, fail
    37		MOVD	R0, R11
    38		// Magic constant 0x40100401 allows us to identify
    39		// which lane matches the requested byte.
    40		// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
    41		// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
    42		MOVD	$0x40100401, R5
    43		VMOV	R1, V0.B16
    44		// Work with aligned 32-byte chunks
    45		BIC	$0x1f, R0, R3
    46		VMOV	R5, V5.S4
    47		ANDS	$0x1f, R0, R9
    48		AND	$0x1f, R2, R10
    49		BEQ	loop
    50	
    51		// Input string is not 32-byte aligned. We calculate the
    52		// syndrome value for the aligned 32 bytes block containing
    53		// the first bytes and mask off the irrelevant part.
    54		VLD1.P	(R3), [V1.B16, V2.B16]
    55		SUB	$0x20, R9, R4
    56		ADDS	R4, R2, R2
    57		VCMEQ	V0.B16, V1.B16, V3.B16
    58		VCMEQ	V0.B16, V2.B16, V4.B16
    59		VAND	V5.B16, V3.B16, V3.B16
    60		VAND	V5.B16, V4.B16, V4.B16
    61		VADDP	V4.B16, V3.B16, V6.B16 // 256->128
    62		VADDP	V6.B16, V6.B16, V6.B16 // 128->64
    63		VMOV	V6.D[0], R6
    64		// Clear the irrelevant lower bits
    65		LSL	$1, R9, R4
    66		LSR	R4, R6, R6
    67		LSL	R4, R6, R6
    68		// The first block can also be the last
    69		BLS	masklast
    70		// Have we found something already?
    71		CBNZ	R6, tail
    72	
    73	loop:
    74		VLD1.P	(R3), [V1.B16, V2.B16]
    75		SUBS	$0x20, R2, R2
    76		VCMEQ	V0.B16, V1.B16, V3.B16
    77		VCMEQ	V0.B16, V2.B16, V4.B16
    78		// If we're out of data we finish regardless of the result
    79		BLS	end
    80		// Use a fast check for the termination condition
    81		VORR	V4.B16, V3.B16, V6.B16
    82		VADDP	V6.D2, V6.D2, V6.D2
    83		VMOV	V6.D[0], R6
    84		// We're not out of data, loop if we haven't found the character
    85		CBZ	R6, loop
    86	
    87	end:
    88		// Termination condition found, let's calculate the syndrome value
    89		VAND	V5.B16, V3.B16, V3.B16
    90		VAND	V5.B16, V4.B16, V4.B16
    91		VADDP	V4.B16, V3.B16, V6.B16
    92		VADDP	V6.B16, V6.B16, V6.B16
    93		VMOV	V6.D[0], R6
    94		// Only do the clear for the last possible block with less than 32 bytes
    95		// Condition flags come from SUBS in the loop
    96		BHS	tail
    97	
    98	masklast:
    99		// Clear the irrelevant upper bits
   100		ADD	R9, R10, R4
   101		AND	$0x1f, R4, R4
   102		SUB	$0x20, R4, R4
   103		NEG	R4<<1, R4
   104		LSL	R4, R6, R6
   105		LSR	R4, R6, R6
   106	
   107	tail:
   108		// Check that we have found a character
   109		CBZ	R6, fail
   110		// Count the trailing zeros using bit reversing
   111		RBIT	R6, R6
   112		// Compensate the last post-increment
   113		SUB	$0x20, R3, R3
   114		// And count the leading zeros
   115		CLZ	R6, R6
   116		// R6 is twice the offset into the fragment
   117		ADD	R6>>1, R3, R0
   118		// Compute the offset result
   119		SUB	R11, R0, R0
   120		MOVD	R0, (R8)
   121		RET
   122	
   123	fail:
   124		MOVD	$-1, R0
   125		MOVD	R0, (R8)
   126		RET
View as plain text