...

Text file src/internal/bytealg/count_arm64.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	#include "go_asm.h"
     6	#include "textflag.h"
     7	
     8	TEXT ·Count(SB),NOSPLIT,$0-40	// no local frame; 40 bytes of arguments + result
     9		MOVD	$ret+32(FP), R8	// R8 = address of the result slot
    10		MOVBU	c+24(FP), R1	// R1 = byte to count, zero-extended
    11		MOVD	b_len+8(FP), R2	// R2 = length of b in bytes
    12		MOVD	b_base+0(FP), R0	// R0 = pointer to the first byte of b
    13		B	countbytebody<>(SB)	// tail-branch: the shared body stores the count via R8 and returns
    14	
    15	TEXT ·CountString(SB),NOSPLIT,$0-32	// no local frame; 32 bytes of arguments + result
    16		MOVD	$ret+24(FP), R8	// R8 = address of the result slot
    17		MOVBU	c+16(FP), R1	// R1 = byte to count, zero-extended
    18		MOVD	s_len+8(FP), R2	// R2 = length of s in bytes
    19		MOVD	s_base+0(FP), R0	// R0 = pointer to the first byte of s
    20		B	countbytebody<>(SB)	// tail-branch: the shared body stores the count via R8 and returns
    21	
    22	// countbytebody counts the bytes equal to R1 among the R2 bytes starting at R0 and stores the total at (R8). Input:
    23	//   R0: data pointer (advanced as the data is consumed)
    24	//   R2: data len in bytes (counted down; may be 0)
    25	//   R1: byte to find (both callers load it zero-extended with MOVBU)
    26	//   R8: address to put result; registers written here: R3, R5, R6, R9, R11, V0-V8, condition flags
    27	TEXT countbytebody<>(SB),NOSPLIT,$0
    28		// R11 = running count of bytes that matched R1
    29		MOVD	$0, R11
    30		// short path to handle 0-byte case: store 0 and return
    31		CBZ	R2, done
    32		CMP	$0x20, R2
    33		// jump directly to tail if length < 32 (unsigned compare); R2 is 1..31 on that path
    34		BLO	tail
    35		ANDS	$0x1f, R0, R9	// only the Z flag is used; R9 is recomputed at chunk
    36		BEQ	chunk	// R0 is already 32-byte aligned: no head bytes to process
    37		// Head: R0 is not 32-byte aligned; count byte-by-byte up to R3 = next 32-byte boundary above R0
    38		BIC	$0x1f, R0, R3
    39		ADD	$0x20, R3
    40	head_loop:
    41		MOVBU.P	1(R0), R5	// R5 = byte at R0, then R0 += 1
    42		CMP	R5, R1
    43		CINC	EQ, R11, R11	// R11++ only when the byte matched
    44		SUB	$1, R2, R2	// cannot reach 0 here: length was >= 32 and the head is at most 31 bytes
    45		CMP	R0, R3
    46		BNE	head_loop
    47		// Work with 32-byte aligned chunks; R9 = remaining length rounded down to a multiple of 32
    48	chunk:
    49		BIC	$0x1f, R2, R9
    50		// Fewer than 32 bytes remain (only possible after the head loop): the tail loop handles them
    51		CBZ	R9, tail
    52		// R3 = end of 32-byte chunks
    53		ADD	R0, R9, R3
    54		MOVD	$1, R5
    55		VMOV	R5, V5.B16	// V5 = 0x01 in every byte lane; used as the mask in the loop
    56		// R2 = length of tail (0..31 bytes left once the chunks are done)
    57		SUB	R9, R2, R2
    58		// Duplicate R1 (byte to search) to 16 1-byte elements of V0
    59		VMOV	R1, V0.B16
    60		// Clear the low 64-bit element of V7 and V8
    61		VEOR	V7.B8, V7.B8, V7.B8
    62		VEOR	V8.B8, V8.B8, V8.B8
    63		// Count the target byte in each 32-byte chunk until R0 reaches R3
    64	chunk_loop:
    65		VLD1.P	(R0), [V1.B16, V2.B16]	// load the next 32 bytes into V1,V2; .P post-increments R0
    66		CMP	R0, R3	// result is consumed by the BNE at the bottom of the loop
    67		VCMEQ	V0.B16, V1.B16, V3.B16	// lane-wise compare of the first 16 bytes with the target byte
    68		VCMEQ	V0.B16, V2.B16, V4.B16	// same for the second 16 bytes
    69		// Clear the higher 7 bits of every lane so each lane holds 0 or 1
    70		VAND	V5.B16, V3.B16, V3.B16
    71		VAND	V5.B16, V4.B16, V4.B16
    72		// Count the lanes that match the requested byte
    73		VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B: pairwise add, each result byte is 0..2
    74		VUADDLV	V6.B16, V7	// V7 = sum over all 16 bytes of V6 (at most 32 per chunk)
    75		// Accumulate the count in low 64-bit element of V8 when inside the loop
    76		VADD	V7, V8
    77		BNE	chunk_loop	// loop until R0 == R3 (flags from the CMP above)
    78		VMOV	V8.D[0], R6	// move the vector total to a general register
    79		ADD	R6, R11, R11	// fold it into the running count
    80		CBZ	R2, done	// no tail bytes left: finished
    81	tail:
    82		// Work with tail shorter than 32 bytes; R2 >= 1 on every entry, so the test comes after the first byte
    83		MOVBU.P	1(R0), R5	// R5 = byte at R0, then R0 += 1
    84		SUB	$1, R2, R2
    85		CMP	R5, R1
    86		CINC	EQ, R11, R11	// R11++ only when the byte matched
    87		CBNZ	R2, tail
    88	done:
    89		MOVD	R11, (R8)	// store the count through the result address
    90		RET

View as plain text