...

Text file src/runtime/vlop_arm.s

     1	// Inferno's libkern/vlop-arm.s
     2	// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/vlop-arm.s
     3	//
     4	//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
     5	//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6	//         Portions Copyright 2009 The Go Authors. All rights reserved.
     7	//
     8	// Permission is hereby granted, free of charge, to any person obtaining a copy
     9	// of this software and associated documentation files (the "Software"), to deal
    10	// in the Software without restriction, including without limitation the rights
    11	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12	// copies of the Software, and to permit persons to whom the Software is
    13	// furnished to do so, subject to the following conditions:
    14	//
    15	// The above copyright notice and this permission notice shall be included in
    16	// all copies or substantial portions of the Software.
    17	//
    18	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24	// THE SOFTWARE.
    25	
    26	#include "go_asm.h"
    27	#include "go_tls.h"
    28	#include "funcdata.h"
    29	#include "textflag.h"
    30	
    31	// func runtime·udiv(n, d uint32) (q, r uint32)
    32	// the compiler knows the register usage of this function
    33	// Reference:
    34	// Sloss, Andrew et al.; ARM System Developer's Guide: Designing and Optimizing System Software
    35	// Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740
    36	#define Rq	R0 // input d, output q
    37	#define Rr	R1 // input n, output r
    38	#define Rs	R2 // three temporary variables
    39	#define RM	R3 // temporary; the software path keeps -d here
    40	#define Ra	R11 // temporary; R11 is also used by the linker for synthesized instructions
    41	
    42	// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
    43	// Note: this function does not have a frame. If it ever needs a frame,
    44	// the RET instruction will clobber R12 on nacl, and the compiler's register
    45	// allocator needs to know.
    46	TEXT runtime·udiv(SB),NOSPLIT|NOFRAME,$0
    47		MOVBU	internal∕cpu·ARM+const_offsetARMHasIDIVA(SB), Ra // byte at offset const_offsetARMHasIDIVA of internal/cpu's ARM variable (presumably the hardware-divide feature flag)
    48		CMP	$0, Ra
    49		BNE	udiv_hardware // nonzero: take the hardware-divider path
    50	
    51		CLZ 	Rq, Rs // find normalizing shift
    52		MOVW.S	Rq<<Rs, Ra // Ra = d << shift; .S updates the flags (Z when the result is 0)
    53		MOVW	$fast_udiv_tab<>-64(SB), RM // table base biased by -64: for nonzero d the index below is in 64..127
    54		ADD.NE	Ra>>25, RM, Ra // index by most significant 7 bits of divisor
    55		MOVBU.NE	(Ra), Ra // Ra = table byte (skipped when Z is set)
    56	
    57		SUB.S	$7, Rs // Rs = shift - 7; N is set when shift < 7
    58		RSB 	$0, Rq, RM // M = -d (Rq still holds the divisor here)
    59		MOVW.PL	Ra<<Rs, Rq // if Rs >= 0: Rq = table value << Rs, the initial reciprocal estimate (called q below)
    60	
    61		// 1st Newton iteration
    62		MUL.PL	RM, Rq, Ra // a = -q*d
    63		BMI 	udiv_by_large_d // flags are still those of SUB.S: Rs < 0
    64		MULAWT	Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32)
    65		TEQ 	RM->1, RM // check for d=0 or d=1
    66	
    67		// 2nd Newton iteration
    68		MUL.NE	RM, Rq, Ra
    69		MOVW.NE	$0, Rs
    70		MULAL.NE Rq, Ra, (Rq,Rs)
    71		BEQ 	udiv_by_0_or_1 // the TEQ above found d == 0 or d == 1
    72	
    73		// q now accurate enough for a remainder r, 0<=r<3*d
    74		MULLU	Rq, Rr, (Rq,Rs) // q = (r * q) >> 32
    75		ADD 	RM, Rr, Rr // r = n - d
    76		MULA	RM, Rq, Rr, Rr // r = n - (q+1)*d
    77	
    78		// since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
    79		CMN 	RM, Rr // t = r-d
    80		SUB.CS	RM, Rr, Rr // if (t<-d || t>=0) r=r+d
    81		ADD.CC	$1, Rq
    82		ADD.PL	RM<<1, Rr
    83		ADD.PL	$2, Rq
    84		RET
    85	
    86	// use hardware divider
    87	udiv_hardware:
    88		DIVUHW	Rq, Rr, Rs // Rs = n / d
    89		MUL	Rs, Rq, RM // RM = q * d
    90		RSB	Rr, RM, Rr // r = n - q*d
    91		MOVW	Rs, Rq // return the quotient in Rq
    92		RET
    93	
    94	udiv_by_large_d:
    95		// at this point we know d>=2^(31-6)=2^25
    96		SUB 	$4, Ra, Ra // Ra = table value - 4 (the MUL.PL above did not execute on this path)
    97		RSB 	$0, Rs, Rs // Rs = -Rs; Rs was negative, so this is a positive right-shift count
    98		MOVW	Ra>>Rs, Rq
    99		MULLU	Rq, Rr, (Rq,Rs)
   100		MULA	RM, Rq, Rr, Rr // r = n - q*d (RM holds -d)
   101	
   102		// q now accurate enough for a remainder r, 0<=r<4*d
   103		CMN 	Rr>>1, RM // if(r/2 >= d)
   104		ADD.CS	RM<<1, Rr
   105		ADD.CS	$2, Rq
   106		CMN 	Rr, RM
   107		ADD.CS	RM, Rr
   108		ADD.CS	$1, Rq
   109		RET
   110	
   111	udiv_by_0_or_1:
   112		// carry set if d==1, carry clear if d==0
   113		BCC udiv_by_0
   114		MOVW	Rr, Rq // d == 1: q = n
   115		MOVW	$0, Rr // d == 1: r = 0
   116		RET
   117	
   118	udiv_by_0:
   119		MOVW	$runtime·panicdivide(SB), R11
   120		B	(R11) // jump (not call) to runtime·panicdivide
   121	
   122	// var tab [64]byte
   123	// tab[0] = 255; for i := 1; i <= 63; i++ { tab[i] = (1<<14)/(64+i) }
   124	// laid out here as little-endian uint32s
   125	DATA fast_udiv_tab<>+0x00(SB)/4, $0xf4f8fcff // tab[0..3], tab[0] in the low byte
   126	DATA fast_udiv_tab<>+0x04(SB)/4, $0xe6eaedf0 // tab[4..7]
   127	DATA fast_udiv_tab<>+0x08(SB)/4, $0xdadde0e3 // tab[8..11]
   128	DATA fast_udiv_tab<>+0x0c(SB)/4, $0xcfd2d4d7 // tab[12..15]
   129	DATA fast_udiv_tab<>+0x10(SB)/4, $0xc5c7cacc // tab[16..19]
   130	DATA fast_udiv_tab<>+0x14(SB)/4, $0xbcbec0c3 // tab[20..23]
   131	DATA fast_udiv_tab<>+0x18(SB)/4, $0xb4b6b8ba // tab[24..27]
   132	DATA fast_udiv_tab<>+0x1c(SB)/4, $0xacaeb0b2 // tab[28..31]
   133	DATA fast_udiv_tab<>+0x20(SB)/4, $0xa5a7a8aa // tab[32..35]
   134	DATA fast_udiv_tab<>+0x24(SB)/4, $0x9fa0a2a3 // tab[36..39]
   135	DATA fast_udiv_tab<>+0x28(SB)/4, $0x999a9c9d // tab[40..43]
   136	DATA fast_udiv_tab<>+0x2c(SB)/4, $0x93949697 // tab[44..47]
   137	DATA fast_udiv_tab<>+0x30(SB)/4, $0x8e8f9092 // tab[48..51]
   138	DATA fast_udiv_tab<>+0x34(SB)/4, $0x898a8c8d // tab[52..55]
   139	DATA fast_udiv_tab<>+0x38(SB)/4, $0x85868788 // tab[56..59]
   140	DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384 // tab[60..63]
   141	GLOBL fast_udiv_tab<>(SB), RODATA, $64 // 64 bytes of read-only data
   142	
   143	// The linker passes the numerator in R8; the helpers below read the denominator from m.divmod
   144	#define Rn R8
   145	// The linker expects the result in RTMP (R11, the same register as Ra above)
   146	#define RTMP R11
   147	
   148	TEXT runtime·_divu(SB), NOSPLIT, $16-0 // unsigned divide: RTMP = Rn / m.divmod; Rq, Rr, Rs and RM are preserved
   149		// It's not strictly true that there are no local pointers.
   150		// It could be that the saved registers Rq, Rr, Rs, and RM
   151		// contain pointers. However, the only way this can matter
   152		// is if the stack grows (which it can't, udiv is nosplit)
   153		// or if a fault happens and more frames are added to
   154		// the stack due to deferred functions.
   155		// In the latter case, the stack can grow arbitrarily,
   156		// and garbage collection can happen, and those
   157		// operations care about pointers, but in that case
   158		// the calling frame is dead, and so are the saved
   159		// registers. So we can claim there are no pointers here.
   160		NO_LOCAL_POINTERS
   161		MOVW	Rq, 4(R13) // save the four registers that are restored before RET
   162		MOVW	Rr, 8(R13)
   163		MOVW	Rs, 12(R13)
   164		MOVW	RM, 16(R13)
   165	
   166		MOVW	Rn, Rr			/* numerator */
   167		MOVW	g_m(g), Rq
   168		MOVW	m_divmod(Rq), Rq	/* denominator */
   169		BL  	runtime·udiv(SB)
   170		MOVW	Rq, RTMP // result = quotient
   171		MOVW	4(R13), Rq // restore the caller's registers
   172		MOVW	8(R13), Rr
   173		MOVW	12(R13), Rs
   174		MOVW	16(R13), RM
   175		RET
   176	
   177	TEXT runtime·_modu(SB), NOSPLIT, $16-0 // unsigned remainder: RTMP = Rn % m.divmod; Rq, Rr, Rs and RM are preserved
   178		NO_LOCAL_POINTERS
   179		MOVW	Rq, 4(R13) // save the four registers that are restored before RET
   180		MOVW	Rr, 8(R13)
   181		MOVW	Rs, 12(R13)
   182		MOVW	RM, 16(R13)
   183	
   184		MOVW	Rn, Rr			/* numerator */
   185		MOVW	g_m(g), Rq
   186		MOVW	m_divmod(Rq), Rq	/* denominator */
   187		BL  	runtime·udiv(SB)
   188		MOVW	Rr, RTMP // result = remainder
   189		MOVW	4(R13), Rq // restore the caller's registers
   190		MOVW	8(R13), Rr
   191		MOVW	12(R13), Rs
   192		MOVW	16(R13), RM
   193		RET
   194	
   195	TEXT runtime·_div(SB),NOSPLIT,$16-0 // signed divide: RTMP = Rn / m.divmod; Rq, Rr, Rs and RM are preserved
   196		NO_LOCAL_POINTERS
   197		MOVW	Rq, 4(R13) // save the four registers that are restored before RET
   198		MOVW	Rr, 8(R13)
   199		MOVW	Rs, 12(R13)
   200		MOVW	RM, 16(R13)
   201		MOVW	Rn, Rr			/* numerator */
   202		MOVW	g_m(g), Rq
   203		MOVW	m_divmod(Rq), Rq	/* denominator */
   204		CMP 	$0, Rr
   205		BGE 	d1 // numerator >= 0
   206		RSB 	$0, Rr, Rr // numerator < 0: negate it
   207		CMP 	$0, Rq
   208		BGE 	d2 // only the numerator was negative
   209		RSB 	$0, Rq, Rq // both negative: negate the denominator too
   210	d0:
   211		BL  	runtime·udiv(SB)  	/* none/both neg */
   212		MOVW	Rq, RTMP // signs agree: result is the unsigned quotient
   213		B	out1
   214	d1:
   215		CMP 	$0, Rq
   216		BGE 	d0 // neither operand negative
   217		RSB 	$0, Rq, Rq // only the denominator was negative: negate it
   218	d2:
   219		BL  	runtime·udiv(SB)  	/* one neg */
   220		RSB	$0, Rq, RTMP // signs differ: result is the negated quotient
   221	out1:
   222		MOVW	4(R13), Rq // restore the caller's registers
   223		MOVW	8(R13), Rr
   224		MOVW	12(R13), Rs
   225		MOVW	16(R13), RM
   226		RET
   227	
   228	TEXT runtime·_mod(SB),NOSPLIT,$16-0 // signed remainder: RTMP = Rn % m.divmod; Rq, Rr, Rs and RM are preserved
   229		NO_LOCAL_POINTERS
   230		MOVW	Rq, 4(R13) // save the four registers that are restored before RET
   231		MOVW	Rr, 8(R13)
   232		MOVW	Rs, 12(R13)
   233		MOVW	RM, 16(R13)
   234		MOVW	Rn, Rr			/* numerator */
   235		MOVW	g_m(g), Rq
   236		MOVW	m_divmod(Rq), Rq	/* denominator */
   237		CMP 	$0, Rq
   238		RSB.LT	$0, Rq, Rq // negate a negative denominator; its sign does not affect the result below
   239		CMP 	$0, Rr
   240		BGE 	m1
   241		RSB 	$0, Rr, Rr // negative numerator: divide its negation
   242		BL  	runtime·udiv(SB)  	/* neg numerator */
   243		RSB 	$0, Rr, RTMP // result = negated remainder (takes the numerator's sign)
   244		B   	out
   245	m1:
   246		BL  	runtime·udiv(SB)  	/* pos numerator */
   247		MOVW	Rr, RTMP // result = remainder
   248	out:
   249		MOVW	4(R13), Rq // restore the caller's registers
   250		MOVW	8(R13), Rr
   251		MOVW	12(R13), Rs
   252		MOVW	16(R13), RM
   253		RET
   254	
   255	// _mul64by32 and _div64by32 not implemented on arm
   256	TEXT runtime·_mul64by32(SB), NOSPLIT, $0 // stub: not implemented on arm, deliberately faults
   257		MOVW	$0, R0 // R0 = address 0
   258		MOVW	(R0), R1 // crash: load from address 0
   259	
   260	TEXT runtime·_div64by32(SB), NOSPLIT, $0 // stub: not implemented on arm, deliberately faults
   261		MOVW	$0, R0 // R0 = address 0
   262		MOVW	(R0), R1 // crash: load from address 0

View as plain text