Text file src/runtime/vlop_arm.s
1 // Inferno's libkern/vlop-arm.s
2 // https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/vlop-arm.s
3 //
4 // Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
5 // Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
6 // Portions Copyright 2009 The Go Authors. All rights reserved.
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a copy
9 // of this software and associated documentation files (the "Software"), to deal
10 // in the Software without restriction, including without limitation the rights
11 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 // copies of the Software, and to permit persons to whom the Software is
13 // furnished to do so, subject to the following conditions:
14 //
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
17 //
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 // THE SOFTWARE.
25
26 #include "go_asm.h"
27 #include "go_tls.h"
28 #include "funcdata.h"
29 #include "textflag.h"
30
// func runtime·udiv(n, d uint32) (q, r uint32)
// The compiler knows the register usage of this function.
//
// Arguments and results are passed in registers:
//	in:  Rr (R1) = n (numerator), Rq (R0) = d (divisor)
//	out: Rq (R0) = q (quotient),  Rr (R1) = r (remainder)
// Rs (R2), RM (R3) and Ra (R11) are overwritten as temporaries.
// A zero divisor ends in a jump to runtime·panicdivide.
//
// Reference:
// Sloss, Andrew et al.; ARM System Developer's Guide: Designing and Optimizing System Software
// Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740
#define Rq	R0  // input d, output q
#define Rr	R1  // input n, output r
#define Rs	R2  // three temporary variables
#define RM	R3
#define Ra	R11

// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
// Note: this function does not have a frame. If it ever needs a frame,
// the RET instruction will clobber R12 on nacl, and the compiler's register
// allocator needs to know.
TEXT runtime·udiv(SB),NOSPLIT|NOFRAME,$0
	// Take the hardware path when the CPU feature byte says IDIVA is present.
	MOVBU	internal∕cpu·ARM+const_offsetARMHasIDIVA(SB), Ra
	CMP	$0, Ra
	BNE	udiv_hardware

	CLZ	Rq, Rs // find normalizing shift
	MOVW.S	Rq<<Rs, Ra // Ra = d normalized; Z is set when d == 0
	// The table address is biased by -64 because, for d != 0, the top
	// 7 bits of the normalized divisor lie in 64..127.
	MOVW	$fast_udiv_tab<>-64(SB), RM
	ADD.NE	Ra>>25, RM, Ra // index by most significant 7 bits of divisor
	MOVBU.NE	(Ra), Ra // Ra = table entry (initial reciprocal estimate)

	SUB.S	$7, Rs // Rs = clz(d) - 7; negative when d >= 2^25
	RSB	$0, Rq, RM // RM = -d (Rq still holds the divisor here)
	MOVW.PL	Ra<<Rs, Rq // Rq = scaled reciprocal estimate

	// 1st Newton iteration
	MUL.PL	RM, Rq, Ra // a = -q*d
	BMI	udiv_by_large_d
	MULAWT	Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32)
	TEQ	RM->1, RM // check for d=0 or d=1

	// 2nd Newton iteration
	// The .NE forms are skipped when d is 0 or 1; BEQ then handles those cases.
	MUL.NE	RM, Rq, Ra
	MOVW.NE	$0, Rs
	MULAL.NE	Rq, Ra, (Rq,Rs)
	BEQ	udiv_by_0_or_1

	// q now accurate enough for a remainder r, 0<=r<3*d
	MULLU	Rq, Rr, (Rq,Rs) // q = (n * q) >> 32 (Rr still holds n)
	ADD	RM, Rr, Rr // r = n - d
	MULA	RM, Rq, Rr, Rr // r = n - (q+1)*d

	// since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
	CMN	RM, Rr // t = r-d
	SUB.CS	RM, Rr, Rr // if (t<-d || t>=0) r=r+d
	ADD.CC	$1, Rq
	ADD.PL	RM<<1, Rr
	ADD.PL	$2, Rq
	RET

// use hardware divider
udiv_hardware:
	DIVUHW	Rq, Rr, Rs // Rs = n / d
	MUL	Rs, Rq, RM // RM = q * d
	RSB	Rr, RM, Rr // r = n - q*d
	MOVW	Rs, Rq // return the quotient in Rq
	RET

udiv_by_large_d:
	// at this point we know d>=2^(31-6)=2^25
	// Ra still holds the table entry because MUL.PL above was not executed.
	SUB	$4, Ra, Ra
	RSB	$0, Rs, Rs // Rs = 7 - clz(d), now positive
	MOVW	Ra>>Rs, Rq
	MULLU	Rq, Rr, (Rq,Rs)
	MULA	RM, Rq, Rr, Rr

	// q now accurate enough for a remainder r, 0<=r<4*d
	CMN	Rr>>1, RM // if(r/2 >= d)
	ADD.CS	RM<<1, Rr
	ADD.CS	$2, Rq
	CMN	Rr, RM
	ADD.CS	RM, Rr
	ADD.CS	$1, Rq
	RET

udiv_by_0_or_1:
	// carry set if d==1, carry clear if d==0
	BCC	udiv_by_0
	MOVW	Rr, Rq // d == 1: q = n
	MOVW	$0, Rr // d == 1: r = 0
	RET

udiv_by_0:
	// Jump (not call) to the divide panic through R11.
	MOVW	$runtime·panicdivide(SB), R11
	B	(R11)
121
// fast_udiv_tab is the 64-byte read-only lookup table indexed by runtime·udiv:
//	var tab [64]byte
//	tab[0] = 255; for i := 1; i <= 63; i++ { tab[i] = (1<<14)/(64+i) }
// Four consecutive entries are packed into each little-endian uint32,
// so the lowest-numbered entry of a row is the low byte of its word.
DATA fast_udiv_tab<>+0x00(SB)/4, $0xf4f8fcff	// tab[0..3]
DATA fast_udiv_tab<>+0x04(SB)/4, $0xe6eaedf0	// tab[4..7]
DATA fast_udiv_tab<>+0x08(SB)/4, $0xdadde0e3	// tab[8..11]
DATA fast_udiv_tab<>+0x0c(SB)/4, $0xcfd2d4d7	// tab[12..15]
DATA fast_udiv_tab<>+0x10(SB)/4, $0xc5c7cacc	// tab[16..19]
DATA fast_udiv_tab<>+0x14(SB)/4, $0xbcbec0c3	// tab[20..23]
DATA fast_udiv_tab<>+0x18(SB)/4, $0xb4b6b8ba	// tab[24..27]
DATA fast_udiv_tab<>+0x1c(SB)/4, $0xacaeb0b2	// tab[28..31]
DATA fast_udiv_tab<>+0x20(SB)/4, $0xa5a7a8aa	// tab[32..35]
DATA fast_udiv_tab<>+0x24(SB)/4, $0x9fa0a2a3	// tab[36..39]
DATA fast_udiv_tab<>+0x28(SB)/4, $0x999a9c9d	// tab[40..43]
DATA fast_udiv_tab<>+0x2c(SB)/4, $0x93949697	// tab[44..47]
DATA fast_udiv_tab<>+0x30(SB)/4, $0x8e8f9092	// tab[48..51]
DATA fast_udiv_tab<>+0x34(SB)/4, $0x898a8c8d	// tab[52..55]
DATA fast_udiv_tab<>+0x38(SB)/4, $0x85868788	// tab[56..59]
DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384	// tab[60..63]
GLOBL fast_udiv_tab<>(SB), RODATA, $64
142
// The linker will pass numerator in R8
#define Rn R8
// The linker expects the result in RTMP
#define RTMP R11

// runtime·_divu: unsigned 32-bit division helper.
//	in:  Rn (R8) = numerator, g.m.divmod = denominator
//	out: RTMP (R11) = quotient
// Rq, Rr, Rs and RM are spilled to the 16-byte frame and reloaded before
// returning, so the caller sees them unchanged.
TEXT runtime·_divu(SB), NOSPLIT, $16-0
	// Declaring NO_LOCAL_POINTERS is not strictly accurate: the spilled
	// Rq, Rr, Rs and RM may hold pointers. That only matters if the
	// stack grows (which it can't, udiv is nosplit) or if a fault occurs
	// and deferred functions push more frames onto the stack.
	// In the fault case the stack can grow arbitrarily and garbage
	// collection can run, and both of those care about pointers; but by
	// then the calling frame is dead, and so are the saved registers.
	// Hence it is safe to claim there are no pointers here.
	NO_LOCAL_POINTERS
	MOVW	RM, 16(R13)
	MOVW	Rs, 12(R13)
	MOVW	Rr, 8(R13)
	MOVW	Rq, 4(R13)

	MOVW	g_m(g), Rq
	MOVW	m_divmod(Rq), Rq	// Rq = denominator
	MOVW	Rn, Rr			// Rr = numerator
	BL	runtime·udiv(SB)
	MOVW	Rq, RTMP		// hand the quotient back in RTMP

	MOVW	16(R13), RM
	MOVW	12(R13), Rs
	MOVW	8(R13), Rr
	MOVW	4(R13), Rq
	RET
176
// runtime·_modu: unsigned 32-bit remainder helper.
//	in:  Rn = numerator, g.m.divmod = denominator
//	out: RTMP = remainder
// Rq, Rr, Rs and RM are spilled to the 16-byte frame and reloaded before
// returning, so the caller sees them unchanged.
TEXT runtime·_modu(SB), NOSPLIT, $16-0
	NO_LOCAL_POINTERS
	MOVW	RM, 16(R13)
	MOVW	Rs, 12(R13)
	MOVW	Rr, 8(R13)
	MOVW	Rq, 4(R13)

	MOVW	g_m(g), Rq
	MOVW	m_divmod(Rq), Rq	// Rq = denominator
	MOVW	Rn, Rr			// Rr = numerator
	BL	runtime·udiv(SB)
	MOVW	Rr, RTMP		// hand the remainder back in RTMP

	MOVW	16(R13), RM
	MOVW	12(R13), Rs
	MOVW	8(R13), Rr
	MOVW	4(R13), Rq
	RET
194
// runtime·_div: signed 32-bit division helper.
//	in:  Rn = numerator, g.m.divmod = denominator
//	out: RTMP = quotient
// Negative operands are negated, the unsigned divide is performed by
// runtime·udiv, and the quotient is negated when exactly one operand was
// negative. Rq, Rr, Rs and RM are spilled to the 16-byte frame and reloaded
// before returning.
TEXT runtime·_div(SB),NOSPLIT,$16-0
	NO_LOCAL_POINTERS
	MOVW	RM, 16(R13)
	MOVW	Rs, 12(R13)
	MOVW	Rr, 8(R13)
	MOVW	Rq, 4(R13)

	MOVW	g_m(g), Rq
	MOVW	m_divmod(Rq), Rq	// Rq = denominator
	MOVW	Rn, Rr			// Rr = numerator

	CMP	$0, Rr
	BGE	div_num_nonneg
	RSB	$0, Rr, Rr		// numerator < 0: Rr = -Rr
	CMP	$0, Rq
	BGE	div_signs_differ
	RSB	$0, Rq, Rq		// denominator < 0 as well: Rq = -Rq
div_signs_match:
	BL	runtime·udiv(SB)	// neither or both operands negative
	MOVW	Rq, RTMP
	B	div_restore
div_num_nonneg:
	CMP	$0, Rq
	BGE	div_signs_match
	RSB	$0, Rq, Rq		// only the denominator is negative
div_signs_differ:
	BL	runtime·udiv(SB)	// exactly one operand negative
	RSB	$0, Rq, RTMP		// RTMP = -quotient
div_restore:
	MOVW	16(R13), RM
	MOVW	12(R13), Rs
	MOVW	8(R13), Rr
	MOVW	4(R13), Rq
	RET
227
// runtime·_mod: signed 32-bit remainder helper.
//	in:  Rn = numerator, g.m.divmod = denominator
//	out: RTMP = remainder
// A negative denominator is negated before the unsigned divide; the
// remainder is negated when the numerator was negative. Rq, Rr, Rs and RM
// are spilled to the 16-byte frame and reloaded before returning.
TEXT runtime·_mod(SB),NOSPLIT,$16-0
	NO_LOCAL_POINTERS
	MOVW	RM, 16(R13)
	MOVW	Rs, 12(R13)
	MOVW	Rr, 8(R13)
	MOVW	Rq, 4(R13)

	MOVW	g_m(g), Rq
	MOVW	m_divmod(Rq), Rq	// Rq = denominator
	MOVW	Rn, Rr			// Rr = numerator

	CMP	$0, Rq
	RSB.LT	$0, Rq, Rq		// denominator < 0: Rq = -Rq
	CMP	$0, Rr
	BGE	mod_num_nonneg
	RSB	$0, Rr, Rr		// numerator < 0: Rr = -Rr
	BL	runtime·udiv(SB)	// negative numerator
	RSB	$0, Rr, RTMP		// RTMP = -remainder
	B	mod_restore
mod_num_nonneg:
	BL	runtime·udiv(SB)	// non-negative numerator
	MOVW	Rr, RTMP
mod_restore:
	MOVW	16(R13), RM
	MOVW	12(R13), Rs
	MOVW	8(R13), Rr
	MOVW	4(R13), Rq
	RET
254
// _mul64by32 and _div64by32 not implemented on arm
//
// runtime·_mul64by32 is a stub: it zeroes R0 and then loads through it,
// so reaching it faults on address 0 instead of returning a result.
TEXT runtime·_mul64by32(SB), NOSPLIT, $0
	MOVW	$0, R0
	MOVW	(R0), R1 // crash: load from address 0
259
// runtime·_div64by32 is a stub: it zeroes R0 and then loads through it,
// so reaching it faults on address 0 instead of returning a result.
TEXT runtime·_div64by32(SB), NOSPLIT, $0
	MOVW	$0, R0
	MOVW	(R0), R1 // crash: load from address 0
View as plain text