diff options
author | Sean Bartell <wingedtachikoma@gmail.com> | 2011-06-25 21:32:25 -0400 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2012-04-25 22:13:20 +0200 |
commit | f40bfc9267b13b54e6379dfe7539447662879d24 (patch) | |
tree | 9b20069d5e62809ff434061ad730096836f916f2 /lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S | |
parent | a0009907de7a0107d49040d8a180f140e2eff299 (diff) | |
download | rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.tar.gz rockbox-f40bfc9267b13b54e6379dfe7539447662879d24.zip |
Add codecs to librbcodec.
Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97
Reviewed-on: http://gerrit.rockbox.org/137
Reviewed-by: Nils Wallménius <nils@rockbox.org>
Tested-by: Nils Wallménius <nils@rockbox.org>
Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S')
-rw-r--r-- | lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S new file mode 100644 index 0000000000..7b851659bd --- /dev/null +++ b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S | |||
@@ -0,0 +1,318 @@ | |||
1 | /*************************************************************************** | ||
2 | * __________ __ ___. | ||
3 | * Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | * \/ \/ \/ \/ \/ | ||
8 | * $Id$ | ||
9 | * | ||
10 | * Copyright (C) 2008 by Jens Arnold | ||
11 | * Copyright (C) 2009 by Andrew Mahone | ||
12 | * | ||
13 | * Optimised unsigned integer division for ARMv4 | ||
14 | * | ||
15 | * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System | ||
16 | * Developer's Guide | ||
17 | * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) | ||
18 | * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005 | ||
19 | * Free Software Foundation, Inc. | ||
20 | * | ||
21 | * This program is free software; you can redistribute it and/or | ||
22 | * modify it under the terms of the GNU General Public License | ||
23 | * as published by the Free Software Foundation; either version 2 | ||
24 | * of the License, or (at your option) any later version. | ||
25 | * | ||
26 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
27 | * KIND, either express or implied. | ||
28 | * | ||
29 | ****************************************************************************/ | ||
30 | |||
31 | #include "config.h" | ||
32 | /* On targets with codec iram, a header file will be generated after an initial | ||
33 | link of the APE codec, stating the amount of IRAM remaining for use by the | ||
34 | reciprocal lookup table. */ | ||
35 | #if !defined(APE_PRE) && defined(USE_IRAM) && ARM_ARCH < 5 | ||
36 | #include "lib/rbcodec/codecs/ape_free_iram.h" | ||
37 | #endif | ||
38 | |||
39 | /* Codecs should not normally do this, but we need to check a macro, and | ||
40 | * codecs.h would confuse the assembler. */ | ||
41 | |||
42 | #ifdef USE_IRAM | ||
43 | #define DIV_RECIP | ||
44 | .section .icode,"ax",%progbits | ||
45 | #else | ||
46 | .text | ||
47 | #endif | ||
48 | .align | ||
49 | .global udiv32_arm | ||
50 | .type udiv32_arm,%function | ||
51 | |||
52 | #if ARM_ARCH < 5 | ||
53 | /* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2) | ||
54 | for dividing a 30-bit value by a 15-bit value, with two operations per | ||
55 | iteration by storing quotient and remainder together and adding the previous | ||
56 | quotient bit during trial subtraction. Modified to work with any dividend | ||
57 | and divisor both less than 1 << 30, and skipping trials by calculating bits | ||
58 | in output. The divisor argument must already hold the negated divisor (udiv32_arm negates it with rsbs before expanding this macro), so every add of it acts as a subtraction of the original divisor. On exit the quotient and remainder arguments hold the results; the result, bits and curbit arguments are scratch registers. */ | ||
59 | .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder | ||
60 | |||
61 | mov \bits, #1 /* number of quotient bits to produce; at least one */ | ||
62 | /* Shift the divisor left until it aligns with the numerator. If it already | ||
63 | has the high bit set, this is fine, everything inside .rept will be | ||
64 | skipped, and the add before and adcs after will set the one-bit result | ||
65 | to zero. */ | ||
66 | cmn \divisor, \dividend, lsr #16 /* carry set when the divisor can be shifted left by 16 and still not exceed the dividend */ | ||
67 | movcs \divisor, \divisor, lsl #16 | ||
68 | addcs \bits, \bits, #16 | ||
69 | cmn \divisor, \dividend, lsr #8 /* same test for a further shift by 8 */ | ||
70 | movcs \divisor, \divisor, lsl #8 | ||
71 | addcs \bits, \bits, #8 | ||
72 | cmn \divisor, \dividend, lsr #4 /* ... by 4 */ | ||
73 | movcs \divisor, \divisor, lsl #4 | ||
74 | addcs \bits, \bits, #4 | ||
75 | cmn \divisor, \dividend, lsr #2 /* ... by 2 */ | ||
76 | movcs \divisor, \divisor, lsl #2 | ||
77 | addcs \bits, \bits, #2 | ||
78 | cmn \divisor, \dividend, lsr #1 /* ... by 1 */ | ||
79 | movcs \divisor, \divisor, lsl #1 | ||
80 | addcs \bits, \bits, #1 | ||
81 | adds \result, \dividend, \divisor /* first trial subtraction; carry out is the first quotient bit */ | ||
82 | subcc \result, \result, \divisor /* undo it when the shifted divisor did not fit */ | ||
83 | rsb \curbit, \bits, #31 /* number of unrolled steps to skip */ | ||
84 | add pc, pc, \curbit, lsl #3 /* computed jump: pc reads as this instruction + 8 and each step below is two instructions (8 bytes), so do not change the step size */ | ||
85 | nop /* padding so that a skip count of 0 lands on the first step */ | ||
86 | .rept 30 | ||
87 | adcs \result, \divisor, \result, lsl #1 /* shift left, add in the previous quotient bit (carry) and trial-subtract */ | ||
88 | /* Fix the remainder portion of the result. This must be done because the | ||
89 | handler for 32-bit numerators needs the remainder. */ | ||
90 | subcc \result, \result, \divisor | ||
91 | .endr | ||
92 | /* Shift remainder/quotient left one, add final quotient bit */ | ||
93 | adc \result, \result, \result | ||
94 | mov \remainder, \result, lsr \bits /* the part of the combined value above the low bits is the remainder */ | ||
95 | eor \quotient, \result, \remainder, lsl \bits /* clear the remainder part, leaving the quotient in the low bits */ | ||
96 | .endm | ||
97 | |||
98 | #ifndef FREE_IRAM | ||
99 | .set recip_max, 2 /* FREE_IRAM not defined: the reciprocal table (.rept recip_max - 2) is then empty */ | ||
100 | #else | ||
101 | /* Each table entry is one word. Since a compare is done against the maximum | ||
102 | entry as an immediate, the maximum entry must be a valid ARM immediate, | ||
103 | which means a byte shifted by an even number of places. The directives below therefore round recip_max down so that only 8 bits, starting at an even bit position, can remain set. */ | ||
104 | .set recip_max, 2 + FREE_IRAM / 4 /* one 4-byte entry per divisor, starting above divisor 2 */ | ||
105 | .set recip_max_tmp, recip_max >> 8 /* the part of recip_max that does not fit in the low byte */ | ||
106 | .set recip_mask_shift, 0 | ||
107 | .set tmp_shift, 16 | ||
108 | .rept 5 /* tmp_shift takes the values 16, 8, 4, 2, 1: binary search for the top set bit of recip_max_tmp */ | ||
109 | .if recip_max_tmp >> tmp_shift | ||
110 | .set recip_max_tmp, recip_max_tmp >> tmp_shift | ||
111 | .set recip_mask_shift, recip_mask_shift + tmp_shift | ||
112 | .endif | ||
113 | .set tmp_shift, tmp_shift >> 1 | ||
114 | .endr | ||
115 | .if recip_max_tmp /* non-zero only when recip_max >> 8 was non-zero */ | ||
116 | .set recip_mask_shift, recip_mask_shift + 1 /* now the bit length of recip_max >> 8 */ | ||
117 | .endif | ||
118 | .set recip_mask_shift, (recip_mask_shift + 1) & 62 /* round the shift up to an even value */ | ||
119 | .set recip_max, recip_max & (255 << recip_mask_shift) /* drop the bits below the 8-bit window, rounding recip_max down */ | ||
120 | //.set recip_max, 2 | ||
121 | #endif | ||
122 | |||
123 | udiv32_arm: /* Unsigned 32-bit division, ARM_ARCH < 5 variant. In: r0 = numerator, r1 = divisor. Out: r0 = quotient. r2, r3 and ip are used as scratch and r1 is overwritten on the .L_udiv path. A zero divisor ends up at .L_div0. */ | ||
124 | #ifdef DIV_RECIP | ||
125 | cmp r1, #3 | ||
126 | bcc .L_udiv_tiny /* divisor is 0, 1 or 2 */ | ||
127 | cmp r1, #recip_max | ||
128 | bhi .L_udiv /* divisor is above the last table entry */ | ||
129 | adr r3, .L_udiv_recip_table-12 /* the table starts at divisor 3 with 4 bytes per entry, hence the bias of 12 */ | ||
130 | ldr r2, [r3, r1, lsl #2] /* r2 = table entry for this divisor */ | ||
131 | mov r3, r0 /* keep the numerator for the check below */ | ||
132 | umull ip, r0, r2, r0 /* r0 = high word of numerator * entry: quotient estimate */ | ||
133 | mul r2, r0, r1 /* r2 = estimate * divisor */ | ||
134 | cmp r3, r2 | ||
135 | bxcs lr /* numerator >= estimate * divisor: return the estimate */ | ||
136 | sub r0, r0, #1 /* otherwise return estimate - 1 */ | ||
137 | bx lr | ||
138 | .L_udiv_tiny: | ||
139 | cmp r1, #1 | ||
140 | movhi r0, r0, lsr #1 /* divisor 2: halve the numerator */ | ||
141 | bxcs lr /* divisor 1 or 2: done (r0 is left untouched for divisor 1) */ | ||
142 | b .L_div0 /* divisor 0 */ | ||
143 | #endif | ||
144 | .L_udiv: | ||
145 | /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor | ||
146 | and add the next bit of the result. The correction code at .L_udiv32 | ||
147 | does not need the divisor inverted, but can be modified to work with it, | ||
148 | and this allows the zero divisor test to be done early and without an | ||
149 | explicit comparison. */ | ||
150 | rsbs r1, r1, #0 /* r1 = -divisor; Z is set when the divisor was zero */ | ||
151 | #ifndef DIV_RECIP | ||
152 | beq .L_div0 | ||
153 | #endif | ||
154 | tst r0, r0 /* N is set when the numerator has bit 31 set */ | ||
155 | /* High bit must be unset, otherwise shift numerator right, calculate, | ||
156 | and correct results. As this case is very uncommon we want to avoid | ||
157 | any other delays on the main path in handling it, so the long divide | ||
158 | calls the short divide as a function. */ | ||
159 | bmi .L_udiv32 | ||
160 | .L_udiv31: | ||
161 | ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1 | ||
162 | bx lr | ||
163 | .L_udiv32: | ||
164 | /* Store the original numerator, the negated divisor and lr just below sp; | ||
165 | they are reloaded after the call to correct the result. NOTE(review): sp is not written back, so this relies on nothing else writing below sp until the ldmdb - confirm this holds on all targets. */ | ||
166 | stmdb sp, { r0, r1, lr } | ||
167 | /* Halve the numerator so that bit 31 is clear for .L_udiv31. NOTE(review): an earlier | ||
168 | comment here mentioned calling __div0, but no such call is made at this point; a zero divisor has already branched to .L_div0 before .L_udiv32 is reached. */ | ||
169 | mov r0, r0, lsr #1 | ||
170 | bl .L_udiv31 /* r0 = quotient and r1 = remainder of (numerator >> 1) / divisor */ | ||
171 | ldmdb sp, { r2, r3, lr } /* r2 = original numerator, r3 = negated divisor */ | ||
172 | /* Move the low bit of the original numerator to the carry bit */ | ||
173 | movs r2, r2, lsr #1 | ||
174 | /* Shift the remainder left one and add in the carry bit */ | ||
175 | adc r1, r1, r1 | ||
176 | /* Subtract the original divisor from the remainder (by adding its negation in r3), setting carry if the | ||
177 | result is non-negative */ | ||
178 | adds r1, r1, r3 | ||
179 | /* Shift quotient left one and add carry bit */ | ||
180 | adc r0, r0, r0 | ||
181 | bx lr | ||
182 | .L_div0: | ||
183 | /* __div0 expects the calling address on the top of the stack. NOTE(review): no code follows the call below, so this assumes __div0 does not return - confirm. */ | ||
184 | stmdb sp!, { lr } | ||
185 | mov r0, #0 | ||
186 | #if defined(__ARM_EABI__) || !defined(USE_IRAM) | ||
187 | bl __div0 | ||
188 | #else | ||
189 | ldr pc, [pc, #-4] /* pc reads as this instruction + 8, so this loads the word that follows: an absolute jump to __div0 */ | ||
190 | .word __div0 | ||
191 | #endif | ||
192 | #ifdef DIV_RECIP | ||
193 | .L_udiv_recip_table: | ||
194 | .set div, 3 /* first divisor that has a table entry */ | ||
195 | .rept recip_max - 2 | ||
196 | .if (div - 1) & div | ||
197 | .set q, 0x40000000 / div /* not a power of two: start from floor(2^30 / div) ... */ | ||
198 | .set r, (0x40000000 - (q * div))<<1 | ||
199 | .set q, q << 1 | ||
200 | .if r >= div | ||
201 | .set q, q + 1 | ||
202 | .set r, r - div | ||
203 | .endif | ||
204 | .set r, r << 1 /* ... and apply two long-division steps, giving floor(2^32 / div) */ | ||
205 | .set q, q << 1 | ||
206 | .if r >= div | ||
207 | .set q, q + 1 | ||
208 | .set r, r - div | ||
209 | .endif | ||
210 | .set q, q + 1 /* entry = floor(2^32 / div) + 1 */ | ||
211 | .else | ||
212 | .set q, 0x40000000 / div * 4 /* power of two: entry = 2^32 / div exactly */ | ||
213 | .endif | ||
214 | .word q | ||
215 | .set div, div+1 | ||
216 | .endr | ||
217 | #endif | ||
218 | .size udiv32_arm, . - udiv32_arm | ||
219 | |||
220 | #else | ||
221 | .macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label | ||
222 | cmp \numerator, \divisor /* Macro contract: unsigned 32-bit divide; every path ends in bx lr with the result in the quotient argument, except that a zero divisor branches to the div0label argument when one is given. The bits, inv and neg arguments are scratch registers and the divisor register is overwritten. Estimate bytes are read from .L_udiv_est_table through a pc-relative immediate offset. */ | ||
223 | clz \bits, \divisor /* leading zero count of the divisor; the flags from cmp are kept */ | ||
224 | bcc 30f /* numerator < divisor: the quotient is 0 */ | ||
225 | mov \inv, \divisor, lsl \bits /* normalise: the top set bit of the divisor moves to bit 31 (0 stays 0) */ | ||
226 | add \neg, pc, \inv, lsr #25 /* pc plus the top 7 bits of the normalised divisor */ | ||
227 | cmp \inv, #1<<31 | ||
228 | ldrhib \inv, [\neg, #.L_udiv_est_table-.-64] /* only when above 1<<31: load the estimate byte at table index (normalised divisor >> 25) - 64 */ | ||
229 | bls 20f /* equal to 1<<31 (power of two) or lower (only possible for a zero divisor) */ | ||
230 | subs \bits, \bits, #7 | ||
231 | rsb \neg, \divisor, #0 /* negated divisor, used by the multiply-accumulates below */ | ||
232 | movpl \divisor, \inv, lsl \bits /* at least 7 leading zeros: scale the table estimate up */ | ||
233 | bmi 10f /* fewer than 7 leading zeros in the divisor */ | ||
234 | mul \inv, \divisor, \neg /* this and the following multiplies appear to refine the reciprocal estimate (Newton-Raphson style) - TODO confirm */ | ||
235 | smlawt \divisor, \divisor, \inv, \divisor | ||
236 | mul \inv, \divisor, \neg | ||
237 | /* This will save a cycle on ARMv6, but requires that the numerator sign | ||
238 | bit is not set (that of inv is guaranteed unset). The branch should | ||
239 | predict very well, making it typically 1 cycle, and thus both the branch | ||
240 | and test fill delay cycles for the multiplies. Based on logging of | ||
241 | numerator sizes in the APE codec, the branch is taken about 1/10^7 of | ||
242 | the time. */ | ||
243 | #if ARM_ARCH >= 6 | ||
244 | tst \numerator, \numerator | ||
245 | smmla \divisor, \divisor, \inv, \divisor | ||
246 | bmi 40f /* numerator has bit 31 set: use the unsigned multiply at 40 instead */ | ||
247 | smmul \inv, \numerator, \divisor /* quotient estimate = high word of numerator * reciprocal */ | ||
248 | #else | ||
249 | mov \bits, #0 | ||
250 | smlal \bits, \divisor, \inv, \divisor | ||
251 | umull \bits, \inv, \numerator, \divisor /* quotient estimate = high word of numerator * reciprocal */ | ||
252 | #endif | ||
253 | add \numerator, \numerator, \neg | ||
254 | mla \divisor, \inv, \neg, \numerator /* numerator - (estimate + 1) * original divisor, modulo 2^32 */ | ||
255 | mov \quotient, \inv /* written only after the last read of the numerator, because udiv32_arm passes the same register for both */ | ||
256 | cmn \divisor, \neg /* the flags select the final correction */ | ||
257 | addcc \quotient, \quotient, #1 | ||
258 | addpl \quotient, \quotient, #2 | ||
259 | bx lr | ||
260 | 10: | ||
261 | rsb \bits, \bits, #0 /* 7 minus the leading zero count, positive on this path */ | ||
262 | sub \inv, \inv, #4 | ||
263 | mov \divisor, \inv, lsr \bits /* scale the table estimate down instead of refining it */ | ||
264 | umull \bits, \inv, \numerator, \divisor /* quotient estimate = high word of the product */ | ||
265 | mla \divisor, \inv, \neg, \numerator /* numerator - estimate * original divisor */ | ||
266 | mov \quotient, \inv | ||
267 | cmn \neg, \divisor, lsr #1 /* conditionally correct the estimate by 2 ... */ | ||
268 | addcs \divisor, \divisor, \neg, lsl #1 | ||
269 | addcs \quotient, \quotient, #2 | ||
270 | cmn \neg, \divisor /* ... and then by 1 */ | ||
271 | addcs \quotient, \quotient, #1 | ||
272 | bx lr | ||
273 | 20: | ||
274 | rsb \bits, \bits, #31 /* shift count = 31 minus the leading zero count, i.e. log2 of a power-of-two divisor. Emitted unconditionally: it used to sit inside the .ifnc below, so an expansion without a div0 label shifted by the leading zero count instead. rsb leaves the flags alone. */ | ||
275 | .ifnc "", "\div0label" | ||
276 | bne \div0label /* flags are still those of the cmp against 1<<31: NE here means the normalised divisor was 0, i.e. the divisor was 0 */ | ||
277 | .endif | ||
278 | mov \quotient, \numerator, lsr \bits | ||
279 | bx lr | ||
280 | 30: | ||
281 | mov \quotient, #0 | ||
282 | bx lr | ||
283 | #if ARM_ARCH >= 6 | ||
284 | 40: | ||
285 | umull \bits, \inv, \numerator, \divisor /* same tail as the main path, with an unsigned multiply for numerators that have bit 31 set */ | ||
286 | add \numerator, \numerator, \neg | ||
287 | mla \divisor, \inv, \neg, \numerator | ||
288 | mov \quotient, \inv | ||
289 | cmn \divisor, \neg | ||
290 | addcc \quotient, \quotient, #1 | ||
291 | addpl \quotient, \quotient, #2 | ||
292 | bx lr | ||
293 | #endif | ||
294 | .endm | ||
295 | |||
296 | udiv32_arm: /* Unsigned 32-bit division, ARM_ARCH >= 5 variant. In: r0 = numerator, r1 = divisor. Out: r0 = quotient. r1, r2, r3 and ip are overwritten by the macro body. A zero divisor branches to .L_div0. */ | ||
297 | ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0 | ||
298 | .L_div0: | ||
299 | /* __div0 expects the calling address on the top of the stack. NOTE(review): no code follows the call below, so this assumes __div0 does not return - confirm. */ | ||
300 | stmdb sp!, { lr } | ||
301 | mov r0, #0 | ||
302 | #if defined(__ARM_EABI__) || !defined(USE_IRAM) | ||
303 | bl __div0 | ||
304 | #else | ||
305 | ldr pc, [pc, #-4] /* pc reads as this instruction + 8, so this loads the word that follows: an absolute jump to __div0 */ | ||
306 | .word __div0 | ||
307 | #endif | ||
308 | .L_udiv_est_table: /* 64 one-byte entries read by ARMV5_UDIV32_BODY at index (normalised divisor >> 25) - 64. It is addressed with an immediate offset from the macro body, so it has to stay close to the code. The values look like 8-bit reciprocal estimates - TODO confirm how they were generated. */ | ||
309 | .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6 | ||
310 | .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf | ||
311 | .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc | ||
312 | .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac | ||
313 | .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f | ||
314 | .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93 | ||
315 | .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89 | ||
316 | .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81 | ||
317 | #endif | ||
318 | .size udiv32_arm, . - udiv32_arm | ||