summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S')
-rw-r--r--lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S318
1 files changed, 318 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S
new file mode 100644
index 0000000000..7b851659bd
--- /dev/null
+++ b/lib/rbcodec/codecs/demac/libdemac/udiv32_arm.S
@@ -0,0 +1,318 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2008 by Jens Arnold
11 * Copyright (C) 2009 by Andrew Mahone
12 *
13 * Optimised unsigned integer division for ARMv4
14 *
15 * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
16 * Developer's Guide
17 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
18 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
19 * Free Software Foundation, Inc.
20 *
21 * This program is free software; you can redistribute it and/or
22 * modify it under the terms of the GNU General Public License
23 * as published by the Free Software Foundation; either version 2
24 * of the License, or (at your option) any later version.
25 *
26 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
27 * KIND, either express or implied.
28 *
29 ****************************************************************************/
30
31#include "config.h"
32/* On targets with codec iram, a header file will be generated after an initial
33 link of the APE codec, stating the amount of IRAM remaining for use by the
34 reciprocal lookup table. */
35#if !defined(APE_PRE) && defined(USE_IRAM) && ARM_ARCH < 5
36#include "lib/rbcodec/codecs/ape_free_iram.h"
37#endif
38
39/* Codecs should not normally do this, but we need to check a macro, and
40 * codecs.h would confuse the assembler. */
41
/* Choose the output section and declare the udiv32_arm symbol.
   When USE_IRAM is defined the code goes into .icode and DIV_RECIP is
   defined, which enables the reciprocal-table path in the ARM_ARCH < 5
   variant of udiv32_arm; otherwise the code goes into .text. */
42#ifdef USE_IRAM
43#define DIV_RECIP
44 .section .icode,"ax",%progbits
45#else
46 .text
47#endif
48 .align
/* Export the entry point and mark it as a function symbol. */
49 .global udiv32_arm
50 .type udiv32_arm,%function
51
52#if ARM_ARCH < 5
53/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
54 for dividing a 30-bit value by a 15-bit value, with two operations per
55 iteration by storing quotient and remainder together and adding the previous
56 quotient bit during trial subtraction. Modified to work with any dividend
57 and divisor both less than 1 << 30, and skipping trials by calculating bits
58 in output. */
/* ARM_DIV_31_BODY: unrolled shift-and-subtract division core.
   Parameters (all registers):
     dividend  - in: numerator.
     divisor   - in: the caller in this file passes the NEGATED divisor, so
                 the cmn/adds/adcs below act as trial subtractions. It is
                 overwritten with its left-shifted value.
     result    - scratch: ends up holding the remainder in its upper bits and
                 the quotient in its low "bits" bits.
     bits      - scratch: 1 + the total left shift applied to the divisor.
     curbit    - scratch: number of unrolled steps to skip.
     quotient  - out: may be the same register as dividend (the caller in
                 this file does so).
     remainder - out: may be the same register as divisor (likewise).
   The condition flags are clobbered. */
59.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
60
61 mov \bits, #1
62 /* Shift the divisor left until it aligns with the numerator. If it already
63 has the high bit set, this is fine, everything inside .rept will be
64 skipped, and the add before and adcs after will set the one-bit result
65 to zero. */
 /* Each cmn adds the divisor register to the shifted-down dividend and only
    sets flags. With the divisor register holding the negated divisor, carry
    set means the divisor still fits under the dividend at that shift, so it
    is moved up and the shift is added to bits. */
66 cmn \divisor, \dividend, lsr #16
67 movcs \divisor, \divisor, lsl #16
68 addcs \bits, \bits, #16
69 cmn \divisor, \dividend, lsr #8
70 movcs \divisor, \divisor, lsl #8
71 addcs \bits, \bits, #8
72 cmn \divisor, \dividend, lsr #4
73 movcs \divisor, \divisor, lsl #4
74 addcs \bits, \bits, #4
75 cmn \divisor, \dividend, lsr #2
76 movcs \divisor, \divisor, lsl #2
77 addcs \bits, \bits, #2
78 cmn \divisor, \dividend, lsr #1
79 movcs \divisor, \divisor, lsl #1
80 addcs \bits, \bits, #1
 /* First trial subtraction at the aligned position. The carry out is the
    top quotient bit; when it is clear the subtraction is undone. */
81 adds \result, \dividend, \divisor
82 subcc \result, \result, \divisor
 /* Computed jump into the unrolled steps below. Each step is two
    instructions (8 bytes) and pc reads as the address two instructions
    ahead, i.e. the first step after the nop, so this skips
    curbit = 31 - bits steps. None of the instructions from here to the
    first executed adcs changes the carry flag. */
83 rsb \curbit, \bits, #31
84 add pc, pc, \curbit, lsl #3
85 nop
86 .rept 30
 /* result = 2 * result + divisor + carry: shifts in the previous quotient
    bit while performing the next trial subtraction. */
87 adcs \result, \divisor, \result, lsl #1
88 /* Fix the remainder portion of the result. This must be done because the
89 handler for 32-bit numerators needs the remainder. */
90 subcc \result, \result, \divisor
91 .endr
92 /* Shift remainder/quotient left one, add final quotient bit */
93 adc \result, \result, \result
 /* Split the packed value: the remainder is everything above the low
    "bits" bits, the quotient is the low "bits" bits. */
94 mov \remainder, \result, lsr \bits
95 eor \quotient, \result, \remainder, lsl \bits
96.endm
97
/* recip_max is the largest divisor handled by the reciprocal table in
   udiv32_arm; the table is generated with recip_max - 2 one-word entries.
   Without FREE_IRAM it is 2, which makes the table empty. */
98#ifndef FREE_IRAM
99.set recip_max, 2
100#else
101/* Each table entry is one word. Since a compare is done against the maximum
102 entry as an immediate, the maximum entry must be a valid ARM immediate,
103 which means a byte shifted by an even number of places. */
104.set recip_max, 2 + FREE_IRAM / 4
/* Find the bit length of recip_max >> 8 by binary search: try shifts of
   16, 8, 4, 2 and 1, keeping each one that leaves a non-zero value and
   accumulating it in recip_mask_shift. */
105.set recip_max_tmp, recip_max >> 8
106.set recip_mask_shift, 0
107.set tmp_shift, 16
108.rept 5
109 .if recip_max_tmp >> tmp_shift
110 .set recip_max_tmp, recip_max_tmp >> tmp_shift
111 .set recip_mask_shift, recip_mask_shift + tmp_shift
112 .endif
113 .set tmp_shift, tmp_shift >> 1
114.endr
/* A non-zero residue accounts for one more bit. */
115.if recip_max_tmp
116 .set recip_mask_shift, recip_mask_shift + 1
117.endif
/* Round the shift up to an even number, then keep only the eight bits of
   recip_max that start at that position. */
118.set recip_mask_shift, (recip_mask_shift + 1) & 62
119.set recip_max, recip_max & (255 << recip_mask_shift)
120//.set recip_max, 2
121#endif
122
/* udiv32_arm (ARM_ARCH < 5 variant): unsigned 32-bit division.
   In:      r0 = numerator, r1 = divisor.
   Out:     r0 = quotient. The 31-bit path (.L_udiv31) additionally leaves
            the remainder in r1; the other paths do not guarantee that.
   Scratch: r2, r3, ip and the condition flags.
   A zero divisor is routed to .L_div0, which hands over to __div0. */
123udiv32_arm:
124#ifdef DIV_RECIP
 /* Divisors 0, 1 and 2 are handled separately; divisors above recip_max
    use the general routine. */
125 cmp r1, #3
126 bcc .L_udiv_tiny
127 cmp r1, #recip_max
128 bhi .L_udiv
 /* The table's first entry belongs to divisor 3, so bias the base by three
    words and index it directly with the divisor. */
129 adr r3, .L_udiv_recip_table-12
130 ldr r2, [r3, r1, lsl #2]
 /* Keep the numerator in r3, then take the high word of
    reciprocal * numerator as the quotient estimate. */
131 mov r3, r0
132 umull ip, r0, r2, r0
 /* If estimate * divisor exceeds the numerator the estimate is one too
    high; otherwise it is returned as is. */
133 mul r2, r0, r1
134 cmp r3, r2
135 bxcs lr
136 sub r0, r0, #1
137 bx lr
138.L_udiv_tiny:
 /* r1 is 0, 1 or 2 here: halve for 2 (hi), return for 1 or 2 (cs), and fall
    through to the division-by-zero handler for 0. */
139 cmp r1, #1
140 movhi r0, r0, lsr #1
141 bxcs lr
142 b .L_div0
143#endif
144.L_udiv:
145 /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
146 and add the next bit of the result. The correction code at .L_udiv32
147 does not need the divisor inverted, but can be modified to work with it,
148 and this allows the zero divisor test to be done early and without an
149 explicit comparison. */
150 rsbs r1, r1, #0
151#ifndef DIV_RECIP
 /* With DIV_RECIP a zero divisor never reaches this point (it is caught by
    .L_udiv_tiny), so the test is only needed without it. */
152 beq .L_div0
153#endif
154 tst r0, r0
155 /* High bit must be unset, otherwise shift numerator right, calculate,
156 and correct results. As this case is very uncommon we want to avoid
157 any other delays on the main path in handling it, so the long divide
158 calls the short divide as a function. */
159 bmi .L_udiv32
160.L_udiv31:
 /* r0 = numerator, r1 = negated divisor; quotient returns in r0 and
    remainder in r1, with r2, r3 and ip as scratch. */
161 ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
162 bx lr
163.L_udiv32:
164 /* Store the original numerator and (negated) divisor, we'll need them to
    correct the result. They are written just below sp without moving sp;
    the code run until the matching ldmdb does not use the stack.
    NOTE(review): this assumes nothing else writes below sp in the
    meantime - confirm for the target. */
166 stmdb sp, { r0, r1, lr }
167 /* A zero divisor has already been diverted before this label is reached
    (by .L_udiv_tiny when DIV_RECIP is defined, otherwise by the beq after
    the rsbs), so no __div0 call is needed here. Halve the numerator so
    that it fits the 31-bit routine. */
169 mov r0, r0, lsr #1
170 bl .L_udiv31
 /* r2 = original numerator, r3 = negated divisor, lr = our return address */
171 ldmdb sp, { r2, r3, lr }
172 /* Move the low bit of the original numerator to the carry bit */
173 movs r2, r2, lsr #1
174 /* Shift the remainder left one and add in the carry bit */
175 adc r1, r1, r1
176 /* Subtract the original divisor from the remainder (by adding its
    negation), setting carry if the result is non-negative */
178 adds r1, r1, r3
179 /* Shift quotient left one and add carry bit */
180 adc r0, r0, r0
181 bx lr
182.L_div0:
183 /* __div0 expects the calling address on the top of the stack */
184 stmdb sp!, { lr }
185 mov r0, #0
 /* Nothing but table data (or the end of the function) follows the transfer
    below, so __div0 is presumably not expected to return - confirm. */
186#if defined(__ARM_EABI__) || !defined(USE_IRAM)
187 bl __div0
188#else
 /* pc reads two instructions ahead, so this loads the word that follows and
    jumps to the absolute address of __div0. */
189 ldr pc, [pc, #-4]
190 .word __div0
191#endif
192#ifdef DIV_RECIP
193.L_udiv_recip_table:
 /* One word per divisor from 3 to recip_max. For a power of two the entry
    is exactly 2^32 / div. Otherwise it is floor(2^32 / div) + 1, built from
    floor(2^30 / div) by two long-division steps that each double q and r
    and carry one quotient bit across. */
194 .set div, 3
195 .rept recip_max - 2
196 .if (div - 1) & div
197 .set q, 0x40000000 / div
198 .set r, (0x40000000 - (q * div))<<1
199 .set q, q << 1
200 .if r >= div
201 .set q, q + 1
202 .set r, r - div
203 .endif
204 .set r, r << 1
205 .set q, q << 1
206 .if r >= div
207 .set q, q + 1
208 .set r, r - div
209 .endif
210 .set q, q + 1
211 .else
212 .set q, 0x40000000 / div * 4
213 .endif
214 .word q
215 .set div, div+1
216 .endr
217#endif
218 .size udiv32_arm, . - udiv32_arm
219
220#else
/* ARMV5_UDIV32_BODY: unsigned 32-bit division for ARMv5 and later, built on
   clz and a table-seeded reciprocal that is refined by multiplication. Every
   exit path ends in "bx lr", so one expansion forms a complete function body.

   Parameters (registers unless noted):
     numerator - in: dividend.
     divisor   - in: divisor; reused as scratch for the reciprocal.
     quotient  - out: result; may be the same register as numerator (the
                 caller in this file does so).
     bits      - scratch: leading-zero count of the divisor / shift amounts.
     inv       - scratch: normalised divisor, table estimate, then quotient
                 estimate.
     neg       - scratch: table address, then the negated divisor.
     div0label - optional label branched to when the divisor is zero. When it
                 is omitted no zero-divisor branch is emitted.

   Relies on the 64-byte table .L_udiv_est_table being defined close enough to
   the expansion for a byte-load immediate offset. The condition flags are
   clobbered. */
.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
    cmp     \numerator, \divisor
    clz     \bits, \divisor
    /* numerator < divisor: the quotient is zero. */
    bcc     30f
    /* Normalise the divisor so that its highest set bit lands in bit 31. */
    mov     \inv, \divisor, lsl \bits
    /* The add, cmp and ldrhib are position dependent: the load offset is
       written relative to the ldrhib itself and relies on pc in the add
       reading as that same address, so nothing may be inserted between them.
       Bits 31..25 of a normalised divisor lie in 64..127, hence the -64. */
    add     \neg, pc, \inv, lsr #25
    cmp     \inv, #1<<31
    ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64]
    /* "ls": the normalised divisor is exactly 1<<31 (a power of two) or is
       zero; both are finished at 20. */
    bls     20f
    subs    \bits, \bits, #7
    rsb     \neg, \divisor, #0
    /* Scale the 8-bit estimate according to the divisor's size. Divisors
       with fewer than 7 leading zeros take the shorter path at 10. */
    movpl   \divisor, \inv, lsl \bits
    bmi     10f
    /* Refine the reciprocal estimate by multiplying with the negated
       divisor. NOTE(review): this looks like Newton-Raphson iteration -
       confirm against the referenced algorithm. */
    mul     \inv, \divisor, \neg
    smlawt  \divisor, \divisor, \inv, \divisor
    mul     \inv, \divisor, \neg
    /* This will save a cycle on ARMv6, but requires that the numerator sign
       bit is not set (that of inv is guaranteed unset). The branch should
       predict very well, making it typically 1 cycle, and thus both the branch
       and test fill delay cycles for the multiplies. Based on logging of
       numerator sizes in the APE codec, the branch is taken about 1/10^7 of
       the time. */
#if ARM_ARCH >= 6
    tst     \numerator, \numerator
    smmla   \divisor, \divisor, \inv, \divisor
    bmi     40f
    smmul   \inv, \numerator, \divisor
#else
    mov     \bits, #0
    smlal   \bits, \divisor, \inv, \divisor
    /* inv = high word of numerator * reciprocal: the quotient estimate. */
    umull   \bits, \inv, \numerator, \divisor
#endif
    /* Form numerator - divisor - estimate * divisor and use the carry and
       sign of a further trial subtraction to adjust the estimate upward.
       numerator is consumed before quotient is written, so the two may share
       a register. */
    add     \numerator, \numerator, \neg
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \divisor, \neg
    addcc   \quotient, \quotient, #1
    addpl   \quotient, \quotient, #2
    bx      lr
10:
    /* Large divisor: shift the table estimate down instead of up, take the
       high word of numerator * estimate as the quotient estimate, then
       correct it by comparing the remainder against 2 * divisor and
       divisor. */
    rsb     \bits, \bits, #0
    sub     \inv, \inv, #4
    mov     \divisor, \inv, lsr \bits
    umull   \bits, \inv, \numerator, \divisor
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \neg, \divisor, lsr #1
    addcs   \divisor, \divisor, \neg, lsl #1
    addcs   \quotient, \quotient, #2
    cmn     \neg, \divisor
    addcs   \quotient, \quotient, #1
    bx      lr
20:
    /* Power-of-two (or zero) divisor. Turn the leading-zero count into
       log2(divisor). This has to happen whether or not a div0label was
       supplied, because the shift below depends on it. rsb leaves the flags
       of the earlier cmp untouched, so "ne" still identifies a zero
       divisor. */
    rsb     \bits, \bits, #31
.ifnc "", "\div0label"
    bne     \div0label
.endif
    mov     \quotient, \numerator, lsr \bits
    bx      lr
30:
    mov     \quotient, #0
    bx      lr
#if ARM_ARCH >= 6
40:
    /* Numerator has its top bit set: repeat the final multiply unsigned and
       apply the same correction as the main path. */
    umull   \bits, \inv, \numerator, \divisor
    add     \numerator, \numerator, \neg
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \divisor, \neg
    addcc   \quotient, \quotient, #1
    addpl   \quotient, \quotient, #2
    bx      lr
#endif
.endm
295
/* udiv32_arm (ARM_ARCH >= 5 variant): unsigned 32-bit division.
   In:      r0 = numerator, r1 = divisor.
   Out:     r0 = quotient.
   Scratch: r1, r2, r3, ip and the condition flags.
   The macro expansion returns on every path except a zero divisor, which
   branches to .L_div0 directly below. */
296udiv32_arm:
297 ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
298.L_div0:
299 /* __div0 expects the calling address on the top of the stack */
300 stmdb sp!, { lr }
301 mov r0, #0
 /* Only table data follows the transfer below, so __div0 is presumably not
    expected to return - confirm. */
302#if defined(__ARM_EABI__) || !defined(USE_IRAM)
303 bl __div0
304#else
 /* pc reads two instructions ahead, so this loads the word that follows and
    jumps to the absolute address of __div0. */
305 ldr pc, [pc, #-4]
306 .word __div0
307#endif
 /* 64 one-byte reciprocal estimates, read by ARMV5_UDIV32_BODY with bits
    31..25 of the normalised divisor, minus 64, as the index. The values look
    like floor(2^14 / (64 + index)) capped at 0xff - NOTE(review): confirm. */
308.L_udiv_est_table:
309 .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
310 .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
311 .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
312 .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
313 .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
314 .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
315 .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
316 .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
317#endif
318 .size udiv32_arm, . - udiv32_arm