summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Mahone <andrew.mahone@gmail.com>2010-01-28 02:28:52 +0000
committerAndrew Mahone <andrew.mahone@gmail.com>2010-01-28 02:28:52 +0000
commite76f30a57c25a3ae762fc48218e57bc46dff4410 (patch)
treeb3ca05f49dab3bd6eb4f35af8714653515771cb0
parente18e8069304eefca5439d9b4e573429e2f600a2c (diff)
downloadrockbox-e76f30a57c25a3ae762fc48218e57bc46dff4410.tar.gz
rockbox-e76f30a57c25a3ae762fc48218e57bc46dff4410.zip
Improvements to specialized dividers for APE codec:
* Use Newton-Raphson divider on ARMv5e and ARMv6, about 7% speedup on Gigabeat S. * On ARMv4 targets using IRAM, remove insane filter buffer from IRAM, fill available IRAM with LUT of reciprocals for small divisors - speedup varies according to target and available IRAM, APE normal sample is approx. 109% RT on e200. * Rename apps/codecs/lib/udiv32_armv4.S to apps/codecs/lib/udiv32_arm.S, which includes dividers for all ARM targets specialized for APE. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24354 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/demac/libdemac/demac_config.h4
-rw-r--r--apps/codecs/lib/SOURCES4
-rw-r--r--apps/codecs/lib/codeclib.h2
-rw-r--r--apps/codecs/lib/udiv32_arm.S319
-rw-r--r--apps/codecs/lib/udiv32_armv4.S134
5 files changed, 323 insertions, 140 deletions
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 1bbdef3d56..1beda2b9cd 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -57,11 +57,11 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
57#elif defined(CPU_S5L870X) 57#elif defined(CPU_S5L870X)
58#define ICODE_SECTION_DEMAC_ARM .icode 58#define ICODE_SECTION_DEMAC_ARM .icode
59#define ICODE_ATTR_DEMAC ICODE_ATTR 59#define ICODE_ATTR_DEMAC ICODE_ATTR
60#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR 60#define IBSS_ATTR_DEMAC_INSANEBUF
61#else 61#else
62#define ICODE_SECTION_DEMAC_ARM .text 62#define ICODE_SECTION_DEMAC_ARM .text
63#define ICODE_ATTR_DEMAC 63#define ICODE_ATTR_DEMAC
64#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR 64#define IBSS_ATTR_DEMAC_INSANEBUF
65#endif 65#endif
66 66
67#else /* !ROCKBOX */ 67#else /* !ROCKBOX */
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index 3a741a5c81..ffbe1af92e 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -7,9 +7,7 @@ mdct_lookup.c
7#ifdef CPU_ARM 7#ifdef CPU_ARM
8mdct_arm.S 8mdct_arm.S
9setjmp_arm.S 9setjmp_arm.S
10#if ARM_ARCH == 4 10udiv32_arm.S
11udiv32_armv4.S
12#endif
13#endif 11#endif
14 12
15#ifdef CPU_COLDFIRE 13#ifdef CPU_COLDFIRE
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 517264f3a5..926035f05e 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
65 65
66extern void mdct_backward(int n, int32_t *in, int32_t *out); 66extern void mdct_backward(int n, int32_t *in, int32_t *out);
67 67
68#if defined(CPU_ARM) && (ARM_ARCH == 4) 68#ifdef CPU_ARM
69/* optimised unsigned integer division for ARMv4, in IRAM */ 69/* optimised unsigned integer division for ARMv4, in IRAM */
70unsigned udiv32_arm(unsigned a, unsigned b); 70unsigned udiv32_arm(unsigned a, unsigned b);
71#define UDIV32(a, b) udiv32_arm(a, b) 71#define UDIV32(a, b) udiv32_arm(a, b)
diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S
new file mode 100644
index 0000000000..c46a09be5c
--- /dev/null
+++ b/apps/codecs/lib/udiv32_arm.S
@@ -0,0 +1,319 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2008 by Jens Arnold
11 * Copyright (C) 2009 by Andrew Mahone
12 *
13 * Optimised unsigned integer division for ARMv4, ARMv5E and ARMv6
14 *
15 * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
16 * Developer's Guide
17 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
18 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
19 * Free Software Foundation, Inc.
20 *
21 * This program is free software; you can redistribute it and/or
22 * modify it under the terms of the GNU General Public License
23 * as published by the Free Software Foundation; either version 2
24 * of the License, or (at your option) any later version.
25 *
26 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
27 * KIND, either express or implied.
28 *
29 ****************************************************************************/
30
31#include "config.h"
32/* Codecs should not normally do this, but we need to check a macro, and
33 * codecs.h would confuse the assembler. */
34
35#ifdef USE_IRAM
36#define DIV_RECIP
37 .section .icode,"ax",%progbits
38#else
39 .text
40#endif
41 .align
42 .global udiv32_arm
43 .type udiv32_arm,%function
44
45#if ARM_ARCH < 5
46/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
47 for dividing a 30-bit value by a 15-bit value, with two operations per
48 iteration by storing quotient and remainder together and adding the previous
49 quotient bit during trial subtraction. Modified to work with any dividend
50 and divisor both less than 1 << 30, and skipping trials by calculating bits
51 in output. */
/* ARM_DIV_31_BODY: straight-line shift-and-subtract divider body.
   Macro arguments (all registers):
     \dividend   numerator; read only
     \divisor    divisor, supplied already negated (the caller in this file
                 does rsbs r1, r1, #0 before expanding the macro); it is
                 shifted left in place, so its value is destroyed
     \result     working register holding remainder and quotient packed
                 together
     \bits       scratch; ends up as the shift count that separates the
                 remainder (upper part) from the quotient (low \bits bits)
                 inside \result
     \curbit     scratch; number of unrolled steps to skip
     \quotient   output, quotient
     \remainder  output, remainder
   Flags are clobbered. The macro falls through at the end; the caller
   supplies the return instruction. */
52.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
53
54 mov \bits, #1
55 /* Shift the divisor left until it aligns with the numerator. If it already
56 has the high bit set, this is fine, everything inside .rept will be
57 skipped, and the add before and adcs after will set the one-bit result
58 to zero. */
59 cmn \divisor, \dividend, lsr #16
60 movcs \divisor, \divisor, lsl #16
61 addcs \bits, \bits, #16
62 cmn \divisor, \dividend, lsr #8
63 movcs \divisor, \divisor, lsl #8
64 addcs \bits, \bits, #8
65 cmn \divisor, \dividend, lsr #4
66 movcs \divisor, \divisor, lsl #4
67 addcs \bits, \bits, #4
68 cmn \divisor, \dividend, lsr #2
69 movcs \divisor, \divisor, lsl #2
70 addcs \bits, \bits, #2
71 cmn \divisor, \dividend, lsr #1
72 movcs \divisor, \divisor, lsl #1
73 addcs \bits, \bits, #1
/* First trial subtraction (the divisor is negative, so this is an add);
   undo it when it did not fit. The carry left by adds is the first
   quotient bit consumed by the adcs/adc that follows. */
74 adds \result, \dividend, \divisor
75 subcc \result, \result, \divisor
76 rsb \curbit, \bits, #31
/* Computed jump into the unrolled loop: skip the first \curbit steps.
   Each step below is exactly two instructions (8 bytes), hence lsl #3.
   pc reads as the address of this instruction + 8, which is the first
   unrolled step, so the nop only fills the slot in between. Do not add
   or remove instructions between here and .endr. */
77 add pc, pc, \curbit, lsl #3
78 nop
79 .rept 30
80 adcs \result, \divisor, \result, lsl #1
81 /* Fix the remainder portion of the result. This must be done because the
82 handler for 32-bit numerators needs the remainder. */
83 subcc \result, \result, \divisor
84 .endr
85 /* Shift remainder/quotient left one, add final quotient bit */
86 adc \result, \result, \result
/* Unpack: remainder = \result >> \bits, quotient = the low \bits bits. */
87 mov \remainder, \result, lsr \bits
88 eor \quotient, \result, \remainder, lsl \bits
89.endm
90
/* recip_max is the largest divisor answered from the reciprocal lookup table
   (.L_udiv_recip_table); the table holds one 32-bit word for every divisor
   from 3 up to recip_max. The values differ per CPU, presumably chosen to
   fit the IRAM left over on each target -- confirm against the linker
   scripts before changing them. Each value must be encodable as an ARM
   immediate because it is used directly in "cmp r1, #recip_max". */
#ifdef CPU_PP
#if CONFIG_CPU == PP5020
.set recip_max, 5952
#elif CONFIG_CPU == PP5002
.set recip_max, 1472
#else
.set recip_max, 14208
#endif
#elif CONFIG_CPU == AS3525
.set recip_max, 42752
#elif CONFIG_CPU == S5L8701
.set recip_max, 9600
#elif CONFIG_CPU == S5L8700
.set recip_max, 5504
#else
/* No table size is known for this CPU. Disable the reciprocal table so the
   file still assembles and udiv32_arm uses the plain shift-and-subtract
   divider, instead of failing on an undefined recip_max. */
#undef DIV_RECIP
#endif
106
/* unsigned udiv32_arm(unsigned a, unsigned b)   -- ARMv4 version
 *
 * In:       r0 = numerator, r1 = divisor
 * Out:      r0 = quotient
 * Clobbers: r1-r3, ip, flags; the 32-bit numerator path also uses 12 bytes
 *           of stack.
 * A zero divisor is routed to .L_div0, which calls __div0.
 */
udiv32_arm:
#ifdef DIV_RECIP
    /* Divisors 0, 1 and 2 have no table entry and are handled separately. */
    cmp     r1, #3
    bcc     .L_udiv_tiny
    /* Divisors above recip_max are not in the table either. */
    cmp     r1, #recip_max
    bhi     .L_udiv
    /* The table starts at divisor 3, so bias the base by 3 words. */
    adr     r3, .L_udiv_recip_table-12
    ldr     r2, [r3, r1, lsl #2]        /* r2 = reciprocal of divisor      */
    mov     r3, r0                      /* keep the numerator for the check */
    umull   ip, r0, r2, r0              /* r0 = high word of recip * num   */
    /* The estimate may be one too large: if estimate * divisor exceeds the
       numerator, step it down by one. */
    mul     r2, r0, r1
    cmp     r3, r2
    bxcs    lr
    sub     r0, r0, #1
    bx      lr
.L_udiv_tiny:
    /* r1 is 0, 1 or 2 here. */
    cmp     r1, #1
    movhi   r0, r0, lsr #1              /* divisor 2: shift                 */
    bxcs    lr                          /* divisor 1 or 2: done             */
    b       .L_div0                     /* divisor 0                        */
#endif
.L_udiv:
    /* Negate the divisor. ARM_DIV_31_BODY uses adc to both subtract the
       divisor and add the next bit of the result. The correction code at
       .L_udiv32 does not need the divisor negated, but works with it, and
       this allows the zero divisor test to be done early and without an
       explicit comparison. */
    rsbs    r1, r1, #0
#ifndef DIV_RECIP
    /* With DIV_RECIP a zero divisor was already caught at .L_udiv_tiny. */
    beq     .L_div0
#endif
    tst     r0, r0
    /* High bit must be unset, otherwise shift numerator right, calculate,
       and correct results. As this case is very uncommon we want to avoid
       any other delays on the main path in handling it, so the long divide
       calls the short divide as a function. */
    bmi     .L_udiv32
.L_udiv31:
    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
    bx      lr
.L_udiv32:
    /* Save the original numerator, the negated divisor and the return
       address; they are needed to correct the result. sp is moved with
       writeback so the saved words are never below the stack pointer.
       A zero divisor cannot reach this point, it has already been diverted
       to .L_div0 above. */
    stmdb   sp!, { r0, r1, lr }
    mov     r0, r0, lsr #1
    bl      .L_udiv31
    ldmia   sp!, { r2, r3, lr }
    /* Move the low bit of the original numerator to the carry bit */
    movs    r2, r2, lsr #1
    /* Shift the remainder left one and add in the carry bit */
    adc     r1, r1, r1
    /* Add the negated divisor to the remainder, i.e. subtract the original
       divisor, setting carry if the result is non-negative */
    adds    r1, r1, r3
    /* Shift quotient left one and add carry bit */
    adc     r0, r0, r0
    bx      lr
.L_div0:
    /* __div0 expects the calling address on the top of the stack */
    stmdb   sp!, { lr }
    mov     r0, #0
    /* Nothing follows the call but data or the end of the function, so
       __div0 is presumably not expected to return -- confirm. */
#if defined(__ARM_EABI__) || !defined(USE_IRAM)
    bl      __div0
#else
    /* pc reads as this instruction + 8, so [pc, #-4] is the literal word
       that follows: an absolute jump to __div0. */
    ldr     pc, [pc, #-4]
    .word   __div0
#endif
#ifdef DIV_RECIP
/* Reciprocal table, one word per divisor from 3 to recip_max.
   Powers of two get exactly 2^32 / div; every other divisor gets
   floor(2^32 / div) + 1. The non-power-of-two quotient is built from
   2^30 / div followed by two manual long-division steps, presumably to keep
   the assembler's expression arithmetic within 32 bits -- confirm. */
.L_udiv_recip_table:
    .set div, 3
    .rept recip_max - 2
    .if (div - 1) & div
        /* not a power of two */
        .set q, 0x40000000 / div
        .set r, (0x40000000 - (q * div))<<1
        .set q, q << 1
        .if r >= div
            .set q, q + 1
            .set r, r - div
        .endif
        .set r, r << 1
        .set q, q << 1
        .if r >= div
            .set q, q + 1
            .set r, r - div
        .endif
        .set q, q + 1
    .else
        /* power of two */
        .set q, 0x40000000 / div * 4
    .endif
    .word q
    .set div, div+1
    .endr
#endif
    .size udiv32_arm, . - udiv32_arm
203
204#else
/* ARMV5_UDIV32_BODY: complete body of an unsigned 32-bit divide for
   ARMv5E/ARMv6 (uses clz and the DSP multiplies). Every path ends in
   "bx lr", so nothing placed after the expansion is reached by falling
   through.
   Macro arguments (all registers except the last):
     \numerator  dividend; modified on some paths
     \divisor    divisor; overwritten with intermediate values
     \quotient   output register for the quotient (may alias \numerator,
                 as the caller in this file does)
     \bits       scratch; leading-zero count / shift amount
     \inv        scratch; reciprocal estimate and intermediate products
     \neg        scratch; table address first, then the negated divisor
     \div0label  optional label branched to when the divisor is zero; when
                 left empty no zero check is emitted
   Requires the 64-entry byte table .L_udiv_est_table to be defined in the
   same section. */
.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
    cmp     \numerator, \divisor
    clz     \bits, \divisor
    bcc     30f                         /* numerator < divisor: result 0    */
    mov     \inv, \divisor, lsl \bits   /* normalise: top set bit to bit 31 */
    /* The next three instructions form a unit. \neg becomes pc (address of
       the add + 8, which is the address of the ldrhib) plus the top seven
       bits of the normalised divisor (64..127), and the load offset
       ".L_udiv_est_table-.-64" cancels both the ldrhib address and the 64.
       Do not insert or remove instructions between the add and the ldrhib. */
    add     \neg, pc, \inv, lsr #25
    /* Normalised value equal to 1<<31 means the divisor is a power of two;
       zero means the divisor is zero. Both go to 20. */
    cmp     \inv, #1<<31
    ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64]
    bls     20f
    subs    \bits, \bits, #7
    rsb     \neg, \divisor, #0          /* \neg = -divisor                  */
    movpl   \divisor, \inv, lsl \bits
    bmi     10f                         /* fewer than 7 leading zeros       */
    /* Refine the table estimate; \divisor now holds the reciprocal
       estimate rather than the divisor itself. */
    mul     \inv, \divisor, \neg
    smlawt  \divisor, \divisor, \inv, \divisor
    mul     \inv, \divisor, \neg
    /* The ARMv6 sequence saves a cycle but does not produce a correct
       result if the numerator's sign bit is set. That case accounts for
       about 1 in 10^7 of the divisions done by the APE decoder, so we
       specialize for the common case and handle the large numerator
       separately at 40. */
#if ARM_ARCH >= 6
    tst     \numerator, \numerator
    smmla   \divisor, \divisor, \inv, \divisor
    bmi     40f
    smmul   \inv, \numerator, \divisor
#else
    mov     \bits, #0
    smlal   \bits, \divisor, \divisor, \inv
    umull   \bits, \inv, \numerator, \divisor
#endif
    /* \inv is the quotient estimate; form a biased remainder and correct
       the estimate by up to +3. */
    add     \numerator, \numerator, \neg
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \divisor, \neg
    addcc   \quotient, \quotient, #1
    addpl   \quotient, \quotient, #2
    bx      lr
10:
    /* Divisor has fewer than 7 leading zeros: shift the estimate right
       instead of left. */
    rsb     \bits, \bits, #0
    sub     \inv, \inv, #4
    mov     \divisor, \inv, lsr \bits
#if ARM_ARCH >= 6
    tst     \numerator, \numerator
    smmla   \divisor, \divisor, \inv, \divisor
    bmi     50f
    smmul   \inv, \numerator, \divisor
#else
    mov     \bits, #0
    smlal   \bits, \divisor, \divisor, \inv
    umull   \bits, \inv, \numerator, \divisor
#endif
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \neg, \divisor, lsr #1
    addcs   \divisor, \divisor, \neg, lsl #1
    addcs   \quotient, \quotient, #2
    cmn     \neg, \divisor
    addcs   \quotient, \quotient, #1
    bx      lr
20:
    /* Divisor is a power of two, or zero. Convert the leading-zero count
       into log2(divisor). This must happen whether or not a div0 label was
       supplied, otherwise the shift below would use the raw clz value. rsb
       leaves the flags alone, so the bne still tests the result of
       "cmp \inv, #1<<31": not-equal here means the divisor was zero. */
    rsb     \bits, \bits, #31
.ifnc "", "\div0label"
    bne     \div0label
.endif
    mov     \quotient, \numerator, lsr \bits
    bx      lr
30:
    mov     \quotient, #0
    bx      lr
#if ARM_ARCH >= 6
40:
    /* Numerator has its top bit set: same as the main path, but with an
       unsigned multiply for the quotient estimate. */
    umull   \bits, \inv, \numerator, \divisor
    add     \numerator, \numerator, \neg
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \divisor, \neg
    addcc   \quotient, \quotient, #1
    addpl   \quotient, \quotient, #2
    bx      lr
50:
    /* Numerator has its top bit set: unsigned variant of the path at 10. */
    umull   \bits, \inv, \numerator, \divisor
    mla     \divisor, \inv, \neg, \numerator
    mov     \quotient, \inv
    cmn     \neg, \divisor, lsr #1
    addcs   \divisor, \divisor, \neg, lsl #1
    addcs   \quotient, \quotient, #2
    cmn     \neg, \divisor
    addcs   \quotient, \quotient, #1
    bx      lr
#endif
.endm
296
/* unsigned udiv32_arm(unsigned a, unsigned b)   -- ARMv5E / ARMv6 version
   In:  r0 = numerator, r1 = divisor.  Out: r0 = quotient.
   r2, r3 and ip are handed to the macro as scratch registers. All normal
   paths return from inside the macro expansion; only a zero divisor
   reaches .L_div0 below. */
297udiv32_arm:
298 ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
299.L_div0:
300 /* __div0 expects the calling address on the top of the stack */
301 stmdb sp!, { lr }
302 mov r0, #0
/* Only table data follows the call, so __div0 is presumably not expected
   to return -- confirm. */
303#if defined(__ARM_EABI__) || !defined(USE_IRAM)
304 bl __div0
305#else
/* pc reads as this instruction + 8, so [pc, #-4] is the literal word that
   follows: an absolute jump to __div0. */
306 ldr pc, [pc, #-4]
307 .word __div0
308#endif
/* Initial reciprocal estimates, 64 one-byte entries. The macro indexes this
   table with the top seven bits of the normalised divisor minus 64. The
   entries look like 2^14 / (64 + index) saturated to 0xff -- confirm before
   regenerating. */
309.L_udiv_est_table:
310 .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
311 .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
312 .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
313 .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
314 .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
315 .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
316 .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
317 .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
318#endif
319 .size udiv32_arm, . - udiv32_arm
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
deleted file mode 100644
index c4aea14093..0000000000
--- a/apps/codecs/lib/udiv32_armv4.S
+++ /dev/null
@@ -1,134 +0,0 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2008 by Jens Arnold
11 * Copyright (C) 2009 by Andrew Mahone
12 *
13 * Optimised unsigned integer division for ARMv4
14 *
15 * Based on: libgcc routines for ARM cpu.
16 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
17 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
18 * Free Software Foundation, Inc.
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version 2
23 * of the License, or (at your option) any later version.
24 *
25 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
26 * KIND, either express or implied.
27 *
28 ****************************************************************************/
29
30#include "config.h"
31/* Codecs should not normally do this, but we need to check a macro, and
32 * codecs.h would confuse the assembler. */
33
34/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
35 for dividing a 30-bit value by a 15-bit value, with two operations per
36 iteration by storing quotient and remainder together and adding the previous
37 quotient bit during trial subtraction. Modified to work with any dividend
38 and divisor both less than 1 << 30, and skipping trials by calculating bits
39 in output. */
40.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
41
42 mov \bits, #1
43 /* Shift the divisor left until it aligns with the numerator. If it already
44 has the high bit set, this is fine, everything inside .rept will be
45 skipped, and the add before and adcs after will set the one-bit result
46 to zero. */
47 cmn \divisor, \dividend, lsr #16
48 movcs \divisor, \divisor, lsl #16
49 addcs \bits, \bits, #16
50 cmn \divisor, \dividend, lsr #8
51 movcs \divisor, \divisor, lsl #8
52 addcs \bits, \bits, #8
53 cmn \divisor, \dividend, lsr #4
54 movcs \divisor, \divisor, lsl #4
55 addcs \bits, \bits, #4
56 cmn \divisor, \dividend, lsr #2
57 movcs \divisor, \divisor, lsl #2
58 addcs \bits, \bits, #2
59 cmn \divisor, \dividend, lsr #1
60 movcs \divisor, \divisor, lsl #1
61 addcs \bits, \bits, #1
62 adds \result, \dividend, \divisor
63 subcc \result, \result, \divisor
64 rsb \curbit, \bits, #31
65 add pc, pc, \curbit, lsl #3
66 nop
67 .rept 30
68 adcs \result, \divisor, \result, lsl #1
69 /* Fix the remainder portion of the result. This must be done because the
70 handler for 32-bit numerators needs the remainder. */
71 subcc \result, \result, \divisor
72 .endr
73 /* Shift remainder/quotient left one, add final quotient bit */
74 adc \result, \result, \result
75 mov \remainder, \result, lsr \bits
76 eor \quotient, \result, \remainder, lsl \bits
77.endm
78
79#ifdef USE_IRAM
80 .section .icode,"ax",%progbits
81#else
82 .text
83#endif
84 .align
85 .global udiv32_arm
86 .type udiv32_arm,%function
87
88udiv32_arm:
89 /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
90 and add the next bit of the result. The correction code at .L_udiv32
91 does not need the divisor inverted, but can be modified to work with it,
92 and this allows the zero divisor test to be done early and without an
93 explicit comparison. */
94 rsbs r1, r1, #0
95 beq .L_div0
96 tst r0, r0
97 /* High bit must be unset, otherwise shift numerator right, calculate,
98 and correct results. As this case is very uncommon we want to avoid
99 any other delays on the main path in handling it, so the long divide
100 calls the short divide as a function. */
101 bmi .L_udiv32
102.L_udiv31:
103 ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
104 bx lr
105
106.L_udiv32:
107 /* store original numerator and divisor, we'll need them to correct the
108 result, */
109 stmdb sp, { r0, r1, lr }
110 /* Call __div0 here if divisor is zero, otherwise it would report the wrong
111 address. */
112 mov r0, r0, lsr #1
113 bl .L_udiv31
114 ldmdb sp, { r2, r3, lr }
115 /* Move the low bit of the original numerator to the carry bit */
116 movs r2, r2, lsr #1
117 /* Shift the remainder left one and add in the carry bit */
118 adc r1, r1, r1
119 /* Subtract the original divisor from the remainder, setting carry if the
120 result is non-negative */
121 adds r1, r1, r3
122 /* Shift quotient left one and add carry bit */
123 adc r0, r0, r0
124 bx lr
125.L_div0:
126 /* __div0 expects the calling address on the top of the stack */
127 stmdb sp!, { lr }
128#if defined(__ARM_EABI__) || !defined(USE_IRAM)
129 bl __div0
130#else
131 mov lr, pc
132 bx r3
133#endif
134 .size udiv32_arm, . - udiv32_arm