Improvements to specialized dividers for APE codec:

* Use a Newton-Raphson divider on ARMv5E and ARMv6; about 7% speedup on Gigabeat S.
* On ARMv4 targets using IRAM, remove the "insane" filter buffer from IRAM and fill the available IRAM with a LUT of reciprocals for small divisors. Speedup varies according to target and available IRAM; the APE "normal" sample decodes at approx. 109% of real time on the e200.
* Rename apps/codecs/lib/udiv32_armv4.S to apps/codecs/lib/udiv32_arm.S, which now includes dividers, specialized for APE, for all ARM targets.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24354 a1c6a512-1295-4272-9138-f99709370657
author: Andrew Mahone <andrew.mahone@gmail.com> 2010-01-28 02:28:52 +0000
committer: Andrew Mahone <andrew.mahone@gmail.com> 2010-01-28 02:28:52 +0000
commit: e76f30a57c25a3ae762fc48218e57bc46dff4410 (patch)
tree: b3ca05f49dab3bd6eb4f35af8714653515771cb0
parent: e18e8069304eefca5439d9b4e573429e2f600a2c (diff)
download: rockbox-e76f30a57c25a3ae762fc48218e57bc46dff4410.tar.gz
rockbox-e76f30a57c25a3ae762fc48218e57bc46dff4410.zip
5 files changed, 323 insertions, 140 deletions
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 1bbdef3d56..1beda2b9cd 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -57,11 +57,11 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #elif defined(CPU_S5L870X)
 #define ICODE_SECTION_DEMAC_ARM   .icode
 #define ICODE_ATTR_DEMAC          ICODE_ATTR
-#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF
 #else
 #define ICODE_SECTION_DEMAC_ARM   .text
 #define ICODE_ATTR_DEMAC
-#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR
+#define IBSS_ATTR_DEMAC_INSANEBUF
 #endif
 #else /* !ROCKBOX */
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index 3a741a5c81..ffbe1af92e 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -7,9 +7,7 @@ mdct_lookup.c
 #ifdef CPU_ARM
 mdct_arm.S
 setjmp_arm.S
-#if ARM_ARCH == 4
+udiv32_arm.S
-udiv32_armv4.S
-#endif
 #endif
 #ifdef CPU_COLDFIRE
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 517264f3a5..926035f05e 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
 extern void mdct_backward(int n, int32_t *in, int32_t *out);
-#if defined(CPU_ARM) && (ARM_ARCH == 4)
+#ifdef CPU_ARM
 /* optimised unsigned integer division for ARMv4, in IRAM */
 unsigned udiv32_arm(unsigned a, unsigned b);
 #define UDIV32(a, b) udiv32_arm(a, b)
diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S
new file mode 100644
index 0000000000..c46a09be5c
--- /dev/null
+++ b/apps/codecs/lib/udiv32_arm.S
@@ -0,0 +1,319 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ * Copyright (C) 2009 by Andrew Mahone
+ *
+ * Optimised unsigned integer division for ARM (ARMv4, ARMv5E and ARMv6)
+ *
+ * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
+ *           Developer's Guide
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+/* Codecs should not normally include config.h directly, but we need to check
+ * configuration macros here, and codecs.h would confuse the assembler. */
+#ifdef USE_IRAM
+#define DIV_RECIP
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .align
+    .global udiv32_arm
+    .type   udiv32_arm,%function
+#if ARM_ARCH < 5
+/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
+   for dividing a 30-bit value by a 15-bit value, with two operations per
+   iteration by storing quotient and remainder together and adding the previous
+   quotient bit during trial subtraction. Modified to work with any dividend
+   and divisor both less than 1 << 30, and skipping trials by calculating bits
+   in output. */
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
+    mov     \bits, #1                       /* quotient bit count; grows with every alignment shift below */
+    /* Shift the divisor left until it aligns with the numerator. If it already
+       has the high bit set, this is fine, everything inside .rept will be
+       skipped, and the adds before and adc after will set the one-bit result
+       to zero. */
+    cmn     \divisor, \dividend, lsr #16    /* the caller passes the divisor negated, so carry set means (dividend >> 16) >= original divisor */
+    movcs   \divisor, \divisor, lsl #16
+    addcs   \bits, \bits, #16
+    cmn     \divisor, \dividend, lsr #8     /* same test for a further shift of 8 ... */
+    movcs   \divisor, \divisor, lsl #8
+    addcs   \bits, \bits, #8
+    cmn     \divisor, \dividend, lsr #4     /* ... of 4 ... */
+    movcs   \divisor, \divisor, lsl #4
+    addcs   \bits, \bits, #4
+    cmn     \divisor, \dividend, lsr #2     /* ... of 2 ... */
+    movcs   \divisor, \divisor, lsl #2
+    addcs   \bits, \bits, #2
+    cmn     \divisor, \dividend, lsr #1     /* ... and of 1 */
+    movcs   \divisor, \divisor, lsl #1
+    addcs   \bits, \bits, #1
+    adds    \result, \dividend, \divisor    /* first trial subtraction of the aligned divisor */
+    subcc   \result, \result, \divisor      /* carry clear: the trial failed, undo it */
+    rsb     \curbit, \bits, #31             /* number of unrolled steps to skip (bits - 1 of the 30 steps run) */
+    add     pc, pc, \curbit, lsl #3         /* computed jump: each unrolled step is 2 instructions = 8 bytes */
+    nop                                     /* padding: pc reads 8 bytes ahead of the add, so offset 0 lands just past this nop */
+    .rept   30
+    adcs    \result, \divisor, \result, lsl #1 /* shift in the previous quotient bit and do the next trial subtraction */
+    /* Fix the remainder portion of the result. This must be done because the
+       handler for 32-bit numerators needs the remainder. */
+    subcc   \result, \result, \divisor
+    .endr
+    /* Shift remainder/quotient left one, add final quotient bit */
+    adc     \result, \result, \result
+    mov     \remainder, \result, lsr \bits  /* the remainder sits above the low (bits) quotient bits */
+    eor     \quotient, \result, \remainder, lsl \bits /* clear the remainder bits, leaving only the quotient */
+.endm
+#ifdef CPU_PP
+#if CONFIG_CPU == PP5020
+.set recip_max, 5952                        /* recip_max = largest divisor that has an entry in the reciprocal table */
+#elif CONFIG_CPU == PP5002
+.set recip_max, 1472
+#else
+.set recip_max, 14208                       /* any other CPU_PP model */
+#endif
+#elif CONFIG_CPU == AS3525
+.set recip_max, 42752
+#elif CONFIG_CPU == S5L8701
+.set recip_max, 9600
+#elif CONFIG_CPU == S5L8700
+.set recip_max, 5504
+#endif /* NOTE(review): recip_max stays unset for any other CPU; presumably every ARMv4 target that defines USE_IRAM is listed above - confirm */
+udiv32_arm:                                 /* unsigned udiv32_arm(unsigned a, unsigned b): a in r0, b in r1, quotient returned in r0 */
+#ifdef DIV_RECIP
+    cmp     r1, #3
+    bcc     .L_udiv_tiny                    /* divisors 0, 1 and 2 have no table entry */
+    cmp     r1, #recip_max
+    bhi     .L_udiv                         /* divisor beyond the table: use the shift-and-subtract divider */
+    adr     r3, .L_udiv_recip_table-12      /* the table starts at divisor 3, so bias the base by 3 words */
+    ldr     r2, [r3, r1, lsl #2]            /* r2 = reciprocal table entry for this divisor */
+    mov     r3, r0                          /* keep the numerator for the check below */
+    umull   ip, r0, r2, r0                  /* r0 = high word of numerator * reciprocal = quotient estimate */
+    mul     r2, r0, r1                      /* r2 = estimate * divisor */
+    cmp     r3, r2
+    bxcs    lr                              /* numerator >= estimate * divisor: the estimate is returned as is */
+    sub     r0, r0, #1                      /* otherwise the estimate is one too high */
+    bx      lr
+.L_udiv_tiny:
+    cmp     r1, #1
+    movhi   r0, r0, lsr #1                  /* divisor 2: shift right by one */
+    bxcs    lr                              /* divisor 1 or 2: done (r0 is untouched for divisor 1) */
+    b       .L_div0                         /* divisor 0 */
+#endif
+.L_udiv:
+    /* Negate divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
+       and add the next bit of the result. The correction code at .L_udiv32
+       does not need the divisor negated, but can be modified to work with it,
+       and this allows the zero divisor test to be done early and without an
+       explicit comparison. */
+    rsbs    r1, r1, #0
+#ifndef DIV_RECIP
+    beq .L_div0
+#endif
+    tst     r0, r0
+    /* High bit must be unset, otherwise shift numerator right, calculate,
+       and correct results. As this case is very uncommon we want to avoid
+       any other delays on the main path in handling it, so the long divide
+       calls the short divide as a function. */
+    bmi     .L_udiv32
+.L_udiv31:
+    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
+    bx      lr
+.L_udiv32:
+    /* Store the original numerator, the (negated) divisor and lr; we'll need
+       them to correct the result. */
+    stmdb   sp, { r0, r1, lr }              /* no writeback: saved just below sp, sp itself is unchanged */
+    /* Halve the numerator so that its high bit is clear, then run the short
+       divide on it. The divisor is already known to be non-zero here. */
+    mov     r0, r0, lsr #1
+    bl      .L_udiv31
+    ldmdb   sp, { r2, r3, lr }              /* r2 = original numerator, r3 = negated divisor, lr = caller's return address */
+    /* Move the low bit of the original numerator to the carry bit */
+    movs    r2, r2, lsr #1
+    /* Shift the remainder left one and add in the carry bit */
+    adc     r1, r1, r1
+    /* Subtract the original divisor from the remainder, setting carry if the
+       result is non-negative */
+    adds    r1, r1, r3
+    /* Shift quotient left one and add carry bit */
+    adc     r0, r0, r0
+    bx      lr
+.L_div0:
+    /* __div0 expects the calling address on the top of the stack */
+    stmdb sp!, { lr }
+    mov     r0, #0
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+    bl      __div0
+#else
+    ldr     pc, [pc, #-4]                   /* pc reads as this instruction + 8, so this loads the word below and jumps to __div0 */
+    .word   __div0
+#endif
+#ifdef DIV_RECIP
+.L_udiv_recip_table:                        /* one word per divisor d = 3..recip_max, holding 2^32 / d rounded up */
+    .set div, 3
+    .rept recip_max - 2
+        .if (div - 1) & div                 /* div is not a power of two */
+            .set q, 0x40000000 / div        /* start from 2^30 / div; presumably to stay inside 32-bit assembler arithmetic - confirm */
+            .set r, (0x40000000 - (q * div))<<1
+            .set q, q << 1
+            .if r >= div
+                .set q, q + 1
+                .set r, r - div
+            .endif                          /* q is now 2^31 / div rounded down, r the matching remainder */
+            .set r, r << 1
+            .set q, q << 1
+            .if r >= div
+                .set q, q + 1
+                .set r, r - div
+            .endif                          /* q is now 2^32 / div rounded down */
+            .set q, q + 1                   /* round up */
+        .else
+            .set q, 0x40000000 / div * 4    /* power of two: 2^32 / div is exact */
+        .endif
+        .word q
+        .set div, div+1
+    .endr
+#endif
+    .size udiv32_arm, . - udiv32_arm
+#else
+.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
+    cmp     \numerator, \divisor            /* flags are consumed by the bcc two lines down; clz leaves them alone */
+    clz     \bits, \divisor                 /* bits = leading zero count of the divisor (32 when it is 0) */
+    bcc     30f                             /* numerator < divisor: the quotient is 0 */
+    mov     \inv, \divisor, lsl \bits       /* normalise: the top set bit of the divisor moves to bit 31 (0 stays 0) */
+    add     \neg, pc, \inv, lsr #25         /* temporary pointer: pc + top 7 bits of the normalised divisor */
+    cmp     \inv, #1<<31                    /* hi: not a power of two, eq: power of two, lo: divisor is 0 */
+    ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64] /* initial reciprocal estimate; the offset cancels the pc base and the index bias of 64 */
+    bls     20f                             /* power-of-two or zero divisor */
+    subs    \bits, \bits, #7                /* bits = clz - 7; negative when the divisor has fewer than 7 leading zeros */
+    rsb     \neg, \divisor, #0              /* neg = -divisor */
+    movpl   \divisor, \inv, lsl \bits       /* scale the table estimate up by (clz - 7) bits */
+    bmi     10f                             /* large divisor: the estimate must be shifted right instead */
+    mul     \inv, \divisor, \neg            /* Newton-Raphson refinement of the reciprocal: these three instructions ... */
+    smlawt  \divisor, \divisor, \inv, \divisor
+    mul     \inv, \divisor, \neg            /* ... plus the smmla / smlal step in the conditional code below */
+    /* This will save a cycle on ARMv6, but does not produce a correct result
+       if the numerator sign bit is set. This case accounts for about 1 in 10^7
+       of the divisions done by the APE decoder, so we specialize for the more
+       common case and handle the uncommon large-numerator case separately. */
+#if ARM_ARCH >= 6
+    tst     \numerator, \numerator          /* N = numerator sign bit, tested by the bmi below */
+    smmla   \divisor, \divisor, \inv, \divisor
+    bmi     40f                             /* sign bit set: take the unsigned tail at 40 */
+    smmul   \inv, \numerator, \divisor      /* quotient estimate = high word of numerator * reciprocal (signed multiply) */
+#else
+    mov     \bits, #0
+    smlal   \bits, \divisor, \divisor, \inv
+    umull   \bits, \inv, \numerator, \divisor /* quotient estimate = high word of numerator * reciprocal (unsigned multiply) */
+#endif
+    add     \numerator, \numerator, \neg    /* numerator - divisor */
+    mla     \divisor, \inv, \neg, \numerator /* residual = numerator - divisor - estimate * divisor */
+    mov     \quotient, \inv
+    cmn     \divisor, \neg                  /* correct the estimate according to the residual */
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+    bx      lr
+10:                                         /* divisor with fewer than 7 leading zeros */
+    rsb     \bits, \bits, #0                /* bits = 7 - clz, the right shift needed for the estimate */
+    sub     \inv, \inv, #4                  /* NOTE(review): bias on the table estimate; presumably guards against overshoot - confirm */
+    mov     \divisor, \inv, lsr \bits
+#if ARM_ARCH >= 6
+    tst     \numerator, \numerator          /* N = numerator sign bit, tested by the bmi below */
+    smmla   \divisor, \divisor, \inv, \divisor
+    bmi     50f                             /* sign bit set: take the unsigned tail at 50 */
+    smmul   \inv, \numerator, \divisor
+#else
+    mov     \bits, #0
+    smlal   \bits, \divisor, \divisor, \inv
+    umull   \bits, \inv, \numerator, \divisor
+#endif
+    mla     \divisor, \inv, \neg, \numerator /* residual = numerator - estimate * divisor */
+    mov     \quotient, \inv
+    cmn     \neg, \divisor, lsr #1          /* residual >= 2 * divisor? */
+    addcs   \divisor, \divisor, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \divisor                  /* residual >= divisor? */
+    addcs   \quotient, \quotient, #1
+    bx      lr
+20:                                         /* power-of-two or zero divisor; the shift count is now computed even when no div0label is given */
+    rsb     \bits, \bits, #31               /* bits = 31 - clz = log2(divisor); rsb without s keeps the flags of the cmp above */
+.ifnc "", "\div0label"
+    bne     \div0label                      /* ne: the normalised divisor was not 1<<31, which on this path means the divisor is 0 */
+.endif
+    mov     \quotient, \numerator, lsr \bits
+    bx      lr
+30:                                         /* numerator < divisor */
+    mov     \quotient, #0
+    bx      lr
+#if ARM_ARCH >= 6
+40:                                         /* tail of the main path for numerators with the sign bit set: umull replaces smmul */
+    umull   \bits, \inv, \numerator, \divisor
+    add     \numerator, \numerator, \neg
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+    bx      lr
+50:                                         /* tail of the 10: path for numerators with the sign bit set: umull replaces smmul */
+    umull   \bits, \inv, \numerator, \divisor
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \neg, \divisor, lsr #1
+    addcs   \divisor, \divisor, \neg, lsl #1
+    addcs   \quotient, \quotient, #2
+    cmn     \neg, \divisor
+    addcs   \quotient, \quotient, #1
+    bx      lr
+#endif
+.endm
+udiv32_arm:                                 /* unsigned udiv32_arm(unsigned a, unsigned b): a in r0, b in r1, quotient returned in r0 */
+    ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
+.L_div0:                                    /* reached only by the branch inside the macro; every other macro path ends in bx lr */
+    /* __div0 expects the calling address on the top of the stack */
+    stmdb sp!, { lr }
+    mov     r0, #0
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+    bl      __div0
+#else
+    ldr     pc, [pc, #-4]                   /* pc reads as this instruction + 8, so this loads the word below and jumps to __div0 */
+    .word   __div0
+#endif
+.L_udiv_est_table:                          /* 64 initial reciprocal estimates, indexed by (top 7 bits of the normalised divisor) - 64 */
+    .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6 /* values appear to be about 2^14 / (64 + index), capped at 0xff */
+    .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
+    .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
+    .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
+    .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
+    .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
+    .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
+    .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
+#endif
+    .size udiv32_arm, . - udiv32_arm
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
deleted file mode 100644
index c4aea14093..0000000000
--- a/apps/codecs/lib/udiv32_armv4.S
+++ /dev/null
@@ -1,134 +0,0 @@
-/***************************************************************************
- *             __________               __   ___.
- *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
- *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
- *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
- *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
- *                     \/            \/     \/    \/            \/
- * $Id$
- *
- * Copyright (C) 2008 by Jens Arnold
- * Copyright (C) 2009 by Andrew Mahone
- *
- * Optimised unsigned integer division for ARMv4
- *
- * Based on: libgcc routines for ARM cpu.
- * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
- * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
- * Free Software Foundation, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
- * KIND, either express or implied.
- *
- ****************************************************************************/
-#include "config.h"
-/* Codecs should not normally do this, but we need to check a macro, and
- * codecs.h would confuse the assembler. */
-/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
-   for dividing a 30-bit value by a 15-bit value, with two operations per
-   iteration by storing quotient and remainder together and adding the previous
-   quotient bit during trial subtraction. Modified to work with any dividend
-   and divisor both less than 1 << 30, and skipping trials by calculating bits
-   in output. */
-.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
-    mov     \bits, #1
-    /* Shift the divisor left until it aligns with the numerator. If it already
-       has the high bit set, this is fine, everything inside .rept will be
-       skipped, and the add before and adcs after will set the one-bit result
-       to zero. */
-    cmn     \divisor, \dividend, lsr #16
-    movcs   \divisor, \divisor, lsl #16
-    addcs   \bits, \bits, #16
-    cmn     \divisor, \dividend, lsr #8
-    movcs   \divisor, \divisor, lsl #8
-    addcs   \bits, \bits, #8
-    cmn     \divisor, \dividend, lsr #4
-    movcs   \divisor, \divisor, lsl #4
-    addcs   \bits, \bits, #4
-    cmn     \divisor, \dividend, lsr #2
-    movcs   \divisor, \divisor, lsl #2
-    addcs   \bits, \bits, #2
-    cmn     \divisor, \dividend, lsr #1
-    movcs   \divisor, \divisor, lsl #1
-    addcs   \bits, \bits, #1
-    adds    \result, \dividend, \divisor
-    subcc   \result, \result, \divisor
-    rsb     \curbit, \bits, #31
-    add     pc, pc, \curbit, lsl #3
-    nop
-    .rept   30
-    adcs    \result, \divisor, \result, lsl #1
-    /* Fix the remainder portion of the result. This must be done because the
-       handler for 32-bit numerators needs the remainder. */
-    subcc   \result, \result, \divisor
-    .endr
-    /* Shift remainder/quotient left one, add final quotient bit */
-    adc     \result, \result, \result
-    mov     \remainder, \result, lsr \bits
-    eor     \quotient, \result, \remainder, lsl \bits
-.endm
-#ifdef USE_IRAM
-    .section    .icode,"ax",%progbits
-#else
-    .text
-#endif
-    .align
-    .global udiv32_arm
-    .type   udiv32_arm,%function
-udiv32_arm:
-    /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
-       and add the next bit of the result. The correction code at .L_udiv32
-       does not need the divisor inverted, but can be modified to work with it,
-       and this allows the zero divisor test to be done early and without an
-       explicit comparison. */
-    rsbs    r1, r1, #0
-    beq     .L_div0
-    tst     r0, r0
-    /* High bit must be unset, otherwise shift numerator right, calculate,
-       and correct results. As this case is very uncommon we want to avoid
-       any other delays on the main path in handling it, so the long divide
-       calls the short divide as a function. */
-    bmi     .L_udiv32
-.L_udiv31:
-    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
-    bx      lr
-.L_udiv32:
-    /* store original numerator and divisor, we'll need them to correct the
-       result, */
-    stmdb   sp, { r0, r1, lr }
-    /* Call __div0 here if divisor is zero, otherwise it would report the wrong
-       address. */
-    mov     r0, r0, lsr #1
-    bl      .L_udiv31
-    ldmdb   sp, { r2, r3, lr }
-    /* Move the low bit of the original numerator to the carry bit */
-    movs    r2, r2, lsr #1
-    /* Shift the remainder left one and add in the carry bit */
-    adc     r1, r1, r1
-    /* Subtract the original divisor from the remainder, setting carry if the
-       result is non-negative */
-    adds    r1, r1, r3
-    /* Shift quotient left one and add carry bit */
-    adc     r0, r0, r0
-    bx      lr
-.L_div0:
-    /* __div0 expects the calling address on the top of the stack */
-    stmdb sp!, { lr }
-#if defined(__ARM_EABI__) || !defined(USE_IRAM)
-    bl      __div0
-#else
-    mov     lr, pc
-    bx      r3
-#endif
-    .size udiv32_arm, . - udiv32_arm
author	Andrew Mahone <andrew.mahone@gmail.com>	2010-01-28 02:28:52 +0000
committer	Andrew Mahone <andrew.mahone@gmail.com>	2010-01-28 02:28:52 +0000
commit	e76f30a57c25a3ae762fc48218e57bc46dff4410 (patch)
tree	b3ca05f49dab3bd6eb4f35af8714653515771cb0
parent	e18e8069304eefca5439d9b4e573429e2f600a2c (diff)
download	rockbox-e76f30a57c25a3ae762fc48218e57bc46dff4410.tar.gz rockbox-e76f30a57c25a3ae762fc48218e57bc46dff4410.zip

diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h index 1bbdef3d56..1beda2b9cd 100644 --- a/apps/codecs/demac/libdemac/demac_config.h +++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -57,11 +57,11 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
57	#elif defined(CPU_S5L870X)	57	#elif defined(CPU_S5L870X)
58	#define ICODE_SECTION_DEMAC_ARM .icode	58	#define ICODE_SECTION_DEMAC_ARM .icode
59	#define ICODE_ATTR_DEMAC ICODE_ATTR	59	#define ICODE_ATTR_DEMAC ICODE_ATTR
60	#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR	60	#define IBSS_ATTR_DEMAC_INSANEBUF
61	#else	61	#else
62	#define ICODE_SECTION_DEMAC_ARM .text	62	#define ICODE_SECTION_DEMAC_ARM .text
63	#define ICODE_ATTR_DEMAC	63	#define ICODE_ATTR_DEMAC
64	#define IBSS_ATTR_DEMAC_INSANEBUF IBSS_ATTR	64	#define IBSS_ATTR_DEMAC_INSANEBUF
65	#endif	65	#endif
66		66
67	#else /* !ROCKBOX */	67	#else /* !ROCKBOX */


diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES index 3a741a5c81..ffbe1af92e 100644 --- a/apps/codecs/lib/SOURCES +++ b/apps/codecs/lib/SOURCES
@@ -7,9 +7,7 @@ mdct_lookup.c
7	#ifdef CPU_ARM	7	#ifdef CPU_ARM
8	mdct_arm.S	8	mdct_arm.S
9	setjmp_arm.S	9	setjmp_arm.S
10	#if ARM_ARCH == 4	10	udiv32_arm.S
11	udiv32_armv4.S
12	#endif
13	#endif	11	#endif
14		12
15	#ifdef CPU_COLDFIRE	13	#ifdef CPU_COLDFIRE


diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h index 517264f3a5..926035f05e 100644 --- a/apps/codecs/lib/codeclib.h +++ b/apps/codecs/lib/codeclib.h
@@ -65,7 +65,7 @@ void qsort(void base, size_t nmemb, size_t size, int(compar)(const void *, con
65		65
66	extern void mdct_backward(int n, int32_t in, int32_t out);	66	extern void mdct_backward(int n, int32_t in, int32_t out);
67		67
68	#if defined(CPU_ARM) && (ARM_ARCH == 4)	68	#ifdef CPU_ARM
69	/* optimised unsigned integer division for ARMv4, in IRAM */	69	/* optimised unsigned integer division for ARMv4, in IRAM */
70	unsigned udiv32_arm(unsigned a, unsigned b);	70	unsigned udiv32_arm(unsigned a, unsigned b);
71	#define UDIV32(a, b) udiv32_arm(a, b)	71	#define UDIV32(a, b) udiv32_arm(a, b)


diff --git a/apps/codecs/lib/udiv32_arm.S b/apps/codecs/lib/udiv32_arm.S new file mode 100644 index 0000000000..c46a09be5c --- /dev/null +++ b/apps/codecs/lib/udiv32_arm.S
@@ -0,0 +1,319 @@
		1	/***************************************************************************
		2	* __________ __ ___.
		3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
		4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
		5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
		6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
		7	* \/ \/ \/ \/ \/
		8	* $Id$
		9	*
		10	* Copyright (C) 2008 by Jens Arnold
		11	* Copyright (C) 2009 by Andrew Mahone
		12	*
		13	* Optimised unsigned integer division for ARMv4
		14	*
		15	* Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
		16	* Developer's Guide
		17	* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
		18	* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
		19	* Free Software Foundation, Inc.
		20	*
		21	* This program is free software; you can redistribute it and/or
		22	* modify it under the terms of the GNU General Public License
		23	* as published by the Free Software Foundation; either version 2
		24	* of the License, or (at your option) any later version.
		25	*
		26	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
		27	* KIND, either express or implied.
		28	*
		29	****************************************************************************/
		30
		31	#include "config.h"
		32	/* Codecs should not normally do this, but we need to check a macro, and
		33	* codecs.h would confuse the assembler. */
		34
		35	#ifdef USE_IRAM
		36	#define DIV_RECIP
		37	.section .icode,"ax",%progbits
		38	#else
		39	.text
		40	#endif
		41	.align
		42	.global udiv32_arm
		43	.type udiv32_arm,%function
		44
		45	#if ARM_ARCH < 5
		46	/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
		47	for dividing a 30-bit value by a 15-bit value, with two operations per
		48	iteration by storing quotient and remainder together and adding the previous
		49	quotient bit during trial subtraction. Modified to work with any dividend
		50	and divisor both less than 1 << 30, and skipping trials by calculating bits
		51	in output. */
		52	.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
		53
		54	mov \bits, #1
		55	/* Shift the divisor left until it aligns with the numerator. If it already
		56	has the high bit set, this is fine, everything inside .rept will be
		57	skipped, and the add before and adcs after will set the one-bit result
		58	to zero. */
		59	cmn \divisor, \dividend, lsr #16
		60	movcs \divisor, \divisor, lsl #16
		61	addcs \bits, \bits, #16
		62	cmn \divisor, \dividend, lsr #8
		63	movcs \divisor, \divisor, lsl #8
		64	addcs \bits, \bits, #8
		65	cmn \divisor, \dividend, lsr #4
		66	movcs \divisor, \divisor, lsl #4
		67	addcs \bits, \bits, #4
		68	cmn \divisor, \dividend, lsr #2
		69	movcs \divisor, \divisor, lsl #2
		70	addcs \bits, \bits, #2
		71	cmn \divisor, \dividend, lsr #1
		72	movcs \divisor, \divisor, lsl #1
		73	addcs \bits, \bits, #1
		74	adds \result, \dividend, \divisor
		75	subcc \result, \result, \divisor
		76	rsb \curbit, \bits, #31
		77	add pc, pc, \curbit, lsl #3
		78	nop
		79	.rept 30
		80	adcs \result, \divisor, \result, lsl #1
		81	/* Fix the remainder portion of the result. This must be done because the
		82	handler for 32-bit numerators needs the remainder. */
		83	subcc \result, \result, \divisor
		84	.endr
		85	/* Shift remainder/quotient left one, add final quotient bit */
		86	adc \result, \result, \result
		87	mov \remainder, \result, lsr \bits
		88	eor \quotient, \result, \remainder, lsl \bits
		89	.endm
		90
		91	#ifdef CPU_PP
		92	#if CONFIG_CPU == PP5020
		93	.set recip_max, 5952
		94	#elif CONFIG_CPU == PP5002
		95	.set recip_max, 1472
		96	#else
		97	.set recip_max, 14208
		98	#endif
		99	#elif CONFIG_CPU == AS3525
		100	.set recip_max, 42752
		101	#elif CONFIG_CPU == S5L8701
		102	.set recip_max, 9600
		103	#elif CONFIG_CPU == S5L8700
		104	.set recip_max, 5504
		105	#endif
		106
		107	udiv32_arm:
		108	#ifdef DIV_RECIP
		109	cmp r1, #3
		110	bcc .L_udiv_tiny
		111	cmp r1, #recip_max
		112	bhi .L_udiv
		113	adr r3, .L_udiv_recip_table-12
		114	ldr r2, [r3, r1, lsl #2]
		115	mov r3, r0
		116	umull ip, r0, r2, r0
		117	mul r2, r0, r1
		118	cmp r3, r2
		119	bxcs lr
		120	sub r0, r0, #1
		121	bx lr
		122	.L_udiv_tiny:
		123	cmp r1, #1
		124	movhi r0, r0, lsr #1
		125	bxcs lr
		126	b .L_div0
		127	#endif
		128	.L_udiv:
		129	/* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
		130	and add the next bit of the result. The correction code at .L_udiv32
		131	does not need the divisor inverted, but can be modified to work with it,
		132	and this allows the zero divisor test to be done early and without an
		133	explicit comparison. */
		134	rsbs r1, r1, #0
		135	#ifndef DIV_RECIP
		136	beq .L_div0
		137	#endif
		138	tst r0, r0
		139	/* High bit must be unset, otherwise shift numerator right, calculate,
		140	and correct results. As this case is very uncommon we want to avoid
		141	any other delays on the main path in handling it, so the long divide
		142	calls the short divide as a function. */
		143	bmi .L_udiv32
		144	.L_udiv31:
		145	ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
		146	bx lr
		147	.L_udiv32:
		148	/* store original numerator and divisor, we'll need them to correct the
		149	result, */
		150	stmdb sp, { r0, r1, lr }
		151	/* Call __div0 here if divisor is zero, otherwise it would report the wrong
		152	address. */
		153	mov r0, r0, lsr #1
		154	bl .L_udiv31
		155	ldmdb sp, { r2, r3, lr }
		156	/* Move the low bit of the original numerator to the carry bit */
		157	movs r2, r2, lsr #1
		158	/* Shift the remainder left one and add in the carry bit */
		159	adc r1, r1, r1
		160	/* Subtract the original divisor from the remainder, setting carry if the
		161	result is non-negative */
		162	adds r1, r1, r3
		163	/* Shift quotient left one and add carry bit */
		164	adc r0, r0, r0
		165	bx lr
		166	.L_div0:
		167	/* __div0 expects the calling address on the top of the stack */
		168	stmdb sp!, { lr }
		169	mov r0, #0
		170	#if defined(__ARM_EABI__) \|\| !defined(USE_IRAM)
		171	bl __div0
		172	#else
		173	ldr pc, [pc, #-4]
		174	.word __div0
		175	#endif
		176	#ifdef DIV_RECIP
		177	.L_udiv_recip_table:
		178	.set div, 3
		179	.rept recip_max - 2
		180	.if (div - 1) & div
		181	.set q, 0x40000000 / div
		182	.set r, (0x40000000 - (q * div))<<1
		183	.set q, q << 1
		184	.if r >= div
		185	.set q, q + 1
		186	.set r, r - div
		187	.endif
		188	.set r, r << 1
		189	.set q, q << 1
		190	.if r >= div
		191	.set q, q + 1
		192	.set r, r - div
		193	.endif
		194	.set q, q + 1
		195	.else
		196	.set q, 0x40000000 / div * 4
		197	.endif
		198	.word q
		199	.set div, div+1
		200	.endr
		201	#endif
		202	.size udiv32_arm, . - udiv32_arm
		203
		204	#else
		205	.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
		206	cmp \numerator, \divisor
		207	clz \bits, \divisor
		208	bcc 30f
		209	mov \inv, \divisor, lsl \bits
		210	add \neg, pc, \inv, lsr #25
		211	cmp \inv, #1<<31
		212	ldrhib \inv, [\neg, #.L_udiv_est_table-.-64]
		213	bls 20f
		214	subs \bits, \bits, #7
		215	rsb \neg, \divisor, #0
		216	movpl \divisor, \inv, lsl \bits
		217	bmi 10f
		218	mul \inv, \divisor, \neg
		219	smlawt \divisor, \divisor, \inv, \divisor
		220	mul \inv, \divisor, \neg
		221	/* This will save a cycle on ARMv6, but does not produce a correct result
		222	if numerator sign bit is set. This case accounts for about 1 in 10^7 of
		223	divisions, done by the APE decoder, so we specialize for the more common
		224	case and handle the uncommon large-numerator separately */
		225	#if ARM_ARCH >= 6
		226	tst \numerator, \numerator
		227	smmla \divisor, \divisor, \inv, \divisor
		228	bmi 40f
		229	smmul \inv, \numerator, \divisor
		230	#else
		231	mov \bits, #0
		232	smlal \bits, \divisor, \divisor, \inv
		233	umull \bits, \inv, \numerator, \divisor
		234	#endif
		235	add \numerator, \numerator, \neg
		236	mla \divisor, \inv, \neg, \numerator
		237	mov \quotient, \inv
		238	cmn \divisor, \neg
		239	addcc \quotient, \quotient, #1
		240	addpl \quotient, \quotient, #2
		241	bx lr
		242	10:
		243	rsb \bits, \bits, #0
		244	sub \inv, \inv, #4
		245	mov \divisor, \inv, lsr \bits
		246	#if ARM_ARCH >= 6
		247	tst \numerator, \numerator
		248	smmla \divisor, \divisor, \inv, \divisor
		249	bmi 50f
		250	smmul \inv, \numerator, \divisor
		251	#else
		252	mov \bits, #0
		253	smlal \bits, \divisor, \divisor, \inv
		254	umull \bits, \inv, \numerator, \divisor
		255	#endif
		256	mla \divisor, \inv, \neg, \numerator
		257	mov \quotient, \inv
		258	cmn \neg, \divisor, lsr #1
		259	addcs \divisor, \divisor, \neg, lsl #1
		260	addcs \quotient, \quotient, #2
		261	cmn \neg, \divisor
		262	addcs \quotient, \quotient, #1
		263	bx lr
		264	20:
		265	.ifnc "", "\div0label"
		266	rsb \bits, \bits, #31
		267	bne \div0label
		268	.endif
		269	mov \quotient, \numerator, lsr \bits
		270	bx lr
		271	30:
		272	mov \quotient, #0
		273	bx lr
		274	#if ARM_ARCH >= 6
		275	40:
		276	umull \bits, \inv, \numerator, \divisor
		277	add \numerator, \numerator, \neg
		278	mla \divisor, \inv, \neg, \numerator
		279	mov \quotient, \inv
		280	cmn \divisor, \neg
		281	addcc \quotient, \quotient, #1
		282	addpl \quotient, \quotient, #2
		283	bx lr
		284	50:
		285	umull \bits, \inv, \numerator, \divisor
		286	mla \divisor, \inv, \neg, \numerator
		287	mov \quotient, \inv
		288	cmn \neg, \divisor, lsr #1
		289	addcs \divisor, \divisor, \neg, lsl #1
		290	addcs \quotient, \quotient, #2
		291	cmn \neg, \divisor
		292	addcs \quotient, \quotient, #1
		293	bx lr
		294	#endif
		295	.endm
		296
		297	udiv32_arm:
		298	ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
		299	.L_div0:
		300	/* __div0 expects the calling address on the top of the stack */
		301	stmdb sp!, { lr }
		302	mov r0, #0
		303	#if defined(__ARM_EABI__) \|\| !defined(USE_IRAM)
		304	bl __div0
		305	#else
		306	ldr pc, [pc, #-4]
		307	.word __div0
		308	#endif
		309	.L_udiv_est_table:
		310	.byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
		311	.byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
		312	.byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
		313	.byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
		314	.byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
		315	.byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
		316	.byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
		317	.byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
		318	#endif
		319	.size udiv32_arm, . - udiv32_arm


diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S deleted file mode 100644 index c4aea14093..0000000000 --- a/apps/codecs/lib/udiv32_armv4.S +++ /dev/null
@@ -1,134 +0,0 @@
1	/***************************************************************************
2	* __________ __ ___.
3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
7	* \/ \/ \/ \/ \/
8	* $Id$
9	*
10	* Copyright (C) 2008 by Jens Arnold
11	* Copyright (C) 2009 by Andrew Mahone
12	*
13	* Optimised unsigned integer division for ARMv4
14	*
15	* Based on: libgcc routines for ARM cpu.
16	* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
17	* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
18	* Free Software Foundation, Inc.
19	*
20	* This program is free software; you can redistribute it and/or
21	* modify it under the terms of the GNU General Public License
22	* as published by the Free Software Foundation; either version 2
23	* of the License, or (at your option) any later version.
24	*
25	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
26	* KIND, either express or implied.
27	*
28	****************************************************************************/
29
30	#include "config.h"
31	/* Codecs should not normally do this, but we need to check a macro, and
32	* codecs.h would confuse the assembler. */
33
34	/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
35	for dividing a 30-bit value by a 15-bit value, with two operations per
36	iteration by storing quotient and remainder together and adding the previous
37	quotient bit during trial subtraction. Modified to work with any dividend
38	and divisor both less than 1 << 30, and skipping trials by calculating bits
39	in output. */
40	.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
41	/* In: dividend, and divisor ALREADY NEGATED (udiv32_arm does rsbs on it before expanding this macro). Out: quotient and remainder. result, bits and curbit are scratch; flags are clobbered. The stack is not touched. */
42	mov \bits, #1 /* bits ends up as 1 + the total left shift applied to the divisor */
43	/* Shift the divisor left until it aligns with the numerator. If it already
44	has the high bit set, this is fine, everything inside .rept will be
45	skipped, and the add before and adcs after will set the one-bit result
46	to zero. */
47	cmn \divisor, \dividend, lsr #16 /* cmn adds: with a negated, non-zero divisor, carry set <=> (dividend >> 16) >= divisor */
48	movcs \divisor, \divisor, lsl #16 /* it fits: pre-shift the divisor by 16 ... */
49	addcs \bits, \bits, #16 /* ... and account for the shift */
50	cmn \divisor, \dividend, lsr #8 /* same test and conditional shift for 8, 4, 2 and 1 */
51	movcs \divisor, \divisor, lsl #8
52	addcs \bits, \bits, #8
53	cmn \divisor, \dividend, lsr #4
54	movcs \divisor, \divisor, lsl #4
55	addcs \bits, \bits, #4
56	cmn \divisor, \dividend, lsr #2
57	movcs \divisor, \divisor, lsl #2
58	addcs \bits, \bits, #2
59	cmn \divisor, \dividend, lsr #1
60	movcs \divisor, \divisor, lsl #1
61	addcs \bits, \bits, #1
62	adds \result, \dividend, \divisor /* first trial subtraction, done by adding the negated divisor */
63	subcc \result, \result, \divisor /* carry clear = it did not fit: undo (subcc leaves the flags alone) */
64	rsb \curbit, \bits, #31 /* number of unrolled steps to skip, so bits - 1 of the 30 steps run */
65	add pc, pc, \curbit, lsl #3 /* pc reads as this instruction + 8, i.e. the first step after the nop; each step is 2 instructions = 8 bytes */
66	nop /* filler: keeps the first unrolled step at pc + 8 of the add above */
67	.rept 30
68	adcs \result, \divisor, \result, lsl #1 /* result = 2 * result + negated divisor + carry, where carry is the previous quotient bit */
69	/* Fix the remainder portion of the result. This must be done because the
70	handler for 32-bit numerators needs the remainder. */
71	subcc \result, \result, \divisor /* undo a failed trial; carry is preserved for the next step */
72	.endr
73	/* Shift remainder/quotient left one, add final quotient bit */
74	adc \result, \result, \result
75	mov \remainder, \result, lsr \bits /* everything above the low 'bits' bits is the remainder */
76	eor \quotient, \result, \remainder, lsl \bits /* clear the remainder bits, leaving the low 'bits' bits: the quotient */
77	.endm
78
79	#ifdef USE_IRAM
80	.section .icode,"ax",%progbits
81	#else
82	.text
83	#endif
84	.align
85	.global udiv32_arm
86	.type udiv32_arm,%function
87	/* udiv32_arm: unsigned 32-bit division. In: r0 = numerator, r1 = divisor. Out: r0 = quotient. Clobbers r1-r3, ip and the flags; a zero divisor is handed to __div0. */
88	udiv32_arm:
89	/* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
90	and add the next bit of the result. The correction code at .L_udiv32
91	does not need the divisor inverted, but can be modified to work with it,
92	and this allows the zero divisor test to be done early and without an
93	explicit comparison. */
94	rsbs r1, r1, #0 /* r1 = -divisor; Z set if the divisor was zero */
95	beq .L_div0
96	tst r0, r0 /* N = top bit of the numerator */
97	/* High bit must be unset, otherwise shift numerator right, calculate,
98	and correct results. As this case is very uncommon we want to avoid
99	any other delays on the main path in handling it, so the long divide
100	calls the short divide as a function. */
101	bmi .L_udiv32
102	.L_udiv31:
103	ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1 /* r0 = quotient, r1 = remainder; r2, r3 and ip are scratch */
104	bx lr
105
106	.L_udiv32:
107	/* Store the original numerator, the negated divisor and lr just below sp
108	(no writeback); they are reloaded below to correct the result. */
109	stmdb sp, { r0, r1, lr }
110	/* The zero divisor test was already done on entry, so none is needed here.
111	Halve the numerator so that it fits the 31-bit divider. */
112	mov r0, r0, lsr #1
113	bl .L_udiv31
114	ldmdb sp, { r2, r3, lr } /* r2 = original numerator, r3 = negated divisor */
115	/* Move the low bit of the original numerator to the carry bit */
116	movs r2, r2, lsr #1
117	/* Shift the remainder left one and add in the carry bit */
118	adc r1, r1, r1
119	/* Add the negated divisor, i.e. subtract the divisor from the remainder,
120	setting carry if the result is non-negative */
121	adds r1, r1, r3
122	/* Shift quotient left one and add carry bit */
123	adc r0, r0, r0
124	bx lr
125	.L_div0:
126	/* __div0 expects the calling address on the top of the stack */
127	stmdb sp!, { lr }
128	#if defined(__ARM_EABI__) || !defined(USE_IRAM)
129	bl __div0
130	#else
131	ldr pc, [pc, #-4] /* pc reads as this instruction + 8, so this loads the word right below: jump to the absolute address of __div0; lr is untouched, the calling address is already on the stack */
132	.word __div0 /* address literal consumed by the ldr above; no register has to hold the address */
133	#endif
134	.size udiv32_arm, . - udiv32_arm