Further optimised (vs. libgcc) unsigned 32 bit division for ARMv4 (based on the ARMv5(+) version from libgcc), in IRAM on PP for better performance on PP5002, and put into the codeclib for possible reuse. APE -c1000 is now usable on both PP502x and PP5002 (~138% realtime, they're on par now). Gigabeat F/X should also see an APE speedup.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19009 a1c6a512-1295-4272-9138-f99709370657
author: Jens Arnold <amiconn@rockbox.org> 2008-11-05 00:10:05 +0000
committer: Jens Arnold <amiconn@rockbox.org> 2008-11-05 00:10:05 +0000
commit: fe04e40be7a26c758a82e410e58be63c1f3d571c (patch)
tree: 955b1557f3da7cd8362bc05d96302cac08a72ff2
parent: 7a835ee0c64bb941f205a2eb915cf0aaf460f1bc (diff)
download: rockbox-fe04e40be7a26c758a82e410e58be63c1f3d571c.tar.gz
rockbox-fe04e40be7a26c758a82e410e58be63c1f3d571c.zip
4 files changed, 137 insertions, 3 deletions
diff --git a/apps/codecs/demac/libdemac/rangecoding.h b/apps/codecs/demac/libdemac/rangecoding.h
index c96886e32b..645fd1ad92 100644
--- a/apps/codecs/demac/libdemac/rangecoding.h
+++ b/apps/codecs/demac/libdemac/rangecoding.h
@@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
 */
+#ifdef ROCKBOX
+#include "../lib/codeclib.h"
+/* for UDIV32() */
+#endif
+#ifndef UDIV32
+/* Fallback when no optimised division routine is available: plain C
+ * division.  Both arguments are parenthesised so that expression
+ * arguments (e.g. UDIV32(x + y, z)) expand with the intended precedence. */
+#define UDIV32(a, b)  ((a) / (b))
+#endif
 /* BITSTREAM READING FUNCTIONS */
@@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
 static inline int range_decode_culfreq(int tot_f)
 {
    range_dec_normalize();
-    rc.help = rc.range / tot_f;
+    rc.help = UDIV32(rc.range, tot_f);
-    return rc.low / rc.help;
+    return UDIV32(rc.low, rc.help);
 }
 static inline int range_decode_culshift(int shift)
 {
    range_dec_normalize();
    rc.help = rc.range >> shift;
-    return rc.low / rc.help;
+    return UDIV32(rc.low, rc.help);
 }
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index 9c6d4e7ff6..8099620098 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -5,6 +5,9 @@ codeclib.c
 mdct2.c
 #ifdef CPU_ARM
 mdct_arm.S
+#if ARM_ARCH == 4
+udiv32_armv4.S
+#endif
 #endif
 #elif defined(SIMULATOR) && defined(__APPLE__)
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 744accb8aa..477818a23d 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -57,6 +57,15 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
 extern void mdct_backward(int n, int32_t *in, int32_t *out);
+#if defined(CPU_ARM) && (ARM_ARCH == 4)
+/* optimised unsigned integer division for ARMv4 (placed in the .icode
+ * section when USE_IRAM is defined). Arguments are converted to unsigned
+ * by the prototype; a zero divisor yields 0. */
+unsigned udiv32_arm(unsigned a, unsigned b);
+#define UDIV32(a, b) udiv32_arm(a, b)
+#else
+/* default: plain C division; arguments are parenthesised so expression
+ * arguments expand with the intended precedence */
+#define UDIV32(a, b) ((a) / (b))
+#endif
 /* Various codec helper functions */
 int codec_init(void);
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
new file mode 100644
index 0000000000..a659a9eb8e
--- /dev/null
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -0,0 +1,114 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ *
+ * Optimised unsigned integer division for ARMv4
+ *
+ * Based on: libgcc routines for ARM cpu.
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+/* Codecs should not normally do this, but we need to check a macro, and
+ * codecs.h would confuse the assembler. */
+/* ARM_DIV_BODY dividend, divisor, result, curbit
+ *
+ * Unrolled shift-and-subtract unsigned 32 bit division.
+ *   In:   dividend, divisor (divisor must be non-zero)
+ *   Out:  result   = quotient
+ *         dividend = remainder
+ *   Clobbers: curbit and the flags. The divisor register is only read.
+ *
+ * A binary search (steps of 16, 8, 4, 2, 1) first finds the largest t for
+ * which (divisor << t) <= dividend. The computed branch then skips the
+ * 31 - t unrolled steps that could only produce leading zero bits, so the
+ * first step executed uses exactly shift = t.
+ *
+ * The search must include the shift=1 step. Stopping at shift=2 makes the
+ * unrolled code start at shift = t + 1 in some cases, and when the dividend
+ * is above 2^31 the operand "divisor, lsl #(t + 1)" can lose its top bit,
+ * which makes the compare succeed and sets a wrong quotient bit
+ * (e.g. 0xFFFFFFFF / 3). */
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+    mov     \result, \dividend
+    mov     \curbit, #93          @ 3 * 31, (calculating branch dest)
+    cmp     \divisor, \result, lsr #16
+    movls   \result,\result, lsr #16
+    subls   \curbit, \curbit, #48
+    cmp     \divisor, \result, lsr #8
+    movls   \result,\result, lsr #8
+    subls   \curbit, \curbit, #24
+    cmp     \divisor, \result, lsr #4
+    movls   \result,\result, lsr #4
+    subls   \curbit, \curbit, #12
+    cmp     \divisor, \result, lsr #2
+    movls   \result,\result, lsr #2
+    subls   \curbit, \curbit, #6
+    cmp     \divisor, \result, lsr #1
+    subls   \curbit, \curbit, #3
+    @ curbit now holds the number of instructions to skip: 3 per unrolled
+    @ step, (31 - t) steps
+    mov     \result, #0
+    @ pc reads as the address of the add plus 8, i.e. the first unrolled
+    @ step behind the nop; lsl 2 converts instructions to bytes
+    add     pc, pc, \curbit, lsl #2
+    nop
+    .set    shift, 32
+    .rept   32
+    .set    shift, shift - 1
+    cmp     \dividend, \divisor, lsl #shift
+    adc     \result, \result, \result
+    subcs   \dividend, \dividend, \divisor, lsl #shift
+    .endr
+.endm
+/* ARM_DIV2_ORDER divisor, order
+ *
+ * For a divisor that is a power of two, computes order = log2(divisor)
+ * (udiv32_arm only expands this after its "tst r1, r2" power-of-two
+ * check). The divisor register is shifted down while searching, and the
+ * flags are changed. */
+.macro ARM_DIV2_ORDER divisor, order
+    cmp     \divisor, #(1 << 16)
+    movhs   \divisor, \divisor, lsr #16   @ set bit is in the upper half
+    movhs   \order, #16
+    movlo   \order, #0
+    cmp     \divisor, #(1 << 8)
+    movhs   \divisor, \divisor, lsr #8
+    addhs   \order, \order, #8
+    cmp     \divisor, #(1 << 4)
+    movhs   \divisor, \divisor, lsr #4
+    addhs   \order, \order, #4
+    cmp     \divisor, #(1 << 2)           @ divisor is below 16 here
+    addhi   \order, \order, #3            @ above 4: for a power of two, 8
+    addls   \order, \order, \divisor, lsr #1  @ 1, 2 or 4 adds 0, 1 or 2
+.endm
+/* unsigned udiv32_arm(unsigned a, unsigned b)
+ *
+ * Unsigned 32 bit division.
+ *   In:   r0 = a (dividend), r1 = b (divisor)
+ *   Out:  r0 = quotient; 0 is returned when b == 0
+ *   May clobber: r1, r2, r3 and the flags
+ *
+ * Cheap special cases (b == 1, b == 0, a <= b, b a power of two) are
+ * handled first; every other input goes through ARM_DIV_BODY.
+ * The code is emitted into the .icode section when USE_IRAM is defined,
+ * otherwise into .text. */
+#ifdef USE_IRAM
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .align
+    .global udiv32_arm
+    .type   udiv32_arm,%function
+udiv32_arm:
+    subs    r2, r1, #1            @ r2 = b - 1; Z set if b == 1, C clear if b == 0
+    bxeq    lr                    @ b == 1: r0 already holds a
+    bcc     20f                   @ b == 0: Z is clear, so the movne at 20 returns 0
+    cmp     r0, r1
+    bls     10f                   @ a <= b: quotient is 1 or 0
+    tst     r1, r2                @ b & (b - 1) is zero only for a power of two
+    beq     30f
+    @ general case: quotient is produced in r2, r3 is scratch
+    ARM_DIV_BODY r0, r1, r2, r3
+    mov     r0, r2
+    bx      lr
+10:
+    moveq   r0, #1                @ a == b
+20:
+    movne   r0, #0                @ a < b, or b == 0
+    bx      lr
+30:
+    @ power-of-two divisor: r2 = log2(b), then divide by shifting
+    ARM_DIV2_ORDER r1, r2
+    mov     r0, r0, lsr r2
+    bx      lr
author	Jens Arnold <amiconn@rockbox.org>	2008-11-05 00:10:05 +0000
committer	Jens Arnold <amiconn@rockbox.org>	2008-11-05 00:10:05 +0000
commit	fe04e40be7a26c758a82e410e58be63c1f3d571c (patch)
tree	955b1557f3da7cd8362bc05d96302cac08a72ff2
parent	7a835ee0c64bb941f205a2eb915cf0aaf460f1bc (diff)
download	rockbox-fe04e40be7a26c758a82e410e58be63c1f3d571c.tar.gz rockbox-fe04e40be7a26c758a82e410e58be63c1f3d571c.zip

diff --git a/apps/codecs/demac/libdemac/rangecoding.h b/apps/codecs/demac/libdemac/rangecoding.h index c96886e32b..645fd1ad92 100644 --- a/apps/codecs/demac/libdemac/rangecoding.h +++ b/apps/codecs/demac/libdemac/rangecoding.h
@@ -49,6 +49,14 @@ removing the rc parameter from each function (and the RNGC macro)).
49		49
50	*/	50	*/
51		51
		52	#ifdef ROCKBOX
		53	#include "../lib/codeclib.h"
		54	/* for UDIV32() */
		55	#endif
		56
		57	#ifndef UDIV32
		58	#define UDIV32(a, b) (a / b)
		59	#endif
52		60
53	/* BITSTREAM READING FUNCTIONS */	61	/* BITSTREAM READING FUNCTIONS */
54		62
@@ -121,15 +129,15 @@ static inline void range_dec_normalize(void)
121	static inline int range_decode_culfreq(int tot_f)	129	static inline int range_decode_culfreq(int tot_f)
122	{	130	{
123	range_dec_normalize();	131	range_dec_normalize();
124	rc.help = rc.range / tot_f;	132	rc.help = UDIV32(rc.range, tot_f);
125	return rc.low / rc.help;	133	return UDIV32(rc.low, rc.help);
126	}	134	}
127		135
128	static inline int range_decode_culshift(int shift)	136	static inline int range_decode_culshift(int shift)
129	{	137	{
130	range_dec_normalize();	138	range_dec_normalize();
131	rc.help = rc.range >> shift;	139	rc.help = rc.range >> shift;
132	return rc.low / rc.help;	140	return UDIV32(rc.low, rc.help);
133	}	141	}
134		142
135		143


diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES index 9c6d4e7ff6..8099620098 100644 --- a/apps/codecs/lib/SOURCES +++ b/apps/codecs/lib/SOURCES
@@ -5,6 +5,9 @@ codeclib.c
5	mdct2.c	5	mdct2.c
6	#ifdef CPU_ARM	6	#ifdef CPU_ARM
7	mdct_arm.S	7	mdct_arm.S
		8	#if ARM_ARCH == 4
		9	udiv32_armv4.S
		10	#endif
8	#endif	11	#endif
9		12
10	#elif defined(SIMULATOR) && defined(__APPLE__)	13	#elif defined(SIMULATOR) && defined(__APPLE__)


diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h index 744accb8aa..477818a23d 100644 --- a/apps/codecs/lib/codeclib.h +++ b/apps/codecs/lib/codeclib.h
@@ -57,6 +57,15 @@ void qsort(void base, size_t nmemb, size_t size, int(compar)(const void *, con
57		57
58	extern void mdct_backward(int n, int32_t in, int32_t out);	58	extern void mdct_backward(int n, int32_t in, int32_t out);
59		59
		60	#if defined(CPU_ARM) && (ARM_ARCH == 4)
		61	/* optimised unsigned integer division for ARMv4, in IRAM */
		62	unsigned udiv32_arm(unsigned a, unsigned b);
		63	#define UDIV32(a, b) udiv32_arm(a, b)
		64	#else
		65	/* default */
		66	#define UDIV32(a, b) (a / b)
		67	#endif
		68
60	/* Various codec helper functions */	69	/* Various codec helper functions */
61		70
62	int codec_init(void);	71	int codec_init(void);


diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S new file mode 100644 index 0000000000..a659a9eb8e --- /dev/null +++ b/apps/codecs/lib/udiv32_armv4.S
@@ -0,0 +1,114 @@
		1	/***************************************************************************
		2	* __________ __ ___.
		3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
		4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
		5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
		6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
		7	* \/ \/ \/ \/ \/
		8	* $Id$
		9	*
		10	* Copyright (C) 2008 by Jens Arnold
		11	*
		12	* Optimised unsigned integer division for ARMv4
		13	*
		14	* Based on: libgcc routines for ARM cpu.
		15	* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
		16	* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
		17	* Free Software Foundation, Inc.
		18	*
		19	* This program is free software; you can redistribute it and/or
		20	* modify it under the terms of the GNU General Public License
		21	* as published by the Free Software Foundation; either version 2
		22	* of the License, or (at your option) any later version.
		23	*
		24	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
		25	* KIND, either express or implied.
		26	*
		27	****************************************************************************/
		28
		29	#include "config.h"
		30	/* Codecs should not normally do this, but we need to check a macro, and
		31	* codecs.h would confuse the assembler. */
		32
		33	.macro ARM_DIV_BODY dividend, divisor, result, curbit
		34
		35	mov \result, \dividend
		36	mov \curbit, #90 @ 3 * 30, (calculating branch dest)
		37	cmp \divisor, \result, lsr #16
		38	movls \result,\result, lsr #16
		39	subls \curbit, \curbit, #48
		40	cmp \divisor, \result, lsr #8
		41	movls \result,\result, lsr #8
		42	subls \curbit, \curbit, #24
		43	cmp \divisor, \result, lsr #4
		44	movls \result,\result, lsr #4
		45	subls \curbit, \curbit, #12
		46	cmp \divisor, \result, lsr #2
		47	subls \curbit, \curbit, #6
		48	@ calculation is only done down to shift=2, because the shift=1 step
		49	@ would need 3 more cycles, but would only gain 1.5 cycles on average
		50	mov \result, #0
		51	add pc, pc, \curbit, lsl #2
		52	nop
		53	.set shift, 32
		54	.rept 32
		55	.set shift, shift - 1
		56	cmp \dividend, \divisor, lsl #shift
		57	adc \result, \result, \result
		58	subcs \dividend, \dividend, \divisor, lsl #shift
		59	.endr
		60	.endm
		61
		62	.macro ARM_DIV2_ORDER divisor, order
		63
		64	cmp \divisor, #(1 << 16)
		65	movhs \divisor, \divisor, lsr #16
		66	movhs \order, #16
		67	movlo \order, #0
		68
		69	cmp \divisor, #(1 << 8)
		70	movhs \divisor, \divisor, lsr #8
		71	addhs \order, \order, #8
		72
		73	cmp \divisor, #(1 << 4)
		74	movhs \divisor, \divisor, lsr #4
		75	addhs \order, \order, #4
		76
		77	cmp \divisor, #(1 << 2)
		78	addhi \order, \order, #3
		79	addls \order, \order, \divisor, lsr #1
		80	.endm
		81
		82
		83	#ifdef USE_IRAM
		84	.section .icode,"ax",%progbits
		85	#else
		86	.text
		87	#endif
		88	.align
		89	.global udiv32_arm
		90	.type udiv32_arm,%function
		91
		92	udiv32_arm:
		93	subs r2, r1, #1
		94	bxeq lr
		95	bcc 20f
		96	cmp r0, r1
		97	bls 10f
		98	tst r1, r2
		99	beq 30f
		100
		101	ARM_DIV_BODY r0, r1, r2, r3
		102	mov r0, r2
		103	bx lr
		104
		105	10:
		106	moveq r0, #1
		107	20:
		108	movne r0, #0
		109	bx lr
		110
		111	30:
		112	ARM_DIV2_ORDER r1, r2
		113	mov r0, r0, lsr r2
		114	bx lr