Move udiv32_arm.S into libdemac, as this divider is specialized for the APE codec and an optimized divider is already provided for general use in codeclib.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24506 a1c6a512-1295-4272-9138-f99709370657
author: Andrew Mahone <andrew.mahone@gmail.com> 2010-02-04 05:49:37 +0000
committer: Andrew Mahone <andrew.mahone@gmail.com> 2010-02-04 05:49:37 +0000
commit: 8ed7bda64cb98d491431fd130eb754c6320441a0 (patch)
tree: be9b6340aaaba3820e87a57c7c3db545c9c2cf15 /apps/codecs/demac/libdemac
parent: 7ed87517f734d7d70ab6f294735a77a65bd22e42 (diff)
download: rockbox-8ed7bda64cb98d491431fd130eb754c6320441a0.tar.gz
rockbox-8ed7bda64cb98d491431fd130eb754c6320441a0.zip
3 files changed, 311 insertions, 4 deletions
diff --git a/apps/codecs/demac/libdemac/SOURCES b/apps/codecs/demac/libdemac/SOURCES
index 5a4482376c..f9f8f217c7 100644
--- a/apps/codecs/demac/libdemac/SOURCES
+++ b/apps/codecs/demac/libdemac/SOURCES
@@ -2,6 +2,7 @@ crc.c
 predictor.c
 #ifdef CPU_ARM
 predictor-arm.S
+udiv32_arm.S
 #elif defined CPU_COLDFIRE
 predictor-cf.S
 #endif
diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h
index 13166f69ae..7388aa1059 100644
--- a/apps/codecs/demac/libdemac/demac_config.h
+++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -91,10 +91,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 /* Defaults */
-#ifndef UDIV32
-#define UDIV32(a, b) (a / b)
-#endif
 #ifndef FILTER_HISTORY_SIZE
 #define FILTER_HISTORY_SIZE 512
 #endif
@@ -109,6 +105,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
 #ifndef __ASSEMBLER__
+#if defined(CPU_ARM) && (ARM_ARCH < 5 || defined(USE_IRAM))
+/* hand-optimised unsigned division (udiv32_arm.S): ARMv4, or any ARM with IRAM */
+unsigned udiv32_arm(unsigned a, unsigned b);
+#define UDIV32(a, b) udiv32_arm(a, b)
+#else
+/* default: plain C division; arguments parenthesised so expressions expand safely */
+#define UDIV32(a, b) ((a) / (b))
+#endif
 #include <inttypes.h>
 #if FILTER_BITS == 32
 typedef int32_t filter_int;
diff --git a/apps/codecs/demac/libdemac/udiv32_arm.S b/apps/codecs/demac/libdemac/udiv32_arm.S
new file mode 100644
index 0000000000..4492492d30
--- /dev/null
+++ b/apps/codecs/demac/libdemac/udiv32_arm.S
@@ -0,0 +1,300 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ * Copyright (C) 2009 by Andrew Mahone
+ *
+ * Optimised unsigned integer division for ARMv4
+ *
+ * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
+ *           Developer's Guide
+ * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+ * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+ * Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+#include "config.h"
+/* Codecs should not normally do this, but we need to check a macro, and
+ * codecs.h would confuse the assembler. */
+#ifdef USE_IRAM
+#define DIV_RECIP
+    .section    .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .align
+    .global udiv32_arm
+    .type   udiv32_arm,%function
+#if ARM_ARCH < 5
+/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
+   for dividing a 30-bit value by a 15-bit value, with two operations per
+   iteration by storing quotient and remainder together and adding the previous
+   quotient bit during trial subtraction. Modified to work with any dividend
+   and divisor both less than 1 << 30, and skipping trials by calculating bits
+   in output. */
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder /* shift-and-subtract divide; the divisor operand is passed in negated; clobbers bits, curbit, result and flags */
+    mov     \bits, #1                       /* bits = 1 + total left shift applied to the divisor below */
+    /* Shift the divisor left until it aligns with the numerator. If it already
+       has the high bit set, this is fine, everything inside .rept will be
+       skipped, and the add before and adcs after will set the one-bit result
+       to zero. */
+    cmn     \divisor, \dividend, lsr #16    /* divisor is negative, so this add acts as a compare against dividend >> 16 */
+    movcs   \divisor, \divisor, lsl #16     /* carry out: shift the divisor up by 16 ... */
+    addcs   \bits, \bits, #16               /* ... and record the shift */
+    cmn     \divisor, \dividend, lsr #8     /* same test for a further shift of 8 */
+    movcs   \divisor, \divisor, lsl #8
+    addcs   \bits, \bits, #8
+    cmn     \divisor, \dividend, lsr #4     /* ... of 4 */
+    movcs   \divisor, \divisor, lsl #4
+    addcs   \bits, \bits, #4
+    cmn     \divisor, \dividend, lsr #2     /* ... of 2 */
+    movcs   \divisor, \divisor, lsl #2
+    addcs   \bits, \bits, #2
+    cmn     \divisor, \dividend, lsr #1     /* ... of 1 */
+    movcs   \divisor, \divisor, lsl #1
+    addcs   \bits, \bits, #1
+    adds    \result, \dividend, \divisor    /* first trial subtraction; carry = first quotient bit */
+    subcc   \result, \result, \divisor      /* no carry: undo it (flags are left as adds set them) */
+    rsb     \curbit, \bits, #31             /* curbit = 31 - bits = number of unrolled steps to skip */
+    add     pc, pc, \curbit, lsl #3         /* computed jump: 8 bytes per step; pc reads as the address just past the nop */
+    nop                                     /* padding so the jump base is the first unrolled step */
+    .rept   30                              /* 30 unrolled two-instruction steps, entered part-way by the jump above */
+    adcs    \result, \divisor, \result, lsl #1 /* result = 2 * result + previous quotient bit - shifted divisor */
+    /* Fix the remainder portion of the result. This must be done because the
+       handler for 32-bit numerators needs the remainder. */
+    subcc   \result, \result, \divisor
+    .endr
+    /* Shift remainder/quotient left one, add final quotient bit */
+    adc     \result, \result, \result
+    mov     \remainder, \result, lsr \bits  /* the remainder sits above the low, bits-wide quotient field */
+    eor     \quotient, \result, \remainder, lsl \bits /* cancel the remainder part, leaving only the quotient */
+.endm
+#ifdef CPU_PP /* recip_max: largest divisor answered from the reciprocal table; larger divisors branch to .L_udiv */
+#if CONFIG_CPU == PP5020
+.set recip_max, 8384
+#elif CONFIG_CPU == PP5002
+.set recip_max, 4608
+#else /* any other CPU_PP target */
+.set recip_max, 16384
+#endif
+#elif CONFIG_CPU == AS3525
+.set recip_max, 42752 /* the table holds recip_max - 2 words; NOTE(review): values presumably track IRAM size per SoC - confirm */
+#elif CONFIG_CPU == S5L8701
+.set recip_max, 12800
+#elif CONFIG_CPU == S5L8700
+.set recip_max, 9088
+#endif /* NOTE(review): no value is set for any other CPU; confirm every target that defines USE_IRAM here is listed */
+udiv32_arm: /* unsigned udiv32_arm(unsigned a, unsigned b): r0 = a, r1 = b; quotient returned in r0; uses r1-r3, ip, flags */
+#ifdef DIV_RECIP
+    cmp     r1, #3
+    bcc     .L_udiv_tiny                    /* divisor 0, 1 or 2: handled without the table */
+    cmp     r1, #recip_max
+    bhi     .L_udiv                         /* divisor beyond the table: generic shift-subtract divide */
+    adr     r3, .L_udiv_recip_table-12      /* table starts at divisor 3, so bias the base by 3 words */
+    ldr     r2, [r3, r1, lsl #2]            /* r2 = reciprocal word for this divisor */
+    mov     r3, r0                          /* keep the numerator for the check below */
+    umull   ip, r0, r2, r0                  /* r0 = high word of numerator * reciprocal = quotient estimate */
+    mul     r2, r0, r1                      /* r2 = estimate * divisor */
+    cmp     r3, r2
+    bxcs    lr                              /* numerator >= estimate * divisor: estimate is returned as is */
+    sub     r0, r0, #1                      /* otherwise the estimate was one too high */
+    bx      lr
+.L_udiv_tiny:
+    cmp     r1, #1
+    movhi   r0, r0, lsr #1                  /* divisor 2: halve the numerator */
+    bxcs    lr                              /* divisor 1 or 2: done (r0 is unchanged for divisor 1) */
+    b       .L_div0                         /* divisor 0 */
+#endif
+.L_udiv:
+    /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
+       and add the next bit of the result. The correction code at .L_udiv32
+       does not need the divisor inverted, but can be modified to work with it,
+       and this allows the zero divisor test to be done early and without an
+       explicit comparison. */
+    rsbs    r1, r1, #0
+#ifndef DIV_RECIP
+    beq .L_div0
+#endif
+    tst     r0, r0
+    /* High bit must be unset, otherwise shift numerator right, calculate,
+       and correct results. As this case is very uncommon we want to avoid
+       any other delays on the main path in handling it, so the long divide
+       calls the short divide as a function. */
+    bmi     .L_udiv32
+.L_udiv31: /* in: r0 = numerator with bit 31 clear, r1 = negated divisor; out: r0 = quotient, r1 = remainder */
+    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
+    bx      lr
+.L_udiv32:
+    /* Save the original numerator and the negated divisor; both are needed to
+       correct the result. */
+    stmdb   sp, { r0, r1, lr }              /* stored below sp with no writeback; NOTE(review): relies on nothing touching that area before the ldmdb - confirm */
+    /* No zero-divisor check is needed here: that case has already branched to
+       .L_div0 (via .L_udiv_tiny, or via the beq after the rsbs above). */
+    mov     r0, r0, lsr #1                  /* halve the numerator so that bit 31 is clear */
+    bl      .L_udiv31
+    ldmdb   sp, { r2, r3, lr }              /* r2 = original numerator, r3 = negated divisor, lr restored */
+    /* Move the low bit of the original numerator to the carry bit */
+    movs    r2, r2, lsr #1
+    /* Shift the remainder left one and add in the carry bit */
+    adc     r1, r1, r1
+    /* Subtract the original divisor from the remainder, setting carry if the
+       result is non-negative */
+    adds    r1, r1, r3
+    /* Shift quotient left one and add carry bit */
+    adc     r0, r0, r0
+    bx      lr
+.L_div0:
+    /* __div0 expects the calling address on the top of the stack */
+    stmdb sp!, { lr }
+    mov     r0, #0
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+    bl      __div0
+#else
+    ldr     pc, [pc, #-4]                   /* pc reads 8 bytes ahead, so this loads the .word that follows */
+    .word   __div0
+#endif
+#ifdef DIV_RECIP
+.L_udiv_recip_table: /* one 32-bit reciprocal per divisor 3..recip_max, generated at assembly time */
+    .set div, 3
+    .rept recip_max - 2
+        .if (div - 1) & div                 /* non-zero when div is not a power of two */
+            .set q, 0x40000000 / div        /* start from floor(2^30 / div) */
+            .set r, (0x40000000 - (q * div))<<1
+            .set q, q << 1
+            .if r >= div                    /* long-division step: q becomes floor(2^31 / div) */
+                .set q, q + 1
+                .set r, r - div
+            .endif
+            .set r, r << 1
+            .set q, q << 1
+            .if r >= div                    /* second step: q becomes floor(2^32 / div) */
+                .set q, q + 1
+                .set r, r - div
+            .endif
+            .set q, q + 1                   /* stored value is floor(2^32 / div) + 1 */
+        .else
+            .set q, 0x40000000 / div * 4    /* power of two: exactly 2^32 / div */
+        .endif
+        .word q
+        .set div, div+1
+    .endr
+#endif
+    .size udiv32_arm, . - udiv32_arm
+#else
+.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label /* reciprocal-based unsigned divide; every path ends in bx lr; div0label may be left empty */
+    cmp     \numerator, \divisor            /* flags for the bcc below (clz leaves them untouched) */
+    clz     \bits, \divisor                 /* bits = leading zero count of the divisor */
+    bcc     30f                             /* numerator < divisor: quotient is 0 */
+    mov     \inv, \divisor, lsl \bits       /* normalise: divisor shifted up by its leading zero count */
+    add     \neg, pc, \inv, lsr #25         /* pc reads as the address of the ldrhib below; add the top 7 bits as table index */
+    cmp     \inv, #1<<31                    /* hi: general divisor, eq: power of two, lo: divisor was 0 */
+    ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64] /* 8-bit estimate; -64 rebases the 64..127 index to 0..63 */
+    bls     20f                             /* power of two or zero divisor */
+    subs    \bits, \bits, #7                /* bits = clz - 7; negative when the divisor has fewer than 7 leading zeros */
+    rsb     \neg, \divisor, #0              /* neg = -(original divisor) */
+    movpl   \divisor, \inv, lsl \bits       /* scale the table estimate; the reciprocal now lives in the divisor register */
+    bmi     10f                             /* large divisor: separate path */
+    mul     \inv, \divisor, \neg            /* inv = low word of -(original divisor * reciprocal) */
+    smlawt  \divisor, \divisor, \inv, \divisor /* refine the reciprocal with that product (appears to be a Newton-Raphson step) */
+    mul     \inv, \divisor, \neg            /* recompute the product for the refined reciprocal */
+    /* This will save a cycle on ARMv6, but does not produce a correct result
+       if numerator sign bit is set. This case accounts for about 1 in 10^7 of
+       divisions, done by the APE decoder, so we specialize for the more common
+       case and handle the uncommon large-numerator separately */
+#if ARM_ARCH >= 6
+    tst     \numerator, \numerator          /* N = numerator sign bit, consumed by the bmi below */
+    smmla   \divisor, \divisor, \inv, \divisor /* second refinement of the reciprocal (does not touch flags) */
+    bmi     40f                             /* sign bit set: take the umull-based path */
+    smmul   \inv, \numerator, \divisor      /* inv = high word of numerator * reciprocal = quotient estimate */
+#else
+    mov     \bits, #0                       /* clear the low accumulator word for smlal */
+    smlal   \bits, \divisor, \inv, \divisor /* second refinement: reciprocal += high word of inv * reciprocal */
+    umull   \bits, \inv, \numerator, \divisor /* inv = high word of numerator * reciprocal = quotient estimate */
+#endif
+    add     \numerator, \numerator, \neg    /* numerator - original divisor */
+    mla     \divisor, \inv, \neg, \numerator /* residual = numerator - (estimate + 1) * original divisor */
+    mov     \quotient, \inv                 /* start from the estimate */
+    cmn     \divisor, \neg                  /* flags from residual - original divisor */
+    addcc   \quotient, \quotient, #1        /* estimate was 1 too low */
+    addpl   \quotient, \quotient, #2        /* estimate was 2 too low */
+    bx      lr
+10:
+    rsb     \bits, \bits, #0                /* bits = 7 - clz: right-shift amount for a large divisor */
+    sub     \inv, \inv, #4                  /* bias the table estimate downward */
+    mov     \divisor, \inv, lsr \bits       /* reciprocal = biased estimate >> bits */
+    umull   \bits, \inv, \numerator, \divisor /* inv = high word of numerator * reciprocal = quotient estimate */
+    mla     \divisor, \inv, \neg, \numerator /* remainder for that estimate: numerator - estimate * original divisor */
+    mov     \quotient, \inv
+    cmn     \neg, \divisor, lsr #1          /* carry if remainder / 2 >= original divisor */
+    addcs   \divisor, \divisor, \neg, lsl #1 /* then take 2 * divisor off the remainder ... */
+    addcs   \quotient, \quotient, #2        /* ... and add 2 to the quotient */
+    cmn     \neg, \divisor                  /* carry if one more divisor still fits */
+    addcs   \quotient, \quotient, #1
+    bx      lr
+20:
+    rsb     \bits, \bits, #31               /* bits = 31 - clz; needed by the shift below in every case, so kept outside the .ifnc; flags untouched */
+.ifnc "", "\div0label"
+    bne     \div0label                      /* still the flags of cmp inv, #1<<31: not equal here means divisor == 0 */
+.endif
+    mov     \quotient, \numerator, lsr \bits /* power-of-two divisor: divide by shifting */
+    bx      lr
+30:
+    mov     \quotient, #0
+    bx      lr
+#if ARM_ARCH >= 6
+40:
+    umull   \bits, \inv, \numerator, \divisor /* unsigned high word, valid when the numerator sign bit is set */
+    add     \numerator, \numerator, \neg    /* same correction sequence as the main path */
+    mla     \divisor, \inv, \neg, \numerator
+    mov     \quotient, \inv
+    cmn     \divisor, \neg
+    addcc   \quotient, \quotient, #1
+    addpl   \quotient, \quotient, #2
+    bx      lr
+#endif
+.endm
+udiv32_arm: /* unsigned udiv32_arm(unsigned a, unsigned b): r0 = a, r1 = b; quotient returned in r0; scratch r2, r3, ip */
+    ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
+.L_div0: /* reached only through the macro's div0label branch; every other macro path returns with bx lr */
+    /* __div0 expects the calling address on the top of the stack */
+    stmdb sp!, { lr }
+    mov     r0, #0
+#if defined(__ARM_EABI__) || !defined(USE_IRAM)
+    bl      __div0
+#else
+    ldr     pc, [pc, #-4]                   /* pc reads 8 bytes ahead, so this loads the .word that follows */
+    .word   __div0
+#endif
+.L_udiv_est_table: /* 64 one-byte reciprocal estimates, indexed by (normalised divisor >> 25) - 64 */
+    .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6 /* NOTE(review): values look like roughly 2^14 / (64 + index) - confirm */
+    .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
+    .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
+    .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
+    .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
+    .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
+    .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
+    .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
+#endif
+    .size udiv32_arm, . - udiv32_arm
author	Andrew Mahone <andrew.mahone@gmail.com>	2010-02-04 05:49:37 +0000
committer	Andrew Mahone <andrew.mahone@gmail.com>	2010-02-04 05:49:37 +0000
commit	8ed7bda64cb98d491431fd130eb754c6320441a0 (patch)
tree	be9b6340aaaba3820e87a57c7c3db545c9c2cf15 /apps/codecs/demac/libdemac
parent	7ed87517f734d7d70ab6f294735a77a65bd22e42 (diff)
download	rockbox-8ed7bda64cb98d491431fd130eb754c6320441a0.tar.gz rockbox-8ed7bda64cb98d491431fd130eb754c6320441a0.zip

diff --git a/apps/codecs/demac/libdemac/SOURCES b/apps/codecs/demac/libdemac/SOURCES index 5a4482376c..f9f8f217c7 100644 --- a/apps/codecs/demac/libdemac/SOURCES +++ b/apps/codecs/demac/libdemac/SOURCES
@@ -2,6 +2,7 @@ crc.c
2	predictor.c	2	predictor.c
3	#ifdef CPU_ARM	3	#ifdef CPU_ARM
4	predictor-arm.S	4	predictor-arm.S
		5	udiv32_arm.S
5	#elif defined CPU_COLDFIRE	6	#elif defined CPU_COLDFIRE
6	predictor-cf.S	7	predictor-cf.S
7	#endif	8	#endif


diff --git a/apps/codecs/demac/libdemac/demac_config.h b/apps/codecs/demac/libdemac/demac_config.h index 13166f69ae..7388aa1059 100644 --- a/apps/codecs/demac/libdemac/demac_config.h +++ b/apps/codecs/demac/libdemac/demac_config.h
@@ -91,10 +91,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
91		91
92	/* Defaults */	92	/* Defaults */
93		93
94	#ifndef UDIV32
95	#define UDIV32(a, b) (a / b)
96	#endif
97
98	#ifndef FILTER_HISTORY_SIZE	94	#ifndef FILTER_HISTORY_SIZE
99	#define FILTER_HISTORY_SIZE 512	95	#define FILTER_HISTORY_SIZE 512
100	#endif	96	#endif
@@ -109,6 +105,16 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
109		105
110		106
111	#ifndef __ASSEMBLER__	107	#ifndef __ASSEMBLER__
		108
		109	#if defined(CPU_ARM) && (ARM_ARCH < 5 \|\| defined(USE_IRAM))
		110	/* optimised unsigned integer division for ARMv4, in IRAM */
		111	unsigned udiv32_arm(unsigned a, unsigned b);
		112	#define UDIV32(a, b) udiv32_arm(a, b)
		113	#else
		114	/* default */
		115	#define UDIV32(a, b) (a / b)
		116	#endif
		117
112	#include <inttypes.h>	118	#include <inttypes.h>
113	#if FILTER_BITS == 32	119	#if FILTER_BITS == 32
114	typedef int32_t filter_int;	120	typedef int32_t filter_int;


diff --git a/apps/codecs/demac/libdemac/udiv32_arm.S b/apps/codecs/demac/libdemac/udiv32_arm.S new file mode 100644 index 0000000000..4492492d30 --- /dev/null +++ b/apps/codecs/demac/libdemac/udiv32_arm.S
@@ -0,0 +1,300 @@
		1	/***************************************************************************
		2	* __________ __ ___.
		3	* Open \______ \ ____ ____ \| \| _\_ \|__ _______ ___
		4	* Source \| _// _ \_/ ___\\| \|/ /\| __ \ / _ \ \/ /
		5	* Jukebox \| \| ( <_> ) \___\| < \| \_\ ( <_> > < <
		6	* Firmware \|____\|_ /\____/ \___ >__\|_ \\|___ /\____/__/\_ \
		7	* \/ \/ \/ \/ \/
		8	* $Id$
		9	*
		10	* Copyright (C) 2008 by Jens Arnold
		11	* Copyright (C) 2009 by Andrew Mahone
		12	*
		13	* Optimised unsigned integer division for ARMv4
		14	*
		15	* Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
		16	* Developer's Guide
		17	* Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
		18	* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
		19	* Free Software Foundation, Inc.
		20	*
		21	* This program is free software; you can redistribute it and/or
		22	* modify it under the terms of the GNU General Public License
		23	* as published by the Free Software Foundation; either version 2
		24	* of the License, or (at your option) any later version.
		25	*
		26	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
		27	* KIND, either express or implied.
		28	*
		29	****************************************************************************/
		30
		31	#include "config.h"
		32	/* Codecs should not normally do this, but we need to check a macro, and
		33	* codecs.h would confuse the assembler. */
		34
		35	#ifdef USE_IRAM
		36	#define DIV_RECIP
		37	.section .icode,"ax",%progbits
		38	#else
		39	.text
		40	#endif
		41	.align
		42	.global udiv32_arm
		43	.type udiv32_arm,%function
		44
		45	#if ARM_ARCH < 5
		46	/* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
		47	for dividing a 30-bit value by a 15-bit value, with two operations per
		48	iteration by storing quotient and remainder together and adding the previous
		49	quotient bit during trial subtraction. Modified to work with any dividend
		50	and divisor both less than 1 << 30, and skipping trials by calculating bits
		51	in output. */
		52	.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
		53
		54	mov \bits, #1
		55	/* Shift the divisor left until it aligns with the numerator. If it already
		56	has the high bit set, this is fine, everything inside .rept will be
		57	skipped, and the add before and adcs after will set the one-bit result
		58	to zero. */
		59	cmn \divisor, \dividend, lsr #16
		60	movcs \divisor, \divisor, lsl #16
		61	addcs \bits, \bits, #16
		62	cmn \divisor, \dividend, lsr #8
		63	movcs \divisor, \divisor, lsl #8
		64	addcs \bits, \bits, #8
		65	cmn \divisor, \dividend, lsr #4
		66	movcs \divisor, \divisor, lsl #4
		67	addcs \bits, \bits, #4
		68	cmn \divisor, \dividend, lsr #2
		69	movcs \divisor, \divisor, lsl #2
		70	addcs \bits, \bits, #2
		71	cmn \divisor, \dividend, lsr #1
		72	movcs \divisor, \divisor, lsl #1
		73	addcs \bits, \bits, #1
		74	adds \result, \dividend, \divisor
		75	subcc \result, \result, \divisor
		76	rsb \curbit, \bits, #31
		77	add pc, pc, \curbit, lsl #3
		78	nop
		79	.rept 30
		80	adcs \result, \divisor, \result, lsl #1
		81	/* Fix the remainder portion of the result. This must be done because the
		82	handler for 32-bit numerators needs the remainder. */
		83	subcc \result, \result, \divisor
		84	.endr
		85	/* Shift remainder/quotient left one, add final quotient bit */
		86	adc \result, \result, \result
		87	mov \remainder, \result, lsr \bits
		88	eor \quotient, \result, \remainder, lsl \bits
		89	.endm
		90
		91	#ifdef CPU_PP
		92	#if CONFIG_CPU == PP5020
		93	.set recip_max, 8384
		94	#elif CONFIG_CPU == PP5002
		95	.set recip_max, 4608
		96	#else
		97	.set recip_max, 16384
		98	#endif
		99	#elif CONFIG_CPU == AS3525
		100	.set recip_max, 42752
		101	#elif CONFIG_CPU == S5L8701
		102	.set recip_max, 12800
		103	#elif CONFIG_CPU == S5L8700
		104	.set recip_max, 9088
		105	#endif
		106
		107	udiv32_arm:
		108	#ifdef DIV_RECIP
		109	cmp r1, #3
		110	bcc .L_udiv_tiny
		111	cmp r1, #recip_max
		112	bhi .L_udiv
		113	adr r3, .L_udiv_recip_table-12
		114	ldr r2, [r3, r1, lsl #2]
		115	mov r3, r0
		116	umull ip, r0, r2, r0
		117	mul r2, r0, r1
		118	cmp r3, r2
		119	bxcs lr
		120	sub r0, r0, #1
		121	bx lr
		122	.L_udiv_tiny:
		123	cmp r1, #1
		124	movhi r0, r0, lsr #1
		125	bxcs lr
		126	b .L_div0
		127	#endif
		128	.L_udiv:
		129	/* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
		130	and add the next bit of the result. The correction code at .L_udiv32
		131	does not need the divisor inverted, but can be modified to work with it,
		132	and this allows the zero divisor test to be done early and without an
		133	explicit comparison. */
		134	rsbs r1, r1, #0
		135	#ifndef DIV_RECIP
		136	beq .L_div0
		137	#endif
		138	tst r0, r0
		139	/* High bit must be unset, otherwise shift numerator right, calculate,
		140	and correct results. As this case is very uncommon we want to avoid
		141	any other delays on the main path in handling it, so the long divide
		142	calls the short divide as a function. */
		143	bmi .L_udiv32
		144	.L_udiv31:
		145	ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
		146	bx lr
		147	.L_udiv32:
		148	/* store original numerator and divisor, we'll need them to correct the
		149	result. */
		150	stmdb sp, { r0, r1, lr }
		151	/* No zero-divisor check is needed here: that case has already branched to
		152	.L_div0 (via .L_udiv_tiny, or via the beq after the rsbs above). */
		153	mov r0, r0, lsr #1
		154	bl .L_udiv31
		155	ldmdb sp, { r2, r3, lr }
		156	/* Move the low bit of the original numerator to the carry bit */
		157	movs r2, r2, lsr #1
		158	/* Shift the remainder left one and add in the carry bit */
		159	adc r1, r1, r1
		160	/* Subtract the original divisor from the remainder, setting carry if the
		161	result is non-negative */
		162	adds r1, r1, r3
		163	/* Shift quotient left one and add carry bit */
		164	adc r0, r0, r0
		165	bx lr
		166	.L_div0:
		167	/* __div0 expects the calling address on the top of the stack */
		168	stmdb sp!, { lr }
		169	mov r0, #0
		170	#if defined(__ARM_EABI__) \|\| !defined(USE_IRAM)
		171	bl __div0
		172	#else
		173	ldr pc, [pc, #-4]
		174	.word __div0
		175	#endif
		176	#ifdef DIV_RECIP
		177	.L_udiv_recip_table:
		178	.set div, 3
		179	.rept recip_max - 2
		180	.if (div - 1) & div
		181	.set q, 0x40000000 / div
		182	.set r, (0x40000000 - (q * div))<<1
		183	.set q, q << 1
		184	.if r >= div
		185	.set q, q + 1
		186	.set r, r - div
		187	.endif
		188	.set r, r << 1
		189	.set q, q << 1
		190	.if r >= div
		191	.set q, q + 1
		192	.set r, r - div
		193	.endif
		194	.set q, q + 1
		195	.else
		196	.set q, 0x40000000 / div * 4
		197	.endif
		198	.word q
		199	.set div, div+1
		200	.endr
		201	#endif
		202	.size udiv32_arm, . - udiv32_arm
		203
		204	#else
		205	.macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
		206	cmp \numerator, \divisor
		207	clz \bits, \divisor
		208	bcc 30f
		209	mov \inv, \divisor, lsl \bits
		210	add \neg, pc, \inv, lsr #25
		211	cmp \inv, #1<<31
		212	ldrhib \inv, [\neg, #.L_udiv_est_table-.-64]
		213	bls 20f
		214	subs \bits, \bits, #7
		215	rsb \neg, \divisor, #0
		216	movpl \divisor, \inv, lsl \bits
		217	bmi 10f
		218	mul \inv, \divisor, \neg
		219	smlawt \divisor, \divisor, \inv, \divisor
		220	mul \inv, \divisor, \neg
		221	/* This will save a cycle on ARMv6, but does not produce a correct result
		222	if numerator sign bit is set. This case accounts for about 1 in 10^7 of
		223	divisions, done by the APE decoder, so we specialize for the more common
		224	case and handle the uncommon large-numerator separately */
		225	#if ARM_ARCH >= 6
		226	tst \numerator, \numerator
		227	smmla \divisor, \divisor, \inv, \divisor
		228	bmi 40f
		229	smmul \inv, \numerator, \divisor
		230	#else
		231	mov \bits, #0
		232	smlal \bits, \divisor, \inv, \divisor
		233	umull \bits, \inv, \numerator, \divisor
		234	#endif
		235	add \numerator, \numerator, \neg
		236	mla \divisor, \inv, \neg, \numerator
		237	mov \quotient, \inv
		238	cmn \divisor, \neg
		239	addcc \quotient, \quotient, #1
		240	addpl \quotient, \quotient, #2
		241	bx lr
		242	10:
		243	rsb \bits, \bits, #0
		244	sub \inv, \inv, #4
		245	mov \divisor, \inv, lsr \bits
		246	umull \bits, \inv, \numerator, \divisor
		247	mla \divisor, \inv, \neg, \numerator
		248	mov \quotient, \inv
		249	cmn \neg, \divisor, lsr #1
		250	addcs \divisor, \divisor, \neg, lsl #1
		251	addcs \quotient, \quotient, #2
		252	cmn \neg, \divisor
		253	addcs \quotient, \quotient, #1
		254	bx lr
		255	20:
		256	.ifnc "", "\div0label"
		257	rsb \bits, \bits, #31
		258	bne \div0label
		259	.endif
		260	mov \quotient, \numerator, lsr \bits
		261	bx lr
		262	30:
		263	mov \quotient, #0
		264	bx lr
		265	#if ARM_ARCH >= 6
		266	40:
		267	umull \bits, \inv, \numerator, \divisor
		268	add \numerator, \numerator, \neg
		269	mla \divisor, \inv, \neg, \numerator
		270	mov \quotient, \inv
		271	cmn \divisor, \neg
		272	addcc \quotient, \quotient, #1
		273	addpl \quotient, \quotient, #2
		274	bx lr
		275	#endif
		276	.endm
		277
		278	udiv32_arm:
		279	ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
		280	.L_div0:
		281	/* __div0 expects the calling address on the top of the stack */
		282	stmdb sp!, { lr }
		283	mov r0, #0
		284	#if defined(__ARM_EABI__) \|\| !defined(USE_IRAM)
		285	bl __div0
		286	#else
		287	ldr pc, [pc, #-4]
		288	.word __div0
		289	#endif
		290	.L_udiv_est_table:
		291	.byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
		292	.byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
		293	.byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
		294	.byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
		295	.byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
		296	.byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
		297	.byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
		298	.byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
		299	#endif
		300	.size udiv32_arm, . - udiv32_arm