1 files changed, 54 insertions, 53 deletions
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
index 491cf43862..b54156809c 100644
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -36,11 +36,14 @@
   iteration by storing quotient and remainder together and adding the previous
   quotient bit during trial subtraction. Modified to work with any dividend
   and divisor both less than 1 << 30, and skipping trials by calculating bits
-   in output.
+   in output. */
-*/
+.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
-.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
    mov     \bits, #1
+    /* Shift the divisor left until it aligns with the numerator. If it already
+       has the high bit set, this is fine, everything inside .rept will be
+       skipped, and the add before and adcs after will set the one-bit result
+       to zero. */
    cmp     \divisor, \dividend, lsr #16
    movls   \divisor, \divisor, lsl #16
    addls   \bits, \bits, #16
@@ -56,7 +59,8 @@
    cmp     \divisor, \dividend, lsr #1
    movls   \divisor, \divisor, lsl #1
    addls   \bits, \bits, #1
-    rsb     \divisor, \divisor, #0
+    rsbs    \divisor, \divisor, #0
+    bcs     .L_div0
    adds    \result, \dividend, \divisor
    subcc   \result, \result, \divisor
    rsb     \curbit, \bits, #31
@@ -64,44 +68,14 @@
    nop
    .rept   30
    adcs    \result, \divisor, \result, lsl #1
+    /* Fix the remainder portion of the result. This must be done because the
+       handler for 32-bit numerators needs the remainder. */
    subcc   \result, \result, \divisor
    .endr
-    /* shift remainder/quotient left one, add final quotient bit */
+    /* Shift remainder/quotient left one, add final quotient bit */
    adc     \result, \result, \result
-    mov     \dividend, \result, lsr \bits
+    mov     \remainder, \result, lsr \bits
-    eor     \quotient, \result, \dividend, lsl \bits
+    eor     \quotient, \result, \remainder, lsl \bits
-.endm
-.macro ARM_DIV_32_BODY dividend, divisor, result, curbit
-    mov     \result, \dividend
-    mov     \curbit, #90          @ 3 * 30, (calculating branch dest)
-    cmp     \divisor, \result, lsr #16
-    movls   \result,\result, lsr #16
-    subls   \curbit, \curbit, #48
-    cmp     \divisor, \result, lsr #8
-    movls   \result,\result, lsr #8
-    subls   \curbit, \curbit, #24
-    cmp     \divisor, \result, lsr #4
-    movls   \result,\result, lsr #4
-    subls   \curbit, \curbit, #12
-    cmp     \divisor, \result, lsr #2
-    subls   \curbit, \curbit, #6
-    @ Calculation is only done down to shift=2, because the shift=1 step
-    @ would need 3 more cycles, but would only gain 1.5 cycles on average.
-    mov     \result, #0
-    add     pc, pc, \curbit, lsl #2
-    nop
-    .set    shift, 32
-    .rept   31
-    .set    shift, shift - 1
-    cmp     \divisor, \dividend, lsr #shift
-    orrls   \result, \result, #(1 << shift)
-    subls   \dividend, \dividend, \divisor, lsl #shift
-    .endr   @ shift==0 in the .rept would cause a warning  for lsr #0
-    cmp     \divisor, \dividend
-    orrls   \result, \result, #1
-    @subls  \dividend, \dividend, \divisor  @ correct remainder not needed
 .endm
 #ifdef USE_IRAM
@@ -114,21 +88,48 @@
    .type   udiv32_arm,%function
 udiv32_arm:
-    cmp     r1, #0
-    beq     20f
    tst     r0, r0
-    /* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of
+    /* High bit must be unset, otherwise shift numerator right, calculate,
-       divisor is also unset dividend has been tested to be >= divisor.
+       and correct results. As this case is very uncommon we want to avoid
-    */
+       any other delays on the main path in handling it, so the long divide
-    bmi     10f
+       calls the short divide as a function. */
-    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0
+    bmi     .L_udiv32
-    bx      lr
+.L_udiv31:
+    ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
-10:
-    ARM_DIV_32_BODY r0, r1, r2, r3
-    mov     r0, r2
    bx      lr
-20:
+.L_udiv32:
-    movne   r0, #0
+    /* Store the original numerator, divisor and lr just below sp (without
+       writeback); the first two are needed to correct the result. */
+    stmdb   sp, { r0, r1, lr }
+    /* Halve the numerator so its high bit is clear, then call the 31-bit
+       divider. A zero divisor branches to .L_div0 from inside the divider. */
+    mov     r0, r0, lsr #1
+    bl      .L_udiv31
+    /* This address is never a branch target, but is used to test lr before
+       calling __div0. */
+.L_udiv32_div0_trap:
+    ldmdb   sp, { r2, r3, lr }
+    /* Move the low bit of the original numerator to the carry bit */
+    movs    r2, r2, lsr #1
+    /* Shift the remainder left one and add in the carry bit */
+    adc     r1, r1, r1
+    /* Subtract the original divisor from the remainder, setting carry if the
+       result is non-negative */
+    subs    r1, r1, r3
+    /* Shift quotient left one and add carry bit */
+    adc     r0, r0, r0
    bx      lr
+.L_div0:
+    /* Check the return address, since .L_udiv32 uses bl to wrap the 31-bit
+       divider. If the return address is at .L_udiv32_div0_trap, then the
+       return address of the original caller is at sp - 4.
+    */
+    adr     r2, .L_udiv32_div0_trap
+    cmp     r2, lr
+    subeq     sp, sp, #4
+    bleq    __div0
+    /* Otherwise, push lr to the stack before calling __div0 */
+    stmdb sp!, { lr }
+    bl      __div0
+    .size udiv32_arm, . - udiv32_arm

diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S index 491cf43862..b54156809c 100644 --- a/apps/codecs/lib/udiv32_armv4.S +++ b/apps/codecs/lib/udiv32_armv4.S
@@ -36,11 +36,14 @@
36	iteration by storing quotient and remainder together and adding the previous	36	iteration by storing quotient and remainder together and adding the previous
37	quotient bit during trial subtraction. Modified to work with any dividend	37	quotient bit during trial subtraction. Modified to work with any dividend
38	and divisor both less than 1 << 30, and skipping trials by calculating bits	38	and divisor both less than 1 << 30, and skipping trials by calculating bits
39	in output.	39	in output. */
40	*/	40	.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
41	.macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient
42		41
43	mov \bits, #1	42	mov \bits, #1
		43	/* Shift the divisor left until it aligns with the numerator. If it already
		44	has the high bit set, this is fine, everything inside .rept will be
		45	skipped, and the add before and adcs after will set the one-bit result
		46	to zero. */
44	cmp \divisor, \dividend, lsr #16	47	cmp \divisor, \dividend, lsr #16
45	movls \divisor, \divisor, lsl #16	48	movls \divisor, \divisor, lsl #16
46	addls \bits, \bits, #16	49	addls \bits, \bits, #16
@@ -56,7 +59,8 @@
56	cmp \divisor, \dividend, lsr #1	59	cmp \divisor, \dividend, lsr #1
57	movls \divisor, \divisor, lsl #1	60	movls \divisor, \divisor, lsl #1
58	addls \bits, \bits, #1	61	addls \bits, \bits, #1
59	rsb \divisor, \divisor, #0	62	rsbs \divisor, \divisor, #0
		63	bcs .L_div0
60	adds \result, \dividend, \divisor	64	adds \result, \dividend, \divisor
61	subcc \result, \result, \divisor	65	subcc \result, \result, \divisor
62	rsb \curbit, \bits, #31	66	rsb \curbit, \bits, #31
@@ -64,44 +68,14 @@
64	nop	68	nop
65	.rept 30	69	.rept 30
66	adcs \result, \divisor, \result, lsl #1	70	adcs \result, \divisor, \result, lsl #1
		71	/* Fix the remainder portion of the result. This must be done because the
		72	handler for 32-bit numerators needs the remainder. */
67	subcc \result, \result, \divisor	73	subcc \result, \result, \divisor
68	.endr	74	.endr
69	/* shift remainder/quotient left one, add final quotient bit */	75	/* Shift remainder/quotient left one, add final quotient bit */
70	adc \result, \result, \result	76	adc \result, \result, \result
71	mov \dividend, \result, lsr \bits	77	mov \remainder, \result, lsr \bits
72	eor \quotient, \result, \dividend, lsl \bits	78	eor \quotient, \result, \remainder, lsl \bits
73	.endm
74
75	.macro ARM_DIV_32_BODY dividend, divisor, result, curbit
76
77	mov \result, \dividend
78	mov \curbit, #90 @ 3 * 30, (calculating branch dest)
79	cmp \divisor, \result, lsr #16
80	movls \result,\result, lsr #16
81	subls \curbit, \curbit, #48
82	cmp \divisor, \result, lsr #8
83	movls \result,\result, lsr #8
84	subls \curbit, \curbit, #24
85	cmp \divisor, \result, lsr #4
86	movls \result,\result, lsr #4
87	subls \curbit, \curbit, #12
88	cmp \divisor, \result, lsr #2
89	subls \curbit, \curbit, #6
90	@ Calculation is only done down to shift=2, because the shift=1 step
91	@ would need 3 more cycles, but would only gain 1.5 cycles on average.
92	mov \result, #0
93	add pc, pc, \curbit, lsl #2
94	nop
95	.set shift, 32
96	.rept 31
97	.set shift, shift - 1
98	cmp \divisor, \dividend, lsr #shift
99	orrls \result, \result, #(1 << shift)
100	subls \dividend, \dividend, \divisor, lsl #shift
101	.endr @ shift==0 in the .rept would cause a warning for lsr #0
102	cmp \divisor, \dividend
103	orrls \result, \result, #1
104	@subls \dividend, \dividend, \divisor @ correct remainder not needed
105	.endm	79	.endm
106		80
107	#ifdef USE_IRAM	81	#ifdef USE_IRAM
@@ -114,21 +88,48 @@
114	.type udiv32_arm,%function	88	.type udiv32_arm,%function
115		89
116	udiv32_arm:	90	udiv32_arm:
117	cmp r1, #0
118	beq 20f
119	tst r0, r0	91	tst r0, r0
120	/* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of	92	/* High bit must be unset, otherwise shift numerator right, calculate,
121	divisor is also unset dividend has been tested to be >= divisor.	93	and correct results. As this case is very uncommon we want to avoid
122	*/	94	any other delays on the main path in handling it, so the long divide
123	bmi 10f	95	calls the short divide as a function. */
124	ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0	96	bmi .L_udiv32
125	bx lr	97	.L_udiv31:
126		98	ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
127	10:
128	ARM_DIV_32_BODY r0, r1, r2, r3
129	mov r0, r2
130	bx lr	99	bx lr
131		100
132	20:	101	.L_udiv32:
133	movne r0, #0	102	/* Store the original numerator, divisor and lr just below sp (without
		103	writeback); the first two are needed to correct the result. */
		104	stmdb sp, { r0, r1, lr }
		105	/* Halve the numerator so its high bit is clear, then call the 31-bit
		106	divider. A zero divisor branches to .L_div0 from inside the divider. */
		107	mov r0, r0, lsr #1
		108	bl .L_udiv31
		109	/* This address is never a branch target, but is used to test lr before
		110	calling __div0. */
		111	.L_udiv32_div0_trap:
		112	ldmdb sp, { r2, r3, lr }
		113	/* Move the low bit of the original numerator to the carry bit */
		114	movs r2, r2, lsr #1
		115	/* Shift the remainder left one and add in the carry bit */
		116	adc r1, r1, r1
		117	/* Subtract the original divisor from the remainder, setting carry if the
		118	result is non-negative */
		119	subs r1, r1, r3
		120	/* Shift quotient left one and add carry bit */
		121	adc r0, r0, r0
134	bx lr	122	bx lr
		123	.L_div0:
		124	/* Check the return address, since .L_udiv32 uses bl to wrap the 31-bit
		125	divider. If the return address is at .L_udiv32_div0_trap, then the
		126	return address of the original caller is at sp - 4.
		127	*/
		128	adr r2, .L_udiv32_div0_trap
		129	cmp r2, lr
		130	subeq sp, sp, #4
		131	bleq __div0
		132	/* Otherwise, push lr to the stack before calling __div0 */
		133	stmdb sp!, { lr }
		134	bl __div0
		135	.size udiv32_arm, . - udiv32_arm