diff options
Diffstat (limited to 'apps/codecs')
-rw-r--r-- | apps/codecs/lib/udiv32_armv4.S | 107 |
1 files changed, 54 insertions, 53 deletions
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S index 491cf43862..b54156809c 100644 --- a/apps/codecs/lib/udiv32_armv4.S +++ b/apps/codecs/lib/udiv32_armv4.S | |||
@@ -36,11 +36,14 @@ | |||
36 | iteration by storing quotient and remainder together and adding the previous | 36 | iteration by storing quotient and remainder together and adding the previous |
37 | quotient bit during trial subtraction. Modified to work with any dividend | 37 | quotient bit during trial subtraction. Modified to work with any dividend |
38 | and divisor both less than 1 << 30, and skipping trials by calculating bits | 38 | and divisor both less than 1 << 30, and skipping trials by calculating bits |
39 | in output. | 39 | in output. */ |
40 | */ | 40 | .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder |
41 | .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient | ||
42 | 41 | ||
43 | mov \bits, #1 | 42 | mov \bits, #1 |
43 | /* Shift the divisor left until it aligns with the numerator. If it already | ||
44 | has the high bit set, this is fine, everything inside .rept will be | ||
45 | skipped, and the add before and adcs after will set the one-bit result | ||
46 | to zero. */ | ||
44 | cmp \divisor, \dividend, lsr #16 | 47 | cmp \divisor, \dividend, lsr #16 |
45 | movls \divisor, \divisor, lsl #16 | 48 | movls \divisor, \divisor, lsl #16 |
46 | addls \bits, \bits, #16 | 49 | addls \bits, \bits, #16 |
@@ -56,7 +59,8 @@ | |||
56 | cmp \divisor, \dividend, lsr #1 | 59 | cmp \divisor, \dividend, lsr #1 |
57 | movls \divisor, \divisor, lsl #1 | 60 | movls \divisor, \divisor, lsl #1 |
58 | addls \bits, \bits, #1 | 61 | addls \bits, \bits, #1 |
59 | rsb \divisor, \divisor, #0 | 62 | rsbs \divisor, \divisor, #0 |
63 | bcs .L_div0 | ||
60 | adds \result, \dividend, \divisor | 64 | adds \result, \dividend, \divisor |
61 | subcc \result, \result, \divisor | 65 | subcc \result, \result, \divisor |
62 | rsb \curbit, \bits, #31 | 66 | rsb \curbit, \bits, #31 |
@@ -64,44 +68,14 @@ | |||
64 | nop | 68 | nop |
65 | .rept 30 | 69 | .rept 30 |
66 | adcs \result, \divisor, \result, lsl #1 | 70 | adcs \result, \divisor, \result, lsl #1 |
71 | /* Fix the remainder portion of the result. This must be done because the | ||
72 | handler for 32-bit numerators needs the remainder. */ | ||
67 | subcc \result, \result, \divisor | 73 | subcc \result, \result, \divisor |
68 | .endr | 74 | .endr |
69 | /* shift remainder/quotient left one, add final quotient bit */ | 75 | /* Shift remainder/quotient left one, add final quotient bit */ |
70 | adc \result, \result, \result | 76 | adc \result, \result, \result |
71 | mov \dividend, \result, lsr \bits | 77 | mov \remainder, \result, lsr \bits |
72 | eor \quotient, \result, \dividend, lsl \bits | 78 | eor \quotient, \result, \remainder, lsl \bits |
73 | .endm | ||
74 | |||
75 | .macro ARM_DIV_32_BODY dividend, divisor, result, curbit | ||
76 | |||
77 | mov \result, \dividend | ||
78 | mov \curbit, #90 @ 3 * 30, (calculating branch dest) | ||
79 | cmp \divisor, \result, lsr #16 | ||
80 | movls \result,\result, lsr #16 | ||
81 | subls \curbit, \curbit, #48 | ||
82 | cmp \divisor, \result, lsr #8 | ||
83 | movls \result,\result, lsr #8 | ||
84 | subls \curbit, \curbit, #24 | ||
85 | cmp \divisor, \result, lsr #4 | ||
86 | movls \result,\result, lsr #4 | ||
87 | subls \curbit, \curbit, #12 | ||
88 | cmp \divisor, \result, lsr #2 | ||
89 | subls \curbit, \curbit, #6 | ||
90 | @ Calculation is only done down to shift=2, because the shift=1 step | ||
91 | @ would need 3 more cycles, but would only gain 1.5 cycles on average. | ||
92 | mov \result, #0 | ||
93 | add pc, pc, \curbit, lsl #2 | ||
94 | nop | ||
95 | .set shift, 32 | ||
96 | .rept 31 | ||
97 | .set shift, shift - 1 | ||
98 | cmp \divisor, \dividend, lsr #shift | ||
99 | orrls \result, \result, #(1 << shift) | ||
100 | subls \dividend, \dividend, \divisor, lsl #shift | ||
101 | .endr @ shift==0 in the .rept would cause a warning for lsr #0 | ||
102 | cmp \divisor, \dividend | ||
103 | orrls \result, \result, #1 | ||
104 | @subls \dividend, \dividend, \divisor @ correct remainder not needed | ||
105 | .endm | 79 | .endm |
106 | 80 | ||
107 | #ifdef USE_IRAM | 81 | #ifdef USE_IRAM |
@@ -114,21 +88,48 @@ | |||
114 | .type udiv32_arm,%function | 88 | .type udiv32_arm,%function |
115 | 89 | ||
116 | udiv32_arm: | 90 | udiv32_arm: |
117 | cmp r1, #0 | ||
118 | beq 20f | ||
119 | tst r0, r0 | 91 | tst r0, r0 |
120 | /* High bit must be unset, otherwise use ARM_DIV_32_BODY. High bit of | 92 | /* High bit must be unset, otherwise shift numerator right, calculate, |
121 | divisor is also unset dividend has been tested to be >= divisor. | 93 | and correct results. As this case is very uncommon we want to avoid |
122 | */ | 94 | any other delays on the main path in handling it, so the long divide |
123 | bmi 10f | 95 | calls the short divide as a function. */ |
124 | ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0 | 96 | bmi .L_udiv32 |
125 | bx lr | 97 | .L_udiv31: |
126 | 98 | ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1 | |
127 | 10: | ||
128 | ARM_DIV_32_BODY r0, r1, r2, r3 | ||
129 | mov r0, r2 | ||
130 | bx lr | 99 | bx lr |
131 | 100 | ||
132 | 20: | 101 | .L_udiv32: |
133 | movne r0, #0 | 102 | /* store original numerator and divisor, we'll need them to correct the |
103 | result. */ | ||
104 | stmdb sp, { r0, r1, lr } | ||
105 | /* Call __div0 here if divisor is zero, otherwise it would report the wrong | ||
106 | address. */ | ||
107 | mov r0, r0, lsr #1 | ||
108 | bl .L_udiv31 | ||
109 | /* This address is never a branch target, but is used to test lr before | ||
110 | calling __div0. */ | ||
111 | .L_udiv32_div0_trap: | ||
112 | ldmdb sp, { r2, r3, lr } | ||
113 | /* Move the low bit of the original numerator to the carry bit */ | ||
114 | movs r2, r2, lsr #1 | ||
115 | /* Shift the remainder left one and add in the carry bit */ | ||
116 | adc r1, r1, r1 | ||
117 | /* Subtract the original divisor from the remainder, setting carry if the | ||
118 | result is non-negative */ | ||
119 | subs r1, r1, r3 | ||
120 | /* Shift quotient left one and add carry bit */ | ||
121 | adc r0, r0, r0 | ||
134 | bx lr | 122 | bx lr |
123 | .L_div0: | ||
124 | /* Check the return address, since .L_udiv32 uses bl to wrap the 31-bit | ||
125 | divider. If the return address is at .L_udiv32_div0_trap, then the | ||
126 | return address of the original caller is at sp - 4 | ||
127 | */ | ||
128 | adr r2, .L_udiv32_div0_trap | ||
129 | cmp r2, lr | ||
130 | subeq sp, sp, #4 | ||
131 | bleq __div0 | ||
132 | /* Otherwise, push lr to the stack before calling __div0 */ | ||
133 | stmdb sp!, { lr } | ||
134 | bl __div0 | ||
135 | .size udiv32_arm, . - udiv32_arm | ||