summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2008-11-06 21:21:33 +0000
committerJens Arnold <amiconn@rockbox.org>2008-11-06 21:21:33 +0000
commit545b51e2e46db6e9c6dd59ef45f73cc86529bace (patch)
treece3f11540b1bfb7e7ee1127bdbd3c587289b26e6
parent35823422c225336ac3788a4919a06d6cafaa3c1f (diff)
downloadrockbox-545b51e2e46db6e9c6dd59ef45f73cc86529bace.tar.gz
rockbox-545b51e2e46db6e9c6dd59ef45f73cc86529bace.zip
ARMv4 unsigned integer division: Using an overflow-safe comparison method in the main calculation allows to put back the 1.5 cyle (average) optimisation. Shaved off another instruction, as we don't need the remainder. * Use the very efficient ffs algorithm from ffs-arm.S for dividing by a power of 2.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19032 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/lib/udiv32_armv4.S55
1 files changed, 31 insertions, 24 deletions
diff --git a/apps/codecs/lib/udiv32_armv4.S b/apps/codecs/lib/udiv32_armv4.S
index 2cb6fc8088..6b34cae1b3 100644
--- a/apps/codecs/lib/udiv32_armv4.S
+++ b/apps/codecs/lib/udiv32_armv4.S
@@ -33,7 +33,7 @@
33.macro ARM_DIV_BODY dividend, divisor, result, curbit 33.macro ARM_DIV_BODY dividend, divisor, result, curbit
34 34
35 mov \result, \dividend 35 mov \result, \dividend
36 mov \curbit, #93 @ 3 * 31, (calculating branch dest) 36 mov \curbit, #90 @ 3 * 30, (calculating branch dest)
37 cmp \divisor, \result, lsr #16 37 cmp \divisor, \result, lsr #16
38 movls \result,\result, lsr #16 38 movls \result,\result, lsr #16
39 subls \curbit, \curbit, #48 39 subls \curbit, \curbit, #48
@@ -44,40 +44,35 @@
44 movls \result,\result, lsr #4 44 movls \result,\result, lsr #4
45 subls \curbit, \curbit, #12 45 subls \curbit, \curbit, #12
46 cmp \divisor, \result, lsr #2 46 cmp \divisor, \result, lsr #2
47 movls \result,\result, lsr #2
48 subls \curbit, \curbit, #6 47 subls \curbit, \curbit, #6
49 cmp \divisor, \result, lsr #1 48 @ Calculation is only done down to shift=2, because the shift=1 step
50 subls \curbit, \curbit, #3 49 @ would need 3 more cycles, but would only gain 1.5 cycles on average.
51 mov \result, #0 50 mov \result, #0
52 add pc, pc, \curbit, lsl #2 51 add pc, pc, \curbit, lsl #2
53 nop 52 nop
54 .set shift, 32 53 .set shift, 32
55 .rept 32 54 .rept 31
56 .set shift, shift - 1 55 .set shift, shift - 1
57 cmp \dividend, \divisor, lsl #shift 56 cmp \divisor, \dividend, lsr #shift
58 adc \result, \result, \result 57 orrls \result, \result, #(1 << shift)
59 subcs \dividend, \dividend, \divisor, lsl #shift 58 subls \dividend, \dividend, \divisor, lsl #shift
60 .endr 59 .endr @ shift==0 in the .rept would cause a warning for lsr #0
60 cmp \divisor, \dividend
61 orrls \result, \result, #1
62 @subls \dividend, \dividend, \divisor @ correct remainder not needed
61.endm 63.endm
62 64
63.macro ARM_DIV2_ORDER divisor, order 65.macro ARM_DIV2_ORDER divisor, order
64 66
65 cmp \divisor, #(1 << 16) 67 @ There's exactly one bit set in the divisor, so ffs() can be used
66 movhs \divisor, \divisor, lsr #16 68 @ This is the ffs algorithm devised by D.Seal and posted to
67 movhs \order, #16 69 @ comp.sys.arm on 16 Feb 1994.
68 movlo \order, #0 70 adr \order, L_ffs_table
71 orr \divisor, \divisor, \divisor, lsl #4 @ = X * 0x11
72 orr \divisor, \divisor, \divisor, lsl #6 @ = X * 0x451
73 rsb \divisor, \divisor, \divisor, lsl #16 @ = X * 0x0450fbaf
69 74
70 cmp \divisor, #(1 << 8) 75 ldrb \order, [\order, \divisor, lsr #26]
71 movhs \divisor, \divisor, lsr #8
72 addhs \order, \order, #8
73
74 cmp \divisor, #(1 << 4)
75 movhs \divisor, \divisor, lsr #4
76 addhs \order, \order, #4
77
78 cmp \divisor, #(1 << 2)
79 addhi \order, \order, #3
80 addls \order, \order, \divisor, lsr #1
81.endm 76.endm
82 77
83 78
@@ -113,3 +108,15 @@ udiv32_arm:
113 ARM_DIV2_ORDER r1, r2 108 ARM_DIV2_ORDER r1, r2
114 mov r0, r0, lsr r2 109 mov r0, r0, lsr r2
115 bx lr 110 bx lr
111
112L_ffs_table:
113 @ 0 1 2 3 4 5 6 7
114 @----------------------------------------------
115 .byte 32, 0, 1, 12, 2, 6, 0, 13 @ 0- 7
116 .byte 3, 0, 7, 0, 0, 0, 0, 14 @ 8-15
117 .byte 10, 4, 0, 0, 8, 0, 0, 25 @ 16-23
118 .byte 0, 0, 0, 0, 0, 21, 27, 15 @ 24-31
119 .byte 31, 11, 5, 0, 0, 0, 0, 0 @ 32-39
120 .byte 9, 0, 0, 24, 0, 0, 20, 26 @ 40-47
121 .byte 30, 0, 0, 0, 0, 23, 0, 19 @ 48-55
122 .byte 29, 0, 22, 18, 28, 17, 16, 0 @ 56-63