summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jens Arnold <amiconn@rockbox.org> 2010-05-02 12:13:26 +0000
committer Jens Arnold <amiconn@rockbox.org> 2010-05-02 12:13:26 +0000
commit f2759305a9ccf5bf987548a7dedd48b428b34fdb (patch)
tree 7f922bfcb5f152aea0e5e73efda1cf5082e0051a
parent 9de9b9dfbe6b00c63e44e5497718ae4fc77b8ffe (diff)
download rockbox-f2759305a9ccf5bf987548a7dedd48b428b34fdb.tar.gz
rockbox-f2759305a9ccf5bf987548a7dedd48b428b34fdb.zip
Gigabeat S: Reduce stalling in the ARMv6 IDCT. Also save one instruction per loop, and fix comments. Speeds up fullscreen video decoding by about 5% (excluding video output). Still not perfect...
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25775 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/plugins/mpegplayer/idct_armv6.S 102
1 files changed, 52 insertions, 50 deletions
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S
index fbffa4dfa9..dc53cbd7bd 100644
--- a/apps/plugins/mpegplayer/idct_armv6.S
+++ b/apps/plugins/mpegplayer/idct_armv6.S
@@ -39,42 +39,44 @@
39 ldrd r4, L_W1357 @ load W1, W3, W5, W7 39 ldrd r4, L_W1357 @ load W1, W3, W5, W7
40 40
41 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 41 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
42 smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7 42 smultt r7, r5, r10 @ -b1 = W7 * f3
43 smulbt r8, r4, r10 @ -b2 = W1 * f3
43 44
44 smultt r7, r5, r10 @ b1 = -W7 * f3 45 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
45 smlabb r7, r4, r11, r7 @ + -W1 * f5 46 smlabb r7, r4, r11, r7 @ -b1 += W1 * f5
46 smlabt r7, r5, r11, r7 @ + -W5 * f7 47 rsb r8, r8, #0 @ b2 = -b2
47 rsb r7, r7, #0 48 smlabb r8, r5, r10, r8 @ b2 += W5 * f1
48 smlatb r7, r4, r10, r7 @ + W3 * f1
49 49
50 smulbt r8, r4, r10 @ b2 = -W1 * f3 50 smlad r6, r5, r11, r6 @ b0 += W5 * f5 + W7 * f7
51 rsb r8, r8, #0 51 smlabt r7, r5, r11, r7 @ -b1 += W5 * f7
52 smlabb r8, r5, r10, r8 @ + W5 * f1 52 smlatb r8, r5, r11, r8 @ b2 += W7 * f5
53 smlatb r8, r5, r11, r8 @ + W7 * f5
54 smlatt r8, r4, r11, r8 @ + W3 * f7
55 53
56 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 54 smlsdx r9, r11, r4, r9 @ b3 += f5 * W3 - f7 * W1
57 smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1 55 rsb r7, r7, #0 @ b1 = -b1
56 smlatb r7, r4, r10, r7 @ b1 += W3 * f1
57 smlatt r8, r4, r11, r8 @ b2 += W3 * f7
58 58
59 ldrd r4, L_W0246 @ load W0, W2, W4, W6 59 ldrd r4, L_W0246 @ load W0, W2, W4, W6
60 add r2, r2, #1 @ f0 += 1 60 add r2, r2, #1 @ f0 += 1
61 61
62 smulbb r10, r4, r2 @ a0' = W0 * f0 62 smulbb r10, r5, r3 @ a0' = W4 * f4
63 smlabb r10, r5, r3, r10 @ + W4 * f4 63 smultt r12, r5, r3 @ a3' = W6 * f6
64 smultt r12, r4, r2 @ a3' = W2 * f2 64 smultt r3, r4, r3 @ -a2' = W2 * f6
65 smlatt r12, r5, r3, r12 @ + W6 * f6 65
66 rsb r11, r10, #0 @ a1' = -W4 * f4
67 smlabb r10, r4, r2, r10 @ a0' += W0 * f0
68 smlabb r11, r4, r2, r11 @ a1' += W0 * f0
69 smlatt r12, r4, r2, r12 @ a3' += W2 * f2
70 rsb r3, r3, #0 @ a2' = -a2'
71 smlatt r3, r5, r2, r3 @ a2' += W6 * f2
72
66 add r10, r10, r12 @ a0 = a0' + a3' 73 add r10, r10, r12 @ a0 = a0' + a3'
67 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' 74 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
68
69 smulbb r11, r5, r3 @ a1' = -W4 * f4
70 rsb r11, r11, #0
71 smlabb r11, r4, r2, r11 @ + W0 * f0
72 smultt r3, r4, r3 @ a2' = -W2 * f6
73 rsb r3, r3, #0
74 smlatt r3, r5, r2, r3 @ + W6 * f2
75 add r11, r11, r3 @ a1 = a1' + a2' 75 add r11, r11, r3 @ a1 = a1' + a2'
76 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' 76 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
77 77
78 subs r14, r14, #1 @ decrease loop count
79
78 @ Special store order for making the column pass calculate columns in 80 @ Special store order for making the column pass calculate columns in
79 @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages. 81 @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages.
80 sub r2, r10, r6 @ block[7] = (a0 - b0) 82 sub r2, r10, r6 @ block[7] = (a0 - b0)
@@ -102,7 +104,6 @@
102 mov r2, r2, asr #12 @ >> 12 104 mov r2, r2, asr #12 @ >> 12
103 strh r2, [r1], #2 @ advance to next temp column 105 strh r2, [r1], #2 @ advance to next temp column
104 106
105 subs r14, r14, #1
106 bne .row_loop 107 bne .row_loop
107 b .col_start 108 b .col_start
108 109
@@ -129,42 +130,44 @@ L_W0246:
129 ldrd r4, L_W1357 @ load W1, W3, W5, W7 130 ldrd r4, L_W1357 @ load W1, W3, W5, W7
130 131
131 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 132 smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3
132 smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7 133 smultt r7, r5, r10 @ -b1 = W7 * f3
134 smulbt r8, r4, r10 @ -b2 = W1 * f3
133 135
134 smultt r7, r5, r10 @ b1 = -W7 * f3 136 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5
135 smlabb r7, r4, r11, r7 @ + -W1 * f5 137 smlabb r7, r4, r11, r7 @ -b1 += W1 * f5
136 smlabt r7, r5, r11, r7 @ + -W5 * f7 138 rsb r8, r8, #0 @ b2 = -b2
137 rsb r7, r7, #0 139 smlabb r8, r5, r10, r8 @ b2 += W5 * f1
138 smlatb r7, r4, r10, r7 @ + W3 * f1
139 140
140 smulbt r8, r4, r10 @ b2 = -W1 * f3 141 smlad r6, r5, r11, r6 @ b0 += W5 * f5 + W7 * f7
141 rsb r8, r8, #0 142 smlabt r7, r5, r11, r7 @ -b1 += W5 * f7
142 smlabb r8, r5, r10, r8 @ + W5 * f1 143 smlatb r8, r5, r11, r8 @ b2 += W7 * f5
143 smlatb r8, r5, r11, r8 @ + W7 * f5
144 smlatt r8, r4, r11, r8 @ + W3 * f7
145 144
146 smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 145 smlsdx r9, r11, r4, r9 @ b3 += f5 * W3 - f7 * W1
147 smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1 146 rsb r7, r7, #0 @ b1 = -b1
147 smlatb r7, r4, r10, r7 @ b1 += W3 * f1
148 smlatt r8, r4, r11, r8 @ b2 += W3 * f7
148 149
149 ldrd r4, L_W0246 @ load W0, W2, W4, W6 150 ldrd r4, L_W0246 @ load W0, W2, W4, W6
150 add r2, r2, #32 @ DC offset: 0.5 151 add r2, r2, #32 @ DC offset: 0.5
151 152
152 smulbb r10, r4, r2 @ a0' = W0 * f0 153 smulbb r10, r5, r3 @ a0' = W4 * f4
153 smlabb r10, r5, r3, r10 @ + W4 * f4 154 smultt r12, r5, r3 @ a3' = W6 * f6
154 smultt r12, r4, r2 @ a3' = W2 * f2 155 smultt r3, r4, r3 @ -a2' = W2 * f6
155 smlatt r12, r5, r3, r12 @ + W6 * f6 156
157 rsb r11, r10, #0 @ a1' = -W4 * f4
158 smlabb r10, r4, r2, r10 @ a0' += W0 * f0
159 smlabb r11, r4, r2, r11 @ a1' += W0 * f0
160 smlatt r12, r4, r2, r12 @ a3' += W2 * f2
161 rsb r3, r3, #0 @ a2' = -a2'
162 smlatt r3, r5, r2, r3 @ a2' += W6 * f2
163
156 add r10, r10, r12 @ a0 = a0' + a3' 164 add r10, r10, r12 @ a0 = a0' + a3'
157 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' 165 sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3'
158
159 smulbb r11, r5, r3 @ a1' = -W4 * f4
160 rsb r11, r11, #0
161 smlabb r11, r4, r2, r11 @ + W0 * f0
162 smultt r3, r4, r3 @ a2' = -W2 * f6
163 rsb r3, r3, #0
164 smlatt r3, r5, r2, r3 @ + W6 * f2
165 add r11, r11, r3 @ a1 = a1' + a2' 166 add r11, r11, r3 @ a1 = a1' + a2'
166 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' 167 sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2'
167 168
169 subs r14, r14, #1 @ decrease loop count
170
168 sub r2, r10, r6 @ block[7] = (a0 - b0) 171 sub r2, r10, r6 @ block[7] = (a0 - b0)
169 mov r2, r2, asr #17 @ >> 17 172 mov r2, r2, asr #17 @ >> 17
170 strh r2, [r1, #7*16] 173 strh r2, [r1, #7*16]
@@ -190,7 +193,6 @@ L_W0246:
190 mov r2, r2, asr #17 @ >> 17 193 mov r2, r2, asr #17 @ >> 17
191 strh r2, [r1], #2 @ advance to next column 194 strh r2, [r1], #2 @ advance to next column
192 195
193 subs r14, r14, #1
194 bne .col_loop 196 bne .col_loop
195 197
196 sub r0, r0, #256 @ point r0 back to the input block 198 sub r0, r0, #256 @ point r0 back to the input block