diff options
author | Jens Arnold <amiconn@rockbox.org> | 2010-05-02 12:13:26 +0000 |
---|---|---|
committer | Jens Arnold <amiconn@rockbox.org> | 2010-05-02 12:13:26 +0000 |
commit | f2759305a9ccf5bf987548a7dedd48b428b34fdb (patch) | |
tree | 7f922bfcb5f152aea0e5e73efda1cf5082e0051a /apps/plugins | |
parent | 9de9b9dfbe6b00c63e44e5497718ae4fc77b8ffe (diff) | |
download | rockbox-f2759305a9ccf5bf987548a7dedd48b428b34fdb.tar.gz rockbox-f2759305a9ccf5bf987548a7dedd48b428b34fdb.zip |
Gigabeat S: Reduce stalling in the ARMv6 IDCT. Also save one instruction per loop, and fix comments. Speeds up fullscreen video decoding by about 5% (excluding video output). Still not perfect...
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25775 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'apps/plugins')
-rw-r--r-- | apps/plugins/mpegplayer/idct_armv6.S | 102 |
1 file changed, 52 insertions, 50 deletions
diff --git a/apps/plugins/mpegplayer/idct_armv6.S b/apps/plugins/mpegplayer/idct_armv6.S index fbffa4dfa9..dc53cbd7bd 100644 --- a/apps/plugins/mpegplayer/idct_armv6.S +++ b/apps/plugins/mpegplayer/idct_armv6.S | |||
@@ -39,42 +39,44 @@ | |||
39 | ldrd r4, L_W1357 @ load W1, W3, W5, W7 | 39 | ldrd r4, L_W1357 @ load W1, W3, W5, W7 |
40 | 40 | ||
41 | smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 | 41 | smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 |
42 | smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7 | 42 | smultt r7, r5, r10 @ -b1 = W7 * f3 |
43 | smulbt r8, r4, r10 @ -b2 = W1 * f3 | ||
43 | 44 | ||
44 | smultt r7, r5, r10 @ b1 = -W7 * f3 | 45 | smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 |
45 | smlabb r7, r4, r11, r7 @ + -W1 * f5 | 46 | smlabb r7, r4, r11, r7 @ -b1 += W1 * f5 |
46 | smlabt r7, r5, r11, r7 @ + -W5 * f7 | 47 | rsb r8, r8, #0 @ b2 = -b2 |
47 | rsb r7, r7, #0 | 48 | smlabb r8, r5, r10, r8 @ b2 += W5 * f1 |
48 | smlatb r7, r4, r10, r7 @ + W3 * f1 | ||
49 | 49 | ||
50 | smulbt r8, r4, r10 @ b2 = -W1 * f3 | 50 | smlad r6, r5, r11, r6 @ b0 += W5 * f5 + W7 * f7 |
51 | rsb r8, r8, #0 | 51 | smlabt r7, r5, r11, r7 @ -b1 += W5 * f7 |
52 | smlabb r8, r5, r10, r8 @ + W5 * f1 | 52 | smlatb r8, r5, r11, r8 @ b2 += W7 * f5 |
53 | smlatb r8, r5, r11, r8 @ + W7 * f5 | ||
54 | smlatt r8, r4, r11, r8 @ + W3 * f7 | ||
55 | 53 | ||
56 | smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 | 54 | smlsdx r9, r11, r4, r9 @ b3 += f5 * W3 - f7 * W1 |
57 | smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1 | 55 | rsb r7, r7, #0 @ b1 = -b1 |
56 | smlatb r7, r4, r10, r7 @ b1 += W3 * f1 | ||
57 | smlatt r8, r4, r11, r8 @ b2 += W3 * f7 | ||
58 | 58 | ||
59 | ldrd r4, L_W0246 @ load W0, W2, W4, W6 | 59 | ldrd r4, L_W0246 @ load W0, W2, W4, W6 |
60 | add r2, r2, #1 @ f0 += 1 | 60 | add r2, r2, #1 @ f0 += 1 |
61 | 61 | ||
62 | smulbb r10, r4, r2 @ a0' = W0 * f0 | 62 | smulbb r10, r5, r3 @ a0' = W4 * f4 |
63 | smlabb r10, r5, r3, r10 @ + W4 * f4 | 63 | smultt r12, r5, r3 @ a3' = W6 * f6 |
64 | smultt r12, r4, r2 @ a3' = W2 * f2 | 64 | smultt r3, r4, r3 @ -a2' = W2 * f6 |
65 | smlatt r12, r5, r3, r12 @ + W6 * f6 | 65 | |
66 | rsb r11, r10, #0 @ a1' = -W4 * f4 | ||
67 | smlabb r10, r4, r2, r10 @ a0' += W0 * f0 | ||
68 | smlabb r11, r4, r2, r11 @ a1' += W0 * f0 | ||
69 | smlatt r12, r4, r2, r12 @ a3' += W2 * f2 | ||
70 | rsb r3, r3, #0 @ a2' = -a2' | ||
71 | smlatt r3, r5, r2, r3 @ a2' += W6 * f2 | ||
72 | |||
66 | add r10, r10, r12 @ a0 = a0' + a3' | 73 | add r10, r10, r12 @ a0 = a0' + a3' |
67 | sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' | 74 | sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' |
68 | |||
69 | smulbb r11, r5, r3 @ a1' = -W4 * f4 | ||
70 | rsb r11, r11, #0 | ||
71 | smlabb r11, r4, r2, r11 @ + W0 * f0 | ||
72 | smultt r3, r4, r3 @ a2' = -W2 * f6 | ||
73 | rsb r3, r3, #0 | ||
74 | smlatt r3, r5, r2, r3 @ + W6 * f2 | ||
75 | add r11, r11, r3 @ a1 = a1' + a2' | 75 | add r11, r11, r3 @ a1 = a1' + a2' |
76 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' | 76 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' |
77 | 77 | ||
78 | subs r14, r14, #1 @ decrease loop count | ||
79 | |||
78 | @ Special store order for making the column pass calculate columns in | 80 | @ Special store order for making the column pass calculate columns in |
79 | @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages. | 81 | @ the order 0-2-1-3-4-6-5-7, allowing for uxtab16 use in later stages. |
80 | sub r2, r10, r6 @ block[7] = (a0 - b0) | 82 | sub r2, r10, r6 @ block[7] = (a0 - b0) |
@@ -102,7 +104,6 @@ | |||
102 | mov r2, r2, asr #12 @ >> 12 | 104 | mov r2, r2, asr #12 @ >> 12 |
103 | strh r2, [r1], #2 @ advance to next temp column | 105 | strh r2, [r1], #2 @ advance to next temp column |
104 | 106 | ||
105 | subs r14, r14, #1 | ||
106 | bne .row_loop | 107 | bne .row_loop |
107 | b .col_start | 108 | b .col_start |
108 | 109 | ||
@@ -129,42 +130,44 @@ L_W0246: | |||
129 | ldrd r4, L_W1357 @ load W1, W3, W5, W7 | 130 | ldrd r4, L_W1357 @ load W1, W3, W5, W7 |
130 | 131 | ||
131 | smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 | 132 | smuad r6, r4, r10 @ b0 = W1 * f1 + W3 * f3 |
132 | smlad r6, r5, r11, r6 @ + W5 * f5 + W7 * f7 | 133 | smultt r7, r5, r10 @ -b1 = W7 * f3 |
134 | smulbt r8, r4, r10 @ -b2 = W1 * f3 | ||
133 | 135 | ||
134 | smultt r7, r5, r10 @ b1 = -W7 * f3 | 136 | smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 |
135 | smlabb r7, r4, r11, r7 @ + -W1 * f5 | 137 | smlabb r7, r4, r11, r7 @ -b1 += W1 * f5 |
136 | smlabt r7, r5, r11, r7 @ + -W5 * f7 | 138 | rsb r8, r8, #0 @ b2 = -b2 |
137 | rsb r7, r7, #0 | 139 | smlabb r8, r5, r10, r8 @ b2 += W5 * f1 |
138 | smlatb r7, r4, r10, r7 @ + W3 * f1 | ||
139 | 140 | ||
140 | smulbt r8, r4, r10 @ b2 = -W1 * f3 | 141 | smlad r6, r5, r11, r6 @ b0 += W5 * f5 + W7 * f7 |
141 | rsb r8, r8, #0 | 142 | smlabt r7, r5, r11, r7 @ -b1 += W5 * f7 |
142 | smlabb r8, r5, r10, r8 @ + W5 * f1 | 143 | smlatb r8, r5, r11, r8 @ b2 += W7 * f5 |
143 | smlatb r8, r5, r11, r8 @ + W7 * f5 | ||
144 | smlatt r8, r4, r11, r8 @ + W3 * f7 | ||
145 | 144 | ||
146 | smusdx r9, r10, r5 @ b3 = f1 * W7 - f3 * W5 | 145 | smlsdx r9, r11, r4, r9 @ b3 += f5 * W3 - f7 * W1 |
147 | smlsdx r9, r11, r4, r9 @ + f5 * W3 - f1 * W1 | 146 | rsb r7, r7, #0 @ b1 = -b1 |
147 | smlatb r7, r4, r10, r7 @ b1 += W3 * f1 | ||
148 | smlatt r8, r4, r11, r8 @ b2 += W3 * f7 | ||
148 | 149 | ||
149 | ldrd r4, L_W0246 @ load W0, W2, W4, W6 | 150 | ldrd r4, L_W0246 @ load W0, W2, W4, W6 |
150 | add r2, r2, #32 @ DC offset: 0.5 | 151 | add r2, r2, #32 @ DC offset: 0.5 |
151 | 152 | ||
152 | smulbb r10, r4, r2 @ a0' = W0 * f0 | 153 | smulbb r10, r5, r3 @ a0' = W4 * f4 |
153 | smlabb r10, r5, r3, r10 @ + W4 * f4 | 154 | smultt r12, r5, r3 @ a3' = W6 * f6 |
154 | smultt r12, r4, r2 @ a3' = W2 * f2 | 155 | smultt r3, r4, r3 @ -a2' = W2 * f6 |
155 | smlatt r12, r5, r3, r12 @ + W6 * f6 | 156 | |
157 | rsb r11, r10, #0 @ a1' = -W4 * f4 | ||
158 | smlabb r10, r4, r2, r10 @ a0' += W0 * f0 | ||
159 | smlabb r11, r4, r2, r11 @ a1' += W0 * f0 | ||
160 | smlatt r12, r4, r2, r12 @ a3' += W2 * f2 | ||
161 | rsb r3, r3, #0 @ a2' = -a2' | ||
162 | smlatt r3, r5, r2, r3 @ a2' += W6 * f2 | ||
163 | |||
156 | add r10, r10, r12 @ a0 = a0' + a3' | 164 | add r10, r10, r12 @ a0 = a0' + a3' |
157 | sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' | 165 | sub r12, r10, r12, lsl #1 @ a3 = a0 - 2 * a3' |
158 | |||
159 | smulbb r11, r5, r3 @ a1' = -W4 * f4 | ||
160 | rsb r11, r11, #0 | ||
161 | smlabb r11, r4, r2, r11 @ + W0 * f0 | ||
162 | smultt r3, r4, r3 @ a2' = -W2 * f6 | ||
163 | rsb r3, r3, #0 | ||
164 | smlatt r3, r5, r2, r3 @ + W6 * f2 | ||
165 | add r11, r11, r3 @ a1 = a1' + a2' | 166 | add r11, r11, r3 @ a1 = a1' + a2' |
166 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' | 167 | sub r3, r11, r3, lsl #1 @ a2 = a1 - 2 * a2' |
167 | 168 | ||
169 | subs r14, r14, #1 @ decrease loop count | ||
170 | |||
168 | sub r2, r10, r6 @ block[7] = (a0 - b0) | 171 | sub r2, r10, r6 @ block[7] = (a0 - b0) |
169 | mov r2, r2, asr #17 @ >> 17 | 172 | mov r2, r2, asr #17 @ >> 17 |
170 | strh r2, [r1, #7*16] | 173 | strh r2, [r1, #7*16] |
@@ -190,7 +193,6 @@ L_W0246: | |||
190 | mov r2, r2, asr #17 @ >> 17 | 193 | mov r2, r2, asr #17 @ >> 17 |
191 | strh r2, [r1], #2 @ advance to next column | 194 | strh r2, [r1], #2 @ advance to next column |
192 | 195 | ||
193 | subs r14, r14, #1 | ||
194 | bne .col_loop | 196 | bne .col_loop |
195 | 197 | ||
196 | sub r0, r0, #256 @ point r0 back to the input block | 198 | sub r0, r0, #256 @ point r0 back to the input block |