summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Arnold <amiconn@rockbox.org>2008-12-02 02:26:04 +0000
committerJens Arnold <amiconn@rockbox.org>2008-12-02 02:26:04 +0000
commitc1cd0469ca9f084b39d747ccca5d64442c3833ca (patch)
tree976efee8d7131013414583e5bd2ad2fe323c8063
parent6c65b357bca384a3d65a6795edc2928b889254ee (diff)
downloadrockbox-c1cd0469ca9f084b39d747ccca5d64442c3833ca.tar.gz
rockbox-c1cd0469ca9f084b39d747ccca5d64442c3833ca.zip
Implement mono predictor in assembler for coldfire, yielding a ~6% speedup for mono -c1000. Apply ideas gained from it back to the stereo predictor, saving 4 instructions. No speed increase for stereo, probably due to cache aliasing effects. * 80-column police.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19296 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/demac/libdemac/predictor-cf.S435
-rw-r--r--apps/codecs/demac/libdemac/predictor.c2
2 files changed, 291 insertions, 146 deletions
diff --git a/apps/codecs/demac/libdemac/predictor-cf.S b/apps/codecs/demac/libdemac/predictor-cf.S
index cd2e07fd5e..c76d7f629a 100644
--- a/apps/codecs/demac/libdemac/predictor-cf.S
+++ b/apps/codecs/demac/libdemac/predictor-cf.S
@@ -25,13 +25,6 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
25*/ 25*/
26#include "demac_config.h" 26#include "demac_config.h"
27 27
28 .text
29
30 .align 2
31
32 .global predictor_decode_stereo
33 .type predictor_decode_stereo,@function
34
35/* NOTE: The following need to be kept in sync with parser.h */ 28/* NOTE: The following need to be kept in sync with parser.h */
36 29
37#define YDELAYA 200 30#define YDELAYA 200
@@ -63,6 +56,13 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
63#define historybuffer 100 /* int32_t historybuffer[] */ 56#define historybuffer 100 /* int32_t historybuffer[] */
64 57
65 58
59 .text
60
61 .align 2
62
63 .global predictor_decode_stereo
64 .type predictor_decode_stereo,@function
65
66| void predictor_decode_stereo(struct predictor_t* p, 66| void predictor_decode_stereo(struct predictor_t* p,
67| int32_t* decoded0, 67| int32_t* decoded0,
68| int32_t* decoded1, 68| int32_t* decoded1,
@@ -92,6 +92,8 @@ predictor_decode_stereo:
92 | %d1 = p->buf[YDELAYA-2] 92 | %d1 = p->buf[YDELAYA-2]
93 | %d2 = p->buf[YDELAYA-1] 93 | %d2 = p->buf[YDELAYA-1]
94 94
95 move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3
96
95 sub.l %d3, %d2 97 sub.l %d3, %d2
96 neg.l %d2 | %d2 = %d3 - %d2 98 neg.l %d2 | %d2 = %d3 - %d2
97 99
@@ -102,12 +104,10 @@ predictor_decode_stereo:
102 | %d6 = p->YcoeffsA[2] 104 | %d6 = p->YcoeffsA[2]
103 | %d7 = p->YcoeffsA[3] 105 | %d7 = p->YcoeffsA[3]
104 106
105 mac.l %d3, %d4, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0] 107 mac.l %d3, %d4, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0]
106 mac.l %d2, %d5, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1] 108 mac.l %d2, %d5, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
107 mac.l %d1, %d6, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2] 109 mac.l %d1, %d6, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
108 mac.l %d0, %d7, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3] 110 mac.l %d0, %d7, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
109
110 move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3
111 111
112 tst.l %d2 112 tst.l %d2
113 beq.s 1f 113 beq.s 1f
@@ -125,10 +125,6 @@ predictor_decode_stereo:
1251: | %d3 = SIGN(%d3) 1251: | %d3 = SIGN(%d3)
126 move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3 126 move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3
127 127
128 movclr.l %acc0, %d0
129
130 | NOTE: %d0 now contains predictionA - don't overwrite.
131
132 | Predictor Y, Filter B 128 | Predictor Y, Filter B
133 129
134 movem.l (YfilterB,%a6), %d2-%d3 | %d2 = p->YfilterB 130 movem.l (YfilterB,%a6), %d2-%d3 | %d2 = p->YfilterB
@@ -156,11 +152,11 @@ predictor_decode_stereo:
156 | %a1 = p->YcoeffsB[3] 152 | %a1 = p->YcoeffsB[3]
157 | %a2 = p->YcoeffsB[4] 153 | %a2 = p->YcoeffsB[4]
158 154
159 mac.l %d3, %d1, %acc0 | %acc0 = p->buf[YDELAYB] * p->YcoeffsB[0] 155 mac.l %d3, %d1, %acc1 | %acc1 = p->buf[YDELAYB] * p->YcoeffsB[0]
160 mac.l %d7, %d2, %acc0 | %acc0 += p->buf[YDELAYB-1] * p->YcoeffsB[1] 156 mac.l %d7, %d2, %acc1 | %acc1 += p->buf[YDELAYB-1] * p->YcoeffsB[1]
161 mac.l %d6, %a0, %acc0 | %acc0 += p->buf[YDELAYB-2] * p->YcoeffsB[2] 157 mac.l %d6, %a0, %acc1 | %acc1 += p->buf[YDELAYB-2] * p->YcoeffsB[2]
162 mac.l %d5, %a1, %acc0 | %acc0 += p->buf[YDELAYB-3] * p->YcoeffsB[3] 158 mac.l %d5, %a1, %acc1 | %acc1 += p->buf[YDELAYB-3] * p->YcoeffsB[3]
163 mac.l %d4, %a2, %acc0 | %acc0 += p->buf[YDELAYB-4] * p->YcoeffsB[4] 159 mac.l %d4, %a2, %acc1 | %acc1 += p->buf[YDELAYB-4] * p->YcoeffsB[4]
164 160
165 move.l %d3, (YDELAYB, %a5) | p->buf[YDELAYB] = %d3 161 move.l %d3, (YDELAYB, %a5) | p->buf[YDELAYB] = %d3
166 162
@@ -179,38 +175,10 @@ predictor_decode_stereo:
1791: | %d3 = SIGN(%d3) 1751: | %d3 = SIGN(%d3)
180 move.l %d3, (YADAPTCOEFFSB, %a5) | p->buf[YADAPTCOEFFSB] = %d3 176 move.l %d3, (YADAPTCOEFFSB, %a5) | p->buf[YADAPTCOEFFSB] = %d3
181 177
182 movclr.l %acc0, %d4
183
184 | %d0 still contains predictionA
185 | %d4 contains predictionB
186
187 | Finish Predictor Y
188
189 asr.l #1, %d4
190 add.l %d4, %d0 | %d0 += (%d1 >> 1)
191 move.l (%a3), %d5 | %d5 = *decoded0
192 move.l %d5, %d4 | %d4 = %d5
193 asr.l #8, %d0
194 asr.l #2, %d0 | %d0 >>= 10
195 add.l %d0, %d4 | %d4 += %d0
196 move.l %d4, (YlastA,%a6) | p->YlastA = %d4
197
198 move.l (YfilterA,%a6), %d6 | %d6 = p->YfilterA
199 move.l %d6, %d0
200 lsl.l #5, %d6
201 sub.l %d0, %d6 | %d6 = 31 * %d6
202 asr.l #5, %d6 | %d6 >>= 5
203 add.l %d6, %d4
204 move.l %d4, (YfilterA,%a6) | p->YfilterA = %d4
205
206 | %d4 contains p->YfilterA
207 | %d5 contains *decoded0
208
209 | %d1, %d2, %a0, %a1, %a2 contain p->YcoeffsB[0..4] 178 | %d1, %d2, %a0, %a1, %a2 contain p->YcoeffsB[0..4]
210 | %d7, %d3 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB] 179 | %d7, %d3 contain p->buf[YADAPTCOEFFSB-1] and p->buf[YADAPTCOEFFSB]
211 180
212 move.l %d4, (%a3)+ | *(decoded0++) = %d1 (p->YfilterA) 181 move.l (%a3), %d0 | %d0 = *decoded0
213 tst.l %d5
214 beq.s 3f 182 beq.s 3f
215 183
216 movem.l (YADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[YADAPTCOEFFSB-4] 184 movem.l (YADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[YADAPTCOEFFSB-4]
@@ -221,11 +189,11 @@ predictor_decode_stereo:
221 189
222 | *decoded0 > 0 190 | *decoded0 > 0
223 191
224 sub.l %d3, %d1 | %d1 = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB] 192 sub.l %d3, %d1 | %d1 = p->YcoeffsB[0] - p->buf[YADAPTCOEFFSB]
225 sub.l %d7, %d2 | %d2 = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1] 193 sub.l %d7, %d2 | %d2 = p->YcoeffsB[1] - p->buf[YADAPTCOEFFSB-1]
226 sub.l %d6, %a0 | %a0 = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2] 194 sub.l %d6, %a0 | %a0 = p->YcoeffsB[2] - p->buf[YADAPTCOEFFSB-2]
227 sub.l %d5, %a1 | %a1 = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3] 195 sub.l %d5, %a1 | %a1 = p->YcoeffsB[3] - p->buf[YADAPTCOEFFSB-3]
228 sub.l %d4, %a2 | %a2 = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4] 196 sub.l %d4, %a2 | %a2 = p->YcoeffsB[4] - p->buf[YADAPTCOEFFSB-4]
229 197
230 movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[] 198 movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[]
231 199
@@ -234,47 +202,69 @@ predictor_decode_stereo:
234 | %d6 = p->YcoeffsA[2] 202 | %d6 = p->YcoeffsA[2]
235 | %d7 = p->YcoeffsA[3] 203 | %d7 = p->YcoeffsA[3]
236 204
237 movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[YADAPTCOEFFSA-3] 205 movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
206 | %d2 = p->buf[YADAPTCOEFFSA-3]
238 | %a0 = p->buf[YADAPTCOEFFSA-2] 207 | %a0 = p->buf[YADAPTCOEFFSA-2]
239 | %a1 = p->buf[YADAPTCOEFFSA-1] 208 | %a1 = p->buf[YADAPTCOEFFSA-1]
240 | %a2 = p->buf[YADAPTCOEFFSA] 209 | %a2 = p->buf[YADAPTCOEFFSA]
241 210
242 sub.l %a2, %d4 | %d4 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA] 211 sub.l %a2, %d4 | %d4 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
243 sub.l %a1, %d5 | %d5 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1] 212 sub.l %a1, %d5 | %d5 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
244 sub.l %a0, %d6 | %d6 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2] 213 sub.l %a0, %d6 | %d6 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
245 sub.l %d2, %d7 | %d7 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3] 214 sub.l %d2, %d7 | %d7 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
246 215
247 bra.s 2f 216 bra.s 2f
248 217
2491: | *decoded0 < 0 2181: | *decoded0 < 0
250 219
251 add.l %d3, %d1 | %d1 = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB] 220 add.l %d3, %d1 | %d1 = p->YcoeffsB[0] + p->buf[YADAPTCOEFFSB]
252 add.l %d7, %d2 | %d2 = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1] 221 add.l %d7, %d2 | %d2 = p->YcoeffsB[1] + p->buf[YADAPTCOEFFSB-1]
253 add.l %d6, %a0 | %a0 = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2] 222 add.l %d6, %a0 | %a0 = p->YcoeffsB[2] + p->buf[YADAPTCOEFFSB-2]
254 add.l %d5, %a1 | %a1 = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3] 223 add.l %d5, %a1 | %a1 = p->YcoeffsB[3] + p->buf[YADAPTCOEFFSB-3]
255 add.l %d4, %a2 | %a2 = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4] 224 add.l %d4, %a2 | %a2 = p->YcoeffsB[4] + p->buf[YADAPTCOEFFSB-4]
256 225
257 movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[] 226 movem.l %d1-%d2/%a0-%a2, (YcoeffsB,%a6) | Save p->YcoeffsB[]
258 227
259 movem.l (YcoeffsA,%a6), %d4-%d7 | %d4 = p->YcoeffsA[0] 228 movem.l (YcoeffsA,%a6), %d4-%d7 | %d4 = p->YcoeffsA[0]
260 | %d5 = p->YcoeffsA[1] 229 | %d5 = p->YcoeffsA[1]
261 | %d6 = p->YcoeffsA[2] 230 | %d6 = p->YcoeffsA[2]
262 | %d7 = p->YcoeffsA[3] 231 | %d7 = p->YcoeffsA[3]
263 232
264 movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[YADAPTCOEFFSA-3] 233 movem.l (YADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
234 | %d2 = p->buf[YADAPTCOEFFSA-3]
265 | %a0 = p->buf[YADAPTCOEFFSA-2] 235 | %a0 = p->buf[YADAPTCOEFFSA-2]
266 | %a1 = p->buf[YADAPTCOEFFSA-1] 236 | %a1 = p->buf[YADAPTCOEFFSA-1]
267 | %a2 = p->buf[YADAPTCOEFFSA] 237 | %a2 = p->buf[YADAPTCOEFFSA]
268 238
269 add.l %a2, %d4 | %d4 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA] 239 add.l %a2, %d4 | %d4 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
270 add.l %a1, %d5 | %d5 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1] 240 add.l %a1, %d5 | %d5 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
271 add.l %a0, %d6 | %d6 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2] 241 add.l %a0, %d6 | %d6 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
272 add.l %d2, %d7 | %d7 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3] 242 add.l %d2, %d7 | %d7 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
273 243
2742: 2442:
275 movem.l %d4-%d7, (YcoeffsA,%a6) | Save p->YcoeffsA[] 245 movem.l %d4-%d7, (YcoeffsA,%a6) | Save p->YcoeffsA[]
276 246
2773: 2473:
248 | Finish Predictor Y
249
250 movclr.l %acc0, %d1 | %d1 = predictionA
251 movclr.l %acc1, %d2 | %d2 = predictionB
252 asr.l #1, %d2
253 add.l %d2, %d1 | %d1 += (%d2 >> 1)
254 asr.l #8, %d1
255 asr.l #2, %d1 | %d1 >>= 10
256 add.l %d0, %d1 | %d1 += %d0
257 move.l %d1, (YlastA,%a6) | p->YlastA = %d1
258
259 move.l (YfilterA,%a6), %d2 | %d2 = p->YfilterA
260 move.l %d2, %d0
261 lsl.l #5, %d2
262 sub.l %d0, %d2 | %d2 = 31 * %d2
263 asr.l #5, %d2 | %d2 >>= 5
264 add.l %d1, %d2
265 move.l %d2, (YfilterA,%a6) | p->YfilterA = %d2
266
267 | *decoded0 stored 2 instructions down, avoiding pipeline stall
278 268
279 | ***** PREDICTOR X ***** 269 | ***** PREDICTOR X *****
280 270
@@ -282,11 +272,15 @@ predictor_decode_stereo:
282 272
283 move.l (XlastA,%a6), %d3 | %d3 = p->XlastA 273 move.l (XlastA,%a6), %d3 | %d3 = p->XlastA
284 274
275 move.l %d2, (%a3)+ | *(decoded0++) = %d2 (p->YfilterA)
276
285 movem.l (XDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[XDELAYA-3] 277 movem.l (XDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[XDELAYA-3]
286 | %d1 = p->buf[XDELAYA-2] 278 | %d1 = p->buf[XDELAYA-2]
287 | %d2 = p->buf[XDELAYA-1] 279 | %d2 = p->buf[XDELAYA-1]
288 280
289 sub.l %d3, %d2 281 move.l %d3, (XDELAYA,%a5) | p->buf[XDELAYA] = %d3
282
283 sub.l %d3, %d2
290 neg.l %d2 | %d2 = %d3 -%d2 284 neg.l %d2 | %d2 = %d3 -%d2
291 285
292 move.l %d2, (XDELAYA-4,%a5) | p->buf[XDELAYA-1] = %d2 286 move.l %d2, (XDELAYA-4,%a5) | p->buf[XDELAYA-1] = %d2
@@ -296,13 +290,11 @@ predictor_decode_stereo:
296 | %d6 = p->XcoeffsA[2] 290 | %d6 = p->XcoeffsA[2]
297 | %d7 = p->XcoeffsA[3] 291 | %d7 = p->XcoeffsA[3]
298 292
299 mac.l %d3, %d4, %acc0 | %acc0 = p->buf[XDELAYA] * p->XcoeffsA[0] 293 mac.l %d3, %d4, %acc0 | %acc0 = p->buf[XDELAYA] * p->XcoeffsA[0]
300 mac.l %d2, %d5, %acc0 | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1] 294 mac.l %d2, %d5, %acc0 | %acc0 += p->buf[XDELAYA-1] * p->XcoeffsA[1]
301 mac.l %d1, %d6, %acc0 | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2] 295 mac.l %d1, %d6, %acc0 | %acc0 += p->buf[XDELAYA-2] * p->XcoeffsA[2]
302 mac.l %d0, %d7, %acc0 | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3] 296 mac.l %d0, %d7, %acc0 | %acc0 += p->buf[XDELAYA-3] * p->XcoeffsA[3]
303 297
304 move.l %d3, (XDELAYA,%a5) | p->buf[XDELAYA] = %d3
305
306 tst.l %d2 298 tst.l %d2
307 beq.s 1f 299 beq.s 1f
308 spl.b %d2 | pos: 0x??????ff, neg: 0x??????00 300 spl.b %d2 | pos: 0x??????ff, neg: 0x??????00
@@ -319,10 +311,6 @@ predictor_decode_stereo:
3191: | %d3 = SIGN(%d3) 3111: | %d3 = SIGN(%d3)
320 move.l %d3, (XADAPTCOEFFSA,%a5) | p->buf[XADAPTCOEFFSA] = %d3 312 move.l %d3, (XADAPTCOEFFSA,%a5) | p->buf[XADAPTCOEFFSA] = %d3
321 313
322 movclr.l %acc0, %d0
323
324 | NOTE: %d0 now contains predictionA - don't overwrite.
325
326 | Predictor X, Filter B 314 | Predictor X, Filter B
327 315
328 movem.l (XfilterB,%a6), %d2-%d3 | %d2 = p->XfilterB 316 movem.l (XfilterB,%a6), %d2-%d3 | %d2 = p->XfilterB
@@ -350,11 +338,11 @@ predictor_decode_stereo:
350 | %a1 = p->XcoeffsB[3] 338 | %a1 = p->XcoeffsB[3]
351 | %a2 = p->XcoeffsB[4] 339 | %a2 = p->XcoeffsB[4]
352 340
353 mac.l %d3, %d1, %acc0 | %acc0 = p->buf[XDELAYB] * p->XcoeffsB[0] 341 mac.l %d3, %d1, %acc1 | %acc1 = p->buf[XDELAYB] * p->XcoeffsB[0]
354 mac.l %d7, %d2, %acc0 | %acc0 += p->buf[XDELAYB-1] * p->XcoeffsB[1] 342 mac.l %d7, %d2, %acc1 | %acc1 += p->buf[XDELAYB-1] * p->XcoeffsB[1]
355 mac.l %d6, %a0, %acc0 | %acc0 += p->buf[XDELAYB-2] * p->XcoeffsB[2] 343 mac.l %d6, %a0, %acc1 | %acc1 += p->buf[XDELAYB-2] * p->XcoeffsB[2]
356 mac.l %d5, %a1, %acc0 | %acc0 += p->buf[XDELAYB-3] * p->XcoeffsB[3] 344 mac.l %d5, %a1, %acc1 | %acc1 += p->buf[XDELAYB-3] * p->XcoeffsB[3]
357 mac.l %d4, %a2, %acc0 | %acc0 += p->buf[XDELAYB-4] * p->XcoeffsB[4] 345 mac.l %d4, %a2, %acc1 | %acc1 += p->buf[XDELAYB-4] * p->XcoeffsB[4]
358 346
359 move.l %d3, (XDELAYB, %a5) | p->buf[XDELAYB] = %d3 347 move.l %d3, (XDELAYB, %a5) | p->buf[XDELAYB] = %d3
360 348
@@ -374,38 +362,10 @@ predictor_decode_stereo:
3741: | %d3 = SIGN(%d3) 3621: | %d3 = SIGN(%d3)
375 move.l %d3, (XADAPTCOEFFSB, %a5) | p->buf[XADAPTCOEFFSB] = %d3 363 move.l %d3, (XADAPTCOEFFSB, %a5) | p->buf[XADAPTCOEFFSB] = %d3
376 364
377 movclr.l %acc0, %d4
378
379 | %d0 still contains predictionA
380 | %d4 contains predictionB
381
382 | Finish Predictor X
383
384 asr.l #1, %d4
385 add.l %d4, %d0 | %d0 += (%d1 >> 1)
386 move.l (%a4), %d5 | %d5 = *decoded1
387 move.l %d5, %d4 | %d4 = %d5
388 asr.l #8, %d0
389 asr.l #2, %d0 | %d0 >>= 10
390 add.l %d0, %d4 | %d4 += %d0
391 move.l %d4, (XlastA,%a6) | p->XlastA = %d1
392
393 move.l (XfilterA,%a6), %d6 | %d6 = p->XfilterA
394 move.l %d6, %d0
395 lsl.l #5, %d6
396 sub.l %d0, %d6 | %d6 = 31 * %d6
397 asr.l #5, %d6 | %d6 >>= 5
398 add.l %d6, %d4
399 move.l %d4, (XfilterA,%a6) | p->XfilterA = %d6
400
401 | %d4 contains p->XfilterA
402 | %d5 contains *decoded1
403
404 | %d1, %d2, %a0, %a1, %a2 contain p->XcoeffsB[0..4] 365 | %d1, %d2, %a0, %a1, %a2 contain p->XcoeffsB[0..4]
405 | %d7, %d3 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB] 366 | %d7, %d3 contain p->buf[XADAPTCOEFFSB-1] and p->buf[XADAPTCOEFFSB]
406 367
407 move.l %d4, (%a4)+ | *(decoded1++) = %d1 (p->XfilterA) 368 move.l (%a4), %d0 | %d0 = *decoded1
408 tst.l %d5
409 beq.s 3f 369 beq.s 3f
410 370
411 movem.l (XADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[XADAPTCOEFFSB-4] 371 movem.l (XADAPTCOEFFSB-16,%a5), %d4-%d6 | %d4 = p->buf[XADAPTCOEFFSB-4]
@@ -416,38 +376,39 @@ predictor_decode_stereo:
416 376
417 | *decoded1 > 0 377 | *decoded1 > 0
418 378
419 sub.l %d3, %d1 | %d1 = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB] 379 sub.l %d3, %d1 | %d1 = p->XcoeffsB[0] - p->buf[XADAPTCOEFFSB]
420 sub.l %d7, %d2 | %d2 = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1] 380 sub.l %d7, %d2 | %d2 = p->XcoeffsB[1] - p->buf[XADAPTCOEFFSB-1]
421 sub.l %d6, %a0 | %a0 = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2] 381 sub.l %d6, %a0 | %a0 = p->XcoeffsB[2] - p->buf[XADAPTCOEFFSB-2]
422 sub.l %d5, %a1 | %a1 = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3] 382 sub.l %d5, %a1 | %a1 = p->XcoeffsB[3] - p->buf[XADAPTCOEFFSB-3]
423 sub.l %d4, %a2 | %a2 = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4] 383 sub.l %d4, %a2 | %a2 = p->XcoeffsB[4] - p->buf[XADAPTCOEFFSB-4]
424 384
425 movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[] 385 movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[]
426 386
427 movem.l (XcoeffsA,%a6), %d4-%d7 | %d4 = p->XcoeffsA[0] 387 movem.l (XcoeffsA,%a6), %d4-%d7 | %d4 = p->XcoeffsA[0]
428 | %d5 = p->XcoeffsA[1] 388 | %d5 = p->XcoeffsA[1]
429 | %d6 = p->XcoeffsA[2] 389 | %d6 = p->XcoeffsA[2]
430 | %d7 = p->XcoeffsA[3] 390 | %d7 = p->XcoeffsA[3]
431 391
432 movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[XADAPTCOEFFSA-3] 392 movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
393 | %d2 = p->buf[XADAPTCOEFFSA-3]
433 | %a0 = p->buf[XADAPTCOEFFSA-2] 394 | %a0 = p->buf[XADAPTCOEFFSA-2]
434 | %a1 = p->buf[XADAPTCOEFFSA-1] 395 | %a1 = p->buf[XADAPTCOEFFSA-1]
435 | %a2 = p->buf[XADAPTCOEFFSA] 396 | %a2 = p->buf[XADAPTCOEFFSA]
436 397
437 sub.l %a2, %d4 | %d4 = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA] 398 sub.l %a2, %d4 | %d4 = p->XcoeffsA[0] - p->buf[XADAPTCOEFFSA]
438 sub.l %a1, %d5 | %d5 = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1] 399 sub.l %a1, %d5 | %d5 = p->XcoeffsA[1] - p->buf[XADAPTCOEFFSA-1]
439 sub.l %a0, %d6 | %d6 = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2] 400 sub.l %a0, %d6 | %d6 = p->XcoeffsA[2] - p->buf[XADAPTCOEFFSA-2]
440 sub.l %d2, %d7 | %d7 = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3] 401 sub.l %d2, %d7 | %d7 = p->XcoeffsA[3] - p->buf[XADAPTCOEFFSA-3]
441 402
442 bra.s 2f 403 bra.s 2f
443 404
4441: | *decoded1 < 0 4051: | *decoded1 < 0
445 406
446 add.l %d3, %d1 | %d1 = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB] 407 add.l %d3, %d1 | %d1 = p->XcoeffsB[0] + p->buf[XADAPTCOEFFSB]
447 add.l %d7, %d2 | %d2 = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1] 408 add.l %d7, %d2 | %d2 = p->XcoeffsB[1] + p->buf[XADAPTCOEFFSB-1]
448 add.l %d6, %a0 | %a0 = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2] 409 add.l %d6, %a0 | %a0 = p->XcoeffsB[2] + p->buf[XADAPTCOEFFSB-2]
449 add.l %d5, %a1 | %a1 = p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3] 410 add.l %d5, %a1 | %a1 = p->XcoeffsB[3] + p->buf[XADAPTCOEFFSB-3]
450 add.l %d4, %a2 | %a2 = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4] 411 add.l %d4, %a2 | %a2 = p->XcoeffsB[4] + p->buf[XADAPTCOEFFSB-4]
451 412
452 movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[] 413 movem.l %d1-%d2/%a0-%a2, (XcoeffsB,%a6) | Save p->XcoeffsB[]
453 414
@@ -456,31 +417,53 @@ predictor_decode_stereo:
456 | %d6 = p->XcoeffsA[2] 417 | %d6 = p->XcoeffsA[2]
457 | %d7 = p->XcoeffsA[3] 418 | %d7 = p->XcoeffsA[3]
458 419
459 movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2 | %d2 = p->buf[XADAPTCOEFFSA-3] 420 movem.l (XADAPTCOEFFSA-12,%a5), %d2/%a0-%a2
421 | %d2 = p->buf[XADAPTCOEFFSA-3]
460 | %a0 = p->buf[XADAPTCOEFFSA-2] 422 | %a0 = p->buf[XADAPTCOEFFSA-2]
461 | %a1 = p->buf[XADAPTCOEFFSA-1] 423 | %a1 = p->buf[XADAPTCOEFFSA-1]
462 | %a2 = p->buf[XADAPTCOEFFSA] 424 | %a2 = p->buf[XADAPTCOEFFSA]
463 425
464 add.l %a2, %d4 | %d4 = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA] 426 add.l %a2, %d4 | %d4 = p->XcoeffsA[0] + p->buf[XADAPTCOEFFSA]
465 add.l %a1, %d5 | %d5 = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1] 427 add.l %a1, %d5 | %d5 = p->XcoeffsA[1] + p->buf[XADAPTCOEFFSA-1]
466 add.l %a0, %d6 | %d6 = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2] 428 add.l %a0, %d6 | %d6 = p->XcoeffsA[2] + p->buf[XADAPTCOEFFSA-2]
467 add.l %d2, %d7 | %d7 = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3] 429 add.l %d2, %d7 | %d7 = p->XcoeffsA[3] + p->buf[XADAPTCOEFFSA-3]
468 430
4692: 4312:
470 movem.l %d4-%d7, (XcoeffsA,%a6) | Save p->XcoeffsA[] 432 movem.l %d4-%d7, (XcoeffsA,%a6) | Save p->XcoeffsA[]
471 433
4723: 4343:
435 | Finish Predictor X
436
437 movclr.l %acc0, %d1 | %d1 = predictionA
438 movclr.l %acc1, %d2 | %d2 = predictionB
439 asr.l #1, %d2
440 add.l %d2, %d1 | %d1 += (%d2 >> 1)
441 asr.l #8, %d1
442 asr.l #2, %d1 | %d1 >>= 10
443 add.l %d0, %d1 | %d1 += %d0
444 move.l %d1, (XlastA,%a6) | p->XlastA = %d1
445
446 move.l (XfilterA,%a6), %d2 | %d2 = p->XfilterA
447 move.l %d2, %d0
448 lsl.l #5, %d2
449 sub.l %d0, %d2 | %d2 = 31 * %d2
450 asr.l #5, %d2 | %d2 >>= 5
451 add.l %d1, %d2
452 move.l %d2, (XfilterA,%a6) | p->XfilterA = %d2
453
454 | *decoded1 stored 3 instructions down, avoiding pipeline stall
473 455
474 | ***** COMMON ***** 456 | ***** COMMON *****
475 457
476 addq.l #4, %a5 | p->buf++ 458 addq.l #4, %a5 | p->buf++
477
478 lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a2 459 lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a2
479 | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE] 460 | %a2 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
480 461
462 move.l %d2, (%a4)+ | *(decoded1++) = %d2 (p->XfilterA)
463
481 cmp.l %a2, %a5 464 cmp.l %a2, %a5
482 beq.s .move_hist | The history buffer is full, we need to do a memmove 465 beq.s .move_hist | History buffer is full, we need to do a memmove
483 466
484 subq.l #1, (%sp) | decrease loop count 467 subq.l #1, (%sp) | decrease loop count
485 bne.w .loop 468 bne.w .loop
486 469
@@ -514,3 +497,163 @@ predictor_decode_stereo:
514 bne.w .loop 497 bne.w .loop
515 498
516 bra.s .done 499 bra.s .done
500 .size predictor_decode_stereo, .-predictor_decode_stereo
501
502
503 .global predictor_decode_mono
504 .type predictor_decode_mono,@function
505
506| void predictor_decode_mono(struct predictor_t* p,
507| int32_t* decoded0,
508| int count)
509
510predictor_decode_mono:
511 lea.l (-11*4,%sp), %sp
512 movem.l %d2-%d7/%a2-%a6, (%sp)
513
514 move.l #0, %macsr | signed integer mode
515
516 move.l (11*4+4,%sp), %a6 | %a6 = p
517 move.l (11*4+8,%sp), %a4 | %a4 = decoded0
518 move.l (11*4+12,%sp), %d7 | %d7 = count
519 move.l (%a6), %a5 | %a5 = p->buf
520
521 move.l (YlastA,%a6), %d3 | %d3 = p->YlastA
522
523.loopm:
524
525 | ***** PREDICTOR *****
526
527 movem.l (YDELAYA-12,%a5), %d0-%d2 | %d0 = p->buf[YDELAYA-3]
528 | %d1 = p->buf[YDELAYA-2]
529 | %d2 = p->buf[YDELAYA-1]
530
531 move.l %d3, (YDELAYA,%a5) | p->buf[YDELAYA] = %d3
532
533 sub.l %d3, %d2
534 neg.l %d2 | %d2 = %d3 - %d2
535
536 move.l %d2, (YDELAYA-4,%a5) | p->buf[YDELAYA-1] = %d2
537
538 movem.l (YcoeffsA,%a6), %a0-%a3 | %a0 = p->YcoeffsA[0]
539 | %a1 = p->YcoeffsA[1]
540 | %a2 = p->YcoeffsA[2]
541 | %a3 = p->YcoeffsA[3]
542
543 mac.l %d3, %a0, %acc0 | %acc0 = p->buf[YDELAYA] * p->YcoeffsA[0]
544 mac.l %d2, %a1, %acc0 | %acc0 += p->buf[YDELAYA-1] * p->YcoeffsA[1]
545 mac.l %d1, %a2, %acc0 | %acc0 += p->buf[YDELAYA-2] * p->YcoeffsA[2]
546 mac.l %d0, %a3, %acc0 | %acc0 += p->buf[YDELAYA-3] * p->YcoeffsA[3]
547
548 tst.l %d2
549 beq.s 1f
550 spl.b %d2 | pos: 0x??????ff, neg: 0x??????00
551 extb.l %d2 | pos: 0xffffffff, neg: 0x00000000
552 or.l #1, %d2 | pos: 0xffffffff, neg: 0x00000001
5531: | %d2 = SIGN(%d2)
554 move.l %d2, (YADAPTCOEFFSA-4,%a5) | p->buf[YADAPTCOEFFSA-1] = %d2
555
556 tst.l %d3
557 beq.s 1f
558 spl.b %d3
559 extb.l %d3
560 or.l #1, %d3
5611: | %d3 = SIGN(%d3)
562 move.l %d3, (YADAPTCOEFFSA,%a5) | p->buf[YADAPTCOEFFSA] = %d3
563
564 move.l (%a4), %d0 | %d0 = *decoded0
565 beq.s 3f
566
567 movem.l (YADAPTCOEFFSA-12,%a5),%d4-%d5 | %d4 = p->buf[YADAPTCOEFFSA-3]
568 | %d5 = p->buf[YADAPTCOEFFSA-2]
569
570 bmi.s 1f | flags still valid here
571
572 | *decoded0 > 0
573
574 sub.l %d3, %a0 | %a0 = p->YcoeffsA[0] - p->buf[YADAPTCOEFFSA]
575 sub.l %d2, %a1 | %a1 = p->YcoeffsA[1] - p->buf[YADAPTCOEFFSA-1]
576 sub.l %d5, %a2 | %a2 = p->YcoeffsA[2] - p->buf[YADAPTCOEFFSA-2]
577 sub.l %d4, %a3 | %a3 = p->YcoeffsA[3] - p->buf[YADAPTCOEFFSA-3]
578
579 bra.s 2f
580
5811: | *decoded0 < 0
582
583 add.l %d3, %a0 | %a0 = p->YcoeffsA[0] + p->buf[YADAPTCOEFFSA]
584 add.l %d2, %a1 | %a1 = p->YcoeffsA[1] + p->buf[YADAPTCOEFFSA-1]
585 add.l %d5, %a2 | %a2 = p->YcoeffsA[2] + p->buf[YADAPTCOEFFSA-2]
586 add.l %d4, %a3 | %a3 = p->YcoeffsA[3] + p->buf[YADAPTCOEFFSA-3]
587
5882:
589 movem.l %a0-%a3, (YcoeffsA,%a6) | save p->YcoeffsA[]
590
5913:
592 | Finish Predictor
593
594 movclr.l %acc0, %d3 | %d3 = predictionA
595 asr.l #8, %d3
596 asr.l #2, %d3 | %d3 >>= 10
597 add.l %d0, %d3 | %d3 += %d0
598
599 move.l (YfilterA,%a6), %d2 | %d2 = p->YfilterA
600 move.l %d2, %d0
601 lsl.l #5, %d2
602 sub.l %d0, %d2 | %d2 = 31 * %d2
603 asr.l #5, %d2 | %d2 >>= 5
604 add.l %d3, %d2
605 move.l %d2, (YfilterA,%a6) | p->YfilterA = %d2
606
607 | *decoded0 stored 3 instructions down, avoiding pipeline stall
608
609 | ***** COMMON *****
610
611 addq.l #4, %a5 | p->buf++
612 lea.l (historybuffer+PREDICTOR_HISTORY_SIZE*4,%a6), %a3
613 | %a3 = &p->historybuffer[PREDICTOR_HISTORY_SIZE]
614
615 move.l %d2, (%a4)+ | *(decoded0++) = %d2 (p->YfilterA)
616
617 cmp.l %a3, %a5
618 beq.s .move_histm | History buffer is full, we need to do a memmove
619
620 subq.l #1, %d7 | decrease loop count
621 bne.w .loopm
622
623 move.l %d3, (YlastA,%a6) | p->YlastA = %d3
624
625.donem:
626 move.l %a5, (%a6) | Save value of p->buf
627 movem.l (%sp), %d2-%d7/%a2-%a6
628 lea.l (11*4,%sp), %sp
629 rts
630
631.move_histm:
632 move.l %d3, (YlastA,%a6) | p->YlastA = %d3
633
634 lea.l (historybuffer,%a6), %a3
635
636 | dest = %a3 (p->historybuffer)
637 | src = %a5 (p->buf)
638 | n = 200
639
640 movem.l (%a5), %d0-%d6/%a0-%a2 | 40 bytes
641 movem.l %d0-%d6/%a0-%a2, (%a3)
642 movem.l (40,%a5), %d0-%d6/%a0-%a2 | 40 bytes
643 movem.l %d0-%d6/%a0-%a2, (40,%a3)
644 movem.l (80,%a5), %d0-%d6/%a0-%a2 | 40 bytes
645 movem.l %d0-%d6/%a0-%a2, (80,%a3)
646 movem.l (120,%a5), %d0-%d6/%a0-%a2 | 40 bytes
647 movem.l %d0-%d6/%a0-%a2, (120,%a3)
648 movem.l (160,%a5), %d0-%d6/%a0-%a2 | 40 bytes
649 movem.l %d0-%d6/%a0-%a2, (160,%a3)
650
651 move.l %a3, %a5 | p->buf = &p->historybuffer[0]
652
653 move.l (YlastA,%a6), %d3 | %d3 = p->YlastA
654
655 subq.l #1, %d7 | decrease loop count
656 bne.w .loopm
657
658 bra.s .donem
659 .size predictor_decode_mono, .-predictor_decode_mono
diff --git a/apps/codecs/demac/libdemac/predictor.c b/apps/codecs/demac/libdemac/predictor.c
index d4f886fb8c..0d03d1d2fb 100644
--- a/apps/codecs/demac/libdemac/predictor.c
+++ b/apps/codecs/demac/libdemac/predictor.c
@@ -211,6 +211,7 @@ void ICODE_ATTR_DEMAC predictor_decode_stereo(struct predictor_t* p,
211} 211}
212#endif 212#endif
213 213
214#if !defined(CPU_COLDFIRE)
214void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p, 215void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
215 int32_t* decoded0, 216 int32_t* decoded0,
216 int count) 217 int count)
@@ -269,3 +270,4 @@ void ICODE_ATTR_DEMAC predictor_decode_mono(struct predictor_t* p,
269 270
270 p->YlastA = currentA; 271 p->YlastA = currentA;
271} 272}
273#endif