summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThom Johansen <thomj@rockbox.org>2005-05-31 07:56:28 +0000
committerThom Johansen <thomj@rockbox.org>2005-05-31 07:56:28 +0000
commit9985caf3f96df691fad9332986b7af4d0f66676d (patch)
tree835adf7c966dcc50f0a4a58da1c9726a01835c12
parentff40e4cc6a0a66e0eecaceae784203298c8c408d (diff)
downloadrockbox-9985caf3f96df691fad9332986b7af4d0f66676d.tar.gz
rockbox-9985caf3f96df691fad9332986b7af4d0f66676d.zip
ASM optimisation by David Bryant.
Placed various important arrays in IRAM. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6540 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r--apps/codecs/libwavpack/Makefile2
-rw-r--r--apps/codecs/libwavpack/SOURCES3
-rw-r--r--apps/codecs/libwavpack/coldfire.S535
-rw-r--r--apps/codecs/libwavpack/unpack.c45
-rw-r--r--apps/codecs/libwavpack/wputils.c2
-rw-r--r--apps/plugins/wv2wav.c2
6 files changed, 566 insertions, 23 deletions
diff --git a/apps/codecs/libwavpack/Makefile b/apps/codecs/libwavpack/Makefile
index df26559f59..75b9060534 100644
--- a/apps/codecs/libwavpack/Makefile
+++ b/apps/codecs/libwavpack/Makefile
@@ -15,7 +15,7 @@ INCLUDES += -I$(APPSDIR)/$(APPEXTRA)
15endif 15endif
16 16
17CFLAGS = $(GCCOPTS) \ 17CFLAGS = $(GCCOPTS) \
18$(INCLUDES) $(TARGET) $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE} 18$(INCLUDES) $(TARGET) $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE} -O2 \
19 19
20# This sets up 'SRC' based on the files mentioned in SOURCES 20# This sets up 'SRC' based on the files mentioned in SOURCES
21include $(TOOLSDIR)/makesrc.inc 21include $(TOOLSDIR)/makesrc.inc
diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES
index def57b703c..a4f0f2f7a9 100644
--- a/apps/codecs/libwavpack/SOURCES
+++ b/apps/codecs/libwavpack/SOURCES
@@ -4,4 +4,7 @@ metadata.c
4unpack.c 4unpack.c
5words.c 5words.c
6wputils.c 6wputils.c
7#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
8coldfire.S
9#endif
7 10
diff --git a/apps/codecs/libwavpack/coldfire.S b/apps/codecs/libwavpack/coldfire.S
new file mode 100644
index 0000000000..9c7e098e88
--- /dev/null
+++ b/apps/codecs/libwavpack/coldfire.S
@@ -0,0 +1,535 @@
1/***************************************************************************
2 * __________ __ ___.
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7 * \/ \/ \/ \/ \/
8 * $Id$
9 *
10 * Copyright (C) 2005 by David Bryant
11 *
12 * All files in this archive are subject to the GNU General Public License.
13 * See the file COPYING in the source tree root for full license agreement.
14 *
15 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
16 * KIND, either express or implied.
17 *
18 ****************************************************************************/
19
20/* This is an assembly optimized version of the following WavPack function:
21 *
22 * void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp,
23 * long *buffer, long sample_count);
24 *
25 * It performs a single pass of stereo decorrelation on the provided buffer.
26 * Note that this version of the function requires that the 8 previous stereo
27 * samples are visible and correct. In other words, it ignores the "samples_*"
28 * fields in the decorr_pass structure and gets the history data directly
29 * from the buffer. It does, however, return the appropriate history samples
30 * to the decorr_pass structure before returning.
31 *
32 * This is written to work on a MCF5249 processor, or any processor based on
33 * the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for
34 * the "apply_weight" function of WavPack decorrelation because it provides
35 * the required 40-bit product. The fractional rounding mode of the EMAC is not
36 * configurable and uses "round to even" while WavPack uses "round to larger",
37 * so the rounding has to be done manually.
38 */
39
40 .text
41 .align 2
42 .global decorr_stereo_pass_cont_mcf5249
43
44decorr_stereo_pass_cont_mcf5249:
45
46 lea (-44, %sp), %sp
47 movem.l %d2-%d7/%a2-%a6, (%sp)
48 move.l 44+4(%sp), %a2 | a2 = dpp->
49 move.l 44+8(%sp), %a1 | a1 = bptr
50 move.w 2(%a2), %a3 | a3 = dpp->delta
51 move.w 4(%a2), %d3 | d3 = dpp->weight_A (sign extended)
52 ext.l %d3
53 move.w 6(%a2), %d4 | d4 = dpp->weight_B (sign extended)
54 ext.l %d4
55 move.l 44+12(%sp), %d0 | d0 = sample_count
56 jbeq return_only | if zero, nothing to do
57
58 lsl.l #3, %d0 | d5 = bptr + (sample_count * 8)
59 move.l %d0, %d5
60 add.l %a1, %d5
61
62 moveq.l #17, %d0 | left shift weights & delta 17 places
63 asl.l %d0, %d3
64 asl.l %d0, %d4
65 move.l %a3, %d1
66 asl.l %d0, %d1
67 move.l %d1, %a3
68
69 move.l #0x20, %macsr | set fractional mode for MAC
70 move.l #0, %acc1 | acc1 = 0x00 0000 80 (for rounding)
71 move.l #0x800000, %accext01
72
73 move.l #1024<<17, %d6 | d6 & d7 are weight clipping limits
74 move.l #-1024<<17, %d7 | (only used by negative terms)
75
76 move.w (%a2), %d0 | d0 = term
77 ext.l %d0
78 cmp.l #17, %d0
79 jbeq term_17 | term = 17
80 cmp.l #18, %d0
81 jbeq term_18 | term = 18
82 addq.l #1, %d0
83 jbeq term_minus_1 | term = -1
84 addq.l #1, %d0
85 jbeq term_minus_2 | term = -2
86 addq.l #1, %d0
87 jbeq term_minus_3 | term = -3
88 jbra term_default | default term = 1 - 8
89
90|------------------------------------------------------------------------------
91| Loop to handle term = 17 condition
92|
93| a0 = d0 = (2 * bptr [-1]) - bptr [-2]
94| a1 = bptr d1 = initial bptr [0]
95| a2 = dpp-> d2 = updated bptr [0]
96| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
97| a4 = d4 = dpp->weight_B << 17
98| a5 = d5 = eptr
99| macsr = 0x20 acc1 = 0x00 0000 80
100|------------------------------------------------------------------------------
101
102term_17:
103 move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2]
104 add.l %d0, %d0
105 sub.l -16(%a1), %d0
106 beq .L251 | if zero, skip calculation
107 move.l %acc1, %acc0
108 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
109 mac.l %d0, %d3, %acc0
110 move.l (%a1), %d1
111 beq .L255
112 eor.l %d1, %d0 | else compare signs
113 bge .L256 | if same, add delta to weight
114 sub.l %a3, %d3 | else subtract delta from weight
115 sub.l %a3, %d3 | subtract again instead of branch
116.L256: add.l %a3, %d3 | add delta to weight
117
118.L255: move.l %acc0, %d2 | d2 = rounded product
119 add.l %d1, %d2 | update bptr [0] and store
120 move.l %d2, (%a1)+
121
122.L253: move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2]
123 add.l %d0, %d0
124 sub.l -16(%a1), %d0
125 beq .L257 | if zero, skip calculations
126 move.l %acc1, %acc0
127 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
128 mac.l %d0, %d4, %acc0
129 move.l (%a1), %d1
130 beq .L254
131 eor.l %d1, %d0 | else compare signs
132 bge .L259 | if same, add delta to weight
133 sub.l %a3, %d4 | else subtract delta from weight
134 sub.l %a3, %d4 | subtract again instead of branch
135.L259: add.l %a3, %d4 | add delta to weight
136
137.L254: move.l %acc0, %d2 | d2 = rounded product
138 add.l %d1, %d2 | update bptr [0] and store
139 move.l %d2, (%a1)+
140
141.L252: cmp.l %a1, %d5 | loop if bptr < eptr
142 jbhi term_17
143 bra term_17_18_finish | exit through common path
144
145.L251: addq.l #4, %a1 | update point and jump back into loop
146 bra .L253
147
148.L257: addq.l #4, %a1 | update point and jump back into loop
149 bra .L252
150
151|------------------------------------------------------------------------------
152| Loop to handle term = 18 condition
153|
154| a0 = d0 = ((3 * bptr [-1]) - bptr [-2]) >> 1
155| a1 = bptr d1 = initial bptr [0]
156| a2 = dpp-> d2 = updated bptr [0]
157| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
158| a4 = d4 = dpp->weight_B << 17
159| a5 = d5 = eptr
160| macsr = 0x20 acc1 = 0x00 0000 80
161|------------------------------------------------------------------------------
162
163term_18:
164 move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
165 lea (%a0,%a0.l*2), %a0
166 move.l %a0, %d0
167 sub.l -16(%a1), %d0
168 asr.l #1, %d0
169 beq .L260
170 move.l %acc1, %acc0
171 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
172 mac.l %d0, %d3, %acc0
173 move.l (%a1), %d1
174 beq .L266
175 eor.l %d1, %d0 | else compare signs
176 bge .L267 | if same, add delta to weight
177 sub.l %a3, %d3 | else subtract delta from weight
178 sub.l %a3, %d3 | subtract again instead of branch
179.L267: add.l %a3, %d3 | add delta to weight
180
181.L266: move.l %acc0, %d2 | d2 = rounded product
182 add.l %d1, %d2 | add applied weight to bptr [0], store
183 move.l %d2, (%a1)+
184
185.L268: move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
186 lea (%a0,%a0.l*2), %a0
187 move.l %a0, %d0
188 sub.l -16(%a1), %d0
189 asr.l #1, %d0
190 beq .L261
191 move.l %acc1, %acc0
192 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
193 mac.l %d0, %d4, %acc0
194 move.l (%a1), %d1
195 beq .L265
196 eor.l %d1, %d0 | else compare signs
197 bge .L270 | if same, add delta to weight
198 sub.l %a3, %d4 | else subtract delta from weight
199 sub.l %a3, %d4 | subtract again instead of branch
200.L270: add.l %a3, %d4 | add delta to weight
201
202.L265: move.l %acc0, %d2 | d2 = rounded product
203 add.l %d1, %d2 | add applied weight to bptr [0], store
204 move.l %d2, (%a1)+
205
206.L269: cmp.l %a1, %d5 | loop if bptr < eptr
207 jbhi term_18
208 bra term_17_18_finish | exit through common path
209
210.L260: addq.l #4, %a1 | bump pointer and jump back into loop
211 bra .L268
212
213.L261: addq.l #4, %a1 | bump pointer and jump back into loop
214 bra .L269
215
216term_17_18_finish:
217 move.l -4(%a1), 40(%a2) | restore dpp->samples_A [0-1], B [0-1]
218 move.l -8(%a1), 8(%a2)
219 move.l -12(%a1), 44(%a2)
220 move.l -16(%a1), 12(%a2)
221 jbra finish_up
222
223|------------------------------------------------------------------------------
224| Loop to handle default terms (i.e. 1 - 8)
225|
226| a0 = tptr d0 = tptr [0]
227| a1 = bptr d1 = initial bptr [0]
228| a2 = dpp-> d2 = updated bptr [0]
229| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
230| a4 = d4 = dpp->weight_B << 17
231| a5 = d5 = eptr
232| macsr = 0x20 acc1 = 0x00 0000 80
233|------------------------------------------------------------------------------
234
235term_default:
236 move.w (%a2), %d0 | a0 = a1 - (dpp->term * 8)
237 ext.l %d0
238 lsl.l #3, %d0
239 move.l %a1, %a0
240 sub.l %d0, %a0
241
242term_default_loop:
243 move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero
244 beq .L271
245 move.l %acc1, %acc0
246 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
247 mac.l %d0, %d3, %acc0
248 move.l (%a1), %d1
249 beq .L277
250 eor.l %d1, %d0 | else compare signs
251 bge .L278 | if same, add delta to weight
252 sub.l %a3, %d3 | else subtract delta from weight
253 sub.l %a3, %d3 | subtract again instead of branch
254.L278: add.l %a3, %d3 | add delta to weight
255
256.L277: move.l %acc0, %d2 | d2 = rounded product
257 add.l %d1, %d2 | add applied weight to bptr [0], store
258 move.l %d2, (%a1)+
259
260.L275: move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero
261 beq .L272
262 move.l %acc1, %acc0
263 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
264 mac.l %d0, %d4, %acc0
265 move.l (%a1), %d1
266 beq .L276
267 eor.l %d1, %d0 | else compare signs
268 bge .L281 | if same, add delta to weight
269 sub.l %a3, %d4 | else subtract delta from weight
270 sub.l %a3, %d4 | subtract again instead of branch
271.L281: add.l %a3, %d4 | add delta to weight
272
273.L276: move.l %acc0, %d2 | d2 = rounded product
274 add.l %d1, %d2 | add applied weight to bptr [0], store
275 move.l %d2, (%a1)+
276
277.L274: cmp.l %a1, %d5 | loop back if bptr < eptr
278 jbhi term_default_loop
279 move.w (%a2), %d0 | d0 = term (decremented to term - 1 below)
280 moveq.l #8, %d1 | d1 = loop counter
281
282.L323: subq.l #1, %d0 | back up & mask index
283 and.l #7, %d0
284 move.l -(%a1), 40(%a2,%d0.l*4) | store dpp->samples_B [d0]
285 move.l -(%a1), 8(%a2,%d0.l*4) | store dpp->samples_A [d0]
286 subq.l #1, %d1 | loop on count
287 jbne .L323
288 jbra finish_up
289
290.L271: addq.l #4, %a1 | bump pointer and jump back into loop
291 bra .L275
292
293.L272: addq.l #4, %a1 | bump pointer and jump back into loop
294 bra .L274
295
296
297|------------------------------------------------------------------------------
298| Loop to handle term = -1 condition
299|
300| a0 = d0 = decorrelation sample
301| a1 = bptr d1 = initial bptr [0]
302| a2 = dpp-> d2 = updated bptr [0]
303| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
304| a4 = d4 = dpp->weight_B << 17
305| a5 = d5 = eptr
306| a6 = d6 = 1024 << 17
307| a7 = d7 = -1024 << 17
308| macsr = 0x20 acc1 = 0x00 0000 80
309|------------------------------------------------------------------------------
310
311term_minus_1:
312 move.l -4(%a1), %d0 | d0 = bptr [-1]
313 beq .L402
314 move.l %acc1, %acc0
315 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
316 mac.l %d0, %d3, %acc0
317 move.l (%a1), %d1
318 beq .L405
319 eor.l %d1, %d0 | else compare signs
320 bge .L404 | if same, add delta to weight
321 sub.l %a3, %d3 | else subtract delta from weight
322 cmp.l %d7, %d3 | check for negative clip limit
323 bge .L405
324 move.l %d7, %d3
325 bra .L405
326
327.L404: add.l %a3, %d3 | add delta to weight
328 cmp.l %d6, %d3 | check for positive clip limit
329 ble .L405
330 move.l %d6, %d3
331
332.L405: move.l %acc0, %d0 | d0 = rounded product
333 add.l %d1, %d0 | add applied weight to bptr [0], store
334 move.l %d0, (%a1)+
335 beq .L401
336
337.L410: move.l %acc1, %acc0
338 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
339 mac.l %d0, %d4, %acc0
340 move.l (%a1), %d1
341 beq .L403
342 eor.l %d1, %d0 | else compare signs
343 bge .L407 | if same, add delta to weight
344 sub.l %a3, %d4 | else subtract delta from weight
345 cmp.l %d7, %d4 | check for negative clip limit
346 bge .L403
347 move.l %d7, %d4
348 bra .L403
349
350.L407: add.l %a3, %d4 | add delta to weight
351 cmp.l %d6, %d4 | check for positive clip limit
352 ble .L403
353 move.l %d6, %d4
354
355.L403: move.l %acc0, %d2 | d2 = rounded product
356 add.l %d1, %d2 | add applied weight to bptr [1], store
357 move.l %d2, (%a1)+
358
359.L411: cmp.l %a1, %d5 | loop back if bptr < eptr
360 jbhi term_minus_1
361 move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1]
362 jbra finish_up
363
364.L402: move.l (%a1)+, %d0
365 bne .L410
366
367.L401: addq.l #4, %a1
368 bra .L411
369
370
371|------------------------------------------------------------------------------
372| Loop to handle term = -2 condition
373|
374| a0 = d0 = decorrelation sample
375| a1 = bptr d1 = initial bptr [0]
376| a2 = dpp-> d2 = updated bptr [0]
377| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
378| a4 = d4 = dpp->weight_B << 17
379| a5 = d5 = eptr
380| a6 = d6 = 1024 << 17
381| a7 = d7 = -1024 << 17
382| macsr = 0x20 acc1 = 0x00 0000 80
383|------------------------------------------------------------------------------
384
385term_minus_2:
386 move.l -8(%a1), %d0 | d0 = bptr [-2]
387 beq .L511
388 move.l %acc1, %acc0
389 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
390 mac.l %d0, %d4, %acc0
391 move.l 4(%a1), %d1
392 beq .L505
393 eor.l %d1, %d0 | else compare signs
394 bge .L504 | if same, add delta to weight
395 sub.l %a3, %d4 | else subtract delta from weight
396 cmp.l %d7, %d4 | check for negative clip limit
397 bge .L505
398 move.l %d7, %d4
399 bra .L505
400
401.L504: add.l %a3, %d4 | add delta to weight
402 cmp.l %d6, %d4 | check for positive clip limit
403 ble .L505
404 move.l %d6, %d4
405
406.L505: move.l %acc0, %d0 | d0 = rounded product
407 add.l %d1, %d0 | add applied weight to bptr [1], store
408 move.l %d0, 4(%a1)
409 beq .L512
410
411.L510: move.l %acc1, %acc0
412 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
413 mac.l %d0, %d3, %acc0
414 move.l (%a1), %d1
415 beq .L503
416 eor.l %d1, %d0 | else compare signs
417 bge .L507 | if same, add delta to weight
418 sub.l %a3, %d3 | else subtract delta from weight
419 cmp.l %d7, %d3 | check for negative clip limit
420 bge .L503
421 move.l %d7, %d3
422 bra .L503
423
424.L507: add.l %a3, %d3 | add delta to weight
425 cmp.l %d6, %d3 | check for positive clip limit
426 ble .L503
427 move.l %d6, %d3
428
429.L503: move.l %acc0, %d2 | d2 = rounded product
430 add.l %d1, %d2 | add applied weight to bptr [0], store
431 move.l %d2, (%a1)
432
433.L512: addq.l #8, %a1
434 cmp.l %a1, %d5 | loop if bptr < eptr
435 jbhi term_minus_2
436 move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2]
437 jbra finish_up
438
439.L511: move.l 4(%a1), %d0
440 beq .L512
441 bra .L510
442
443
444|------------------------------------------------------------------------------
445| Loop to handle term = -3 condition
446|
447| a0 = d0 = decorrelation sample
448| a1 = bptr d1 = initial bptr [0]
449| a2 = dpp-> d2 = updated bptr [0]
450| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
451| a4 = d4 = dpp->weight_B << 17
452| a5 = d5 = eptr
453| a6 = d6 = 1024 << 17
454| a7 = d7 = -1024 << 17
455| macsr = 0x20 acc1 = 0x00 0000 80
456|------------------------------------------------------------------------------
457
458term_minus_3:
459 move.l -4(%a1), %d0 | d0 = bptr [-1]
460 beq .L301
461 move.l %acc1, %acc0
462 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
463 mac.l %d0, %d3, %acc0
464 move.l (%a1), %d1
465 beq .L320
466 eor.l %d1, %d0 | else compare signs
467 bge .L319 | if same, add delta to weight
468 sub.l %a3, %d3 | else subtract delta from weight
469 cmp.l %d7, %d3 | check for negative clip limit
470 bge .L320
471 move.l %d7, %d3
472 bra .L320
473
474.L319: add.l %a3, %d3 | add delta to weight
475 cmp.l %d6, %d3 | check for positive clip limit
476 ble .L320
477 move.l %d6, %d3
478
479.L320: move.l %acc0, %d2 | d2 = rounded product
480 add.l %d1, %d2 | add applied weight to bptr [0], store
481 move.l %d2, (%a1)+
482
483.L330: move.l -12(%a1), %d0 | d0 = bptr [-2]
484 beq .L302
485 move.l %acc1, %acc0
486 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
487 mac.l %d0, %d4, %acc0
488 move.l (%a1), %d1
489 beq .L318
490 eor.l %d1, %d0 | else compare signs
491 bge .L322 | if same, add delta to weight
492 sub.l %a3, %d4 | else subtract delta from weight
493 cmp.l %d7, %d4 | check for negative clip limit
494 bge .L318
495 move.l %d7, %d4
496 bra .L318
497
498.L322: add.l %a3, %d4 | add delta to weight
499 cmp.l %d6, %d4 | check for positive clip limit
500 ble .L318
501 move.l %d6, %d4
502
503.L318: move.l %acc0, %d2 | d2 = rounded product
504 add.l %d1, %d2 | add applied weight to bptr [1], store
505 move.l %d2, (%a1)+
506
507.L331: cmp.l %a1, %d5 | loop back if bptr < eptr
508 jbhi term_minus_3
509 move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1]
510 move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2]
511 jbra finish_up
512
513.L301: addq.l #4, %a1
514 bra .L330
515
516.L302: addq.l #4, %a1
517 bra .L331
518
519| finish and return
520
521finish_up:
522 moveq.l #17, %d0
523 asr.l %d0, %d3
524 asr.l %d0, %d4
525 move.w %d3, 4(%a2) | weight_A, dpp->weight_A
526 move.w %d4, 6(%a2) | weight_B, dpp->weight_B
527
528 clr.l %d0 | clear up EMAC
529 move.l %d0, %acc0
530 move.l %d0, %acc1
531
532return_only:
533 movem.l (%sp), %d2-%d7/%a2-%a6
534 lea (44,%sp), %sp
535 rts
diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c
index ae473787a7..5afaac3659 100644
--- a/apps/codecs/libwavpack/unpack.c
+++ b/apps/codecs/libwavpack/unpack.c
@@ -27,7 +27,11 @@ static void strcpy_loc (char *dst, char *src) { while (*src) *dst++ = *src++; *d
27// these macros implement the weight application and update operations 27// these macros implement the weight application and update operations
28// that are at the heart of the decorrelation loops 28// that are at the heart of the decorrelation loops
29 29
30#if 0 // PERFCOND
30#define apply_weight_i(weight, sample) ((weight * sample + 512) >> 10) 31#define apply_weight_i(weight, sample) ((weight * sample + 512) >> 10)
32#else
33#define apply_weight_i(weight, sample) ((((weight * sample) >> 8) + 2) >> 2)
34#endif
31 35
32#define apply_weight_f(weight, sample) (((((sample & 0xffff) * weight) >> 9) + \ 36#define apply_weight_f(weight, sample) (((((sample & 0xffff) * weight) >> 9) + \
33 (((sample & ~0xffff) >> 9) * weight) + 1) >> 1) 37 (((sample & ~0xffff) >> 9) * weight) + 1) >> 1)
@@ -39,7 +43,7 @@ static void strcpy_loc (char *dst, char *src) { while (*src) *dst++ = *src++; *d
39#define apply_weight(weight, sample) ((int32_t)((weight * (int64_t) sample + 512) >> 10)) 43#define apply_weight(weight, sample) ((int32_t)((weight * (int64_t) sample + 512) >> 10))
40#endif 44#endif
41 45
42#if 1 // PERFCOND 46#if 0 // PERFCOND
43#define update_weight(weight, delta, source, result) \ 47#define update_weight(weight, delta, source, result) \
44 if (source && result) weight -= ((((source ^ result) >> 30) & 2) - 1) * delta; 48 if (source && result) weight -= ((((source ^ result) >> 30) & 2) - 1) * delta;
45#else 49#else
@@ -315,9 +319,14 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
315// samples unpacked, which can be less than the number requested if an error 319// samples unpacked, which can be less than the number requested if an error
316// occurs or the end of the block is reached. 320// occurs or the end of the block is reached.
317 321
322#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
323extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count);
324#else
325static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
326#endif
327
318static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count); 328static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count);
319static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long sample_count); 329static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long sample_count);
320static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
321static void fixup_samples (WavpackStream *wps, long *buffer, ulong sample_count); 330static void fixup_samples (WavpackStream *wps, long *buffer, ulong sample_count);
322 331
323long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count) 332long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
@@ -372,7 +381,11 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
372 else 381 else
373 for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) { 382 for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
374 decorr_stereo_pass (dpp, buffer, 8); 383 decorr_stereo_pass (dpp, buffer, 8);
384#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
385 decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
386#else
375 decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8); 387 decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
388#endif
376 } 389 }
377 390
378 if (flags & JOINT_STEREO) 391 if (flags & JOINT_STEREO)
@@ -530,11 +543,13 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long samp
530 dpp->weight_B = weight_B; 543 dpp->weight_B = weight_B;
531} 544}
532 545
546#if CONFIG_CPU != MCF5249 || defined(SIMULATOR)
547
533static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count) 548static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count)
534{ 549{
535 long delta = dpp->delta, weight_A = dpp->weight_A, weight_B = dpp->weight_B; 550 long delta = dpp->delta, weight_A = dpp->weight_A, weight_B = dpp->weight_B;
536 long *bptr, *tptr, *eptr = buffer + (sample_count * 2), sam_A, sam_B; 551 long *bptr, *tptr, *eptr = buffer + (sample_count * 2), sam_A, sam_B;
537 int k; 552 int k, i;
538 553
539 switch (dpp->term) { 554 switch (dpp->term) {
540 555
@@ -581,23 +596,11 @@ static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long
581 update_weight (weight_B, delta, tptr [1], sam_A); 596 update_weight (weight_B, delta, tptr [1], sam_A);
582 } 597 }
583 598
584 k = dpp->term; 599 for (k = dpp->term - 1, i = 8; i--; k--) {
585 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-1]; 600 dpp->samples_B [k & (MAX_TERM - 1)] = *--bptr;
586 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-2]; 601 dpp->samples_A [k & (MAX_TERM - 1)] = *--bptr;
587 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-3]; 602 }
588 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-4]; 603
589 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-5];
590 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-6];
591 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-7];
592 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-8];
593 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-9];
594 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-10];
595 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-11];
596 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-12];
597 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-13];
598 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-14];
599 dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-15];
600 dpp->samples_A [ k & (MAX_TERM - 1)] = bptr [-16];
601 break; 604 break;
602 605
603 case -1: 606 case -1:
@@ -639,6 +642,8 @@ static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long
639 dpp->weight_B = weight_B; 642 dpp->weight_B = weight_B;
640} 643}
641 644
645#endif
646
642static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count) 647static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count)
643{ 648{
644 long delta = dpp->delta, weight_A = dpp->weight_A; 649 long delta = dpp->delta, weight_A = dpp->weight_A;
diff --git a/apps/codecs/libwavpack/wputils.c b/apps/codecs/libwavpack/wputils.c
index 9227b66e46..8d58b3b4d7 100644
--- a/apps/codecs/libwavpack/wputils.c
+++ b/apps/codecs/libwavpack/wputils.c
@@ -45,7 +45,7 @@ static ulong read_next_header (read_stream infile, WavpackHeader *wphdr);
45// large integer or floating point files (but always provides at least 24 bits 45// large integer or floating point files (but always provides at least 24 bits
46// of resolution). 46// of resolution).
47 47
48static WavpackContext wpc; 48static WavpackContext wpc IDATA_ATTR;
49 49
50WavpackContext *WavpackOpenFileInput (read_stream infile, char *error) 50WavpackContext *WavpackOpenFileInput (read_stream infile, char *error)
51{ 51{
diff --git a/apps/plugins/wv2wav.c b/apps/plugins/wv2wav.c
index c0bc05cf12..909a0c3c63 100644
--- a/apps/plugins/wv2wav.c
+++ b/apps/plugins/wv2wav.c
@@ -29,7 +29,7 @@
29 29
30static struct plugin_api* rb; 30static struct plugin_api* rb;
31static file_info_struct file_info; 31static file_info_struct file_info;
32static long temp_buffer [BUFFER_SIZE]; 32static long temp_buffer [BUFFER_SIZE] IDATA_ATTR;
33 33
34/* Reformat samples from longs in processor's native endian mode to 34/* Reformat samples from longs in processor's native endian mode to
35 little-endian data with 2 bytes / sample. */ 35 little-endian data with 2 bytes / sample. */