From 9985caf3f96df691fad9332986b7af4d0f66676d Mon Sep 17 00:00:00 2001
From: Thom Johansen <thomj@rockbox.org>
Date: Tue, 31 May 2005 07:56:28 +0000
Subject: ASM optimisation by David Bryant. Placed various important arrays in
 IRAM.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6540 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libwavpack/Makefile   |   2 +-
 apps/codecs/libwavpack/SOURCES    |   3 +
 apps/codecs/libwavpack/coldfire.S | 535 ++++++++++++++++++++++++++++++++++++++
 apps/codecs/libwavpack/unpack.c   |  45 ++--
 apps/codecs/libwavpack/wputils.c  |   2 +-
 apps/plugins/wv2wav.c             |   2 +-
 6 files changed, 566 insertions(+), 23 deletions(-)
 create mode 100644 apps/codecs/libwavpack/coldfire.S

diff --git a/apps/codecs/libwavpack/Makefile b/apps/codecs/libwavpack/Makefile
index df26559f59..75b9060534 100644
--- a/apps/codecs/libwavpack/Makefile
+++ b/apps/codecs/libwavpack/Makefile
@@ -15,7 +15,7 @@ INCLUDES += -I$(APPSDIR)/$(APPEXTRA)
 endif
 
 CFLAGS = $(GCCOPTS) \
-$(INCLUDES) $(TARGET) $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE}
+$(INCLUDES) $(TARGET) $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE} -O2
 
 # This sets up 'SRC' based on the files mentioned in SOURCES
 include $(TOOLSDIR)/makesrc.inc
diff --git a/apps/codecs/libwavpack/SOURCES b/apps/codecs/libwavpack/SOURCES
index def57b703c..a4f0f2f7a9 100644
--- a/apps/codecs/libwavpack/SOURCES
+++ b/apps/codecs/libwavpack/SOURCES
@@ -4,4 +4,7 @@ metadata.c
 unpack.c
 words.c
 wputils.c
+#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+coldfire.S
+#endif
 
diff --git a/apps/codecs/libwavpack/coldfire.S b/apps/codecs/libwavpack/coldfire.S
new file mode 100644
index 0000000000..9c7e098e88
--- /dev/null
+++ b/apps/codecs/libwavpack/coldfire.S
@@ -0,0 +1,535 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by David Bryant
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp,
+ *                                       long *buffer, long sample_count);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that the 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This is written to work on a MCF5249 processor, or any processor based on
+ * the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for
+ * the "apply_weight" function of WavPack decorrelation because it provides
+ * the required 40-bit product. The fractional rounding mode of the EMAC is not
+ * configurable and uses "round to even" while WavPack uses "round to larger",
+ * so the rounding has to be done manually.
+ */
+
+        .text
+        .align  2
+        .global decorr_stereo_pass_cont_mcf5249
+
+decorr_stereo_pass_cont_mcf5249:
+
+        lea     (-44, %sp), %sp         | make room for 11 saved registers
+        movem.l %d2-%d7/%a2-%a6, (%sp)  | save d2-d7 and a2-a6
+        move.l  44+4(%sp), %a2          | a2 = dpp->
+        move.l  44+8(%sp), %a1          | a1 = bptr
+        move.w  2(%a2), %a3             | a3 = dpp->delta
+        move.w  4(%a2), %d3             | d3 = dpp->weight_A (sign extended)
+        ext.l   %d3
+        move.w  6(%a2), %d4             | d4 = dpp->weight_B (sign extended)
+        ext.l   %d4
+        move.l 44+12(%sp), %d0          | d0 = sample_count
+        jbeq    return_only             | if zero, nothing to do
+
+        lsl.l   #3, %d0                 | d5 = eptr = bptr + (sample_count * 8)
+        move.l  %d0, %d5
+        add.l   %a1, %d5
+
+        moveq.l #17, %d0                | left shift weights & delta 17 places
+        asl.l   %d0, %d3
+        asl.l   %d0, %d4
+        move.l  %a3, %d1                | delta is shifted in d1, then
+        asl.l   %d0, %d1
+        move.l  %d1, %a3                | moved back into a3
+
+        move.l  #0x20, %macsr           | set fractional mode for MAC
+        move.l  #0, %acc1               | acc1 = 0x00 0000 80 (for rounding)
+        move.l  #0x800000, %accext01
+        
+        move.l  #1024<<17, %d6          | d6 & d7 are weight clipping limits
+        move.l  #-1024<<17, %d7         | (only used by negative terms)
+
+        move.w  (%a2), %d0              | d0 = term
+        ext.l   %d0
+        cmp.l   #17, %d0
+        jbeq    term_17                 | term = 17
+        cmp.l   #18, %d0
+        jbeq    term_18                 | term = 18
+        addq.l  #1, %d0                 | d0 = term + 1
+        jbeq    term_minus_1            | term = -1
+        addq.l  #1, %d0                 | d0 = term + 2
+        jbeq    term_minus_2            | term = -2
+        addq.l  #1, %d0                 | d0 = term + 3
+        jbeq    term_minus_3            | term = -3
+        jbra    term_default            | default term = 1 - 8
+
+|------------------------------------------------------------------------------
+| Loop to handle term = 17 condition
+|
+| a0 =                          d0 = (2 * bptr [-1]) - bptr [-2]
+| a1 = bptr                     d1 = initial bptr [0]
+| a2 = dpp->                    d2 = updated bptr [0]
+| a3 = dpp->delta << 17         d3 = dpp->weight_A << 17
+| a4 =                          d4 = dpp->weight_B << 17
+| a5 =                          d5 = eptr
+| macsr = 0x20                  acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_17:
+        move.l  -8(%a1), %d0            | d0 = 2 * bptr [-1] - bptr [-2]
+        add.l   %d0, %d0
+        sub.l   -16(%a1), %d0
+        beq     .L251                   | if zero, skip calculation
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + (d0 << 4) * weight_A
+        mac.l   %d0, %d3, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L255                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L256                   | if same, add delta to weight
+        sub.l   %a3, %d3                | else subtract delta from weight
+        sub.l   %a3, %d3                | subtract again instead of branch
+.L256:  add.l   %a3, %d3                | add delta to weight
+
+.L255:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | update bptr [0] and store
+        move.l  %d2, (%a1)+
+
+.L253:  move.l  -8(%a1), %d0            | d0 = 2 * bptr [-1] - bptr [-2]
+        add.l   %d0, %d0
+        sub.l   -16(%a1), %d0
+        beq     .L257                   | if zero, skip calculations
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + (d0 << 4) * weight_B
+        mac.l   %d0, %d4, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L254                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L259                   | if same, add delta to weight
+        sub.l   %a3, %d4                | else subtract delta from weight
+        sub.l   %a3, %d4                | subtract again instead of branch
+.L259:  add.l   %a3, %d4                | add delta to weight
+
+.L254:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | update bptr [0] and store
+        move.l  %d2, (%a1)+
+
+.L252:  cmp.l   %a1, %d5                | loop if bptr < eptr
+        jbhi    term_17
+        bra     term_17_18_finish       | exit through common path
+
+.L251:  addq.l  #4, %a1                 | update pointer and jump back into loop
+        bra     .L253
+
+.L257:  addq.l  #4, %a1                 | update pointer and jump back into loop
+        bra     .L252
+
+|------------------------------------------------------------------------------
+| Loop to handle term = 18 condition
+|
+| a0 =                          d0 = ((3 * bptr [-1]) - bptr [-2]) >> 1
+| a1 = bptr                     d1 = initial bptr [0]
+| a2 = dpp->                    d2 = updated bptr [0]
+| a3 = dpp->delta << 17         d3 = dpp->weight_A << 17
+| a4 =                          d4 = dpp->weight_B << 17
+| a5 =                          d5 = eptr
+| macsr = 0x20                  acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_18:
+        move.l  -8(%a1), %a0            | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
+        lea     (%a0,%a0.l*2), %a0      | a0 = 3 * a0
+        move.l  %a0, %d0
+        sub.l   -16(%a1), %d0
+        asr.l   #1, %d0
+        beq     .L260                   | if zero, skip calculation
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + (d0 << 4) * weight_A
+        mac.l   %d0, %d3, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L266                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L267                   | if same, add delta to weight
+        sub.l   %a3, %d3                | else subtract delta from weight
+        sub.l   %a3, %d3                | subtract again instead of branch
+.L267:  add.l   %a3, %d3                | add delta to weight
+
+.L266:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [0], store
+        move.l  %d2, (%a1)+
+
+.L268:  move.l  -8(%a1), %a0            | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
+        lea     (%a0,%a0.l*2), %a0      | a0 = 3 * a0
+        move.l  %a0, %d0
+        sub.l   -16(%a1), %d0
+        asr.l   #1, %d0
+        beq     .L261                   | if zero, skip calculation
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + (d0 << 4) * weight_B
+        mac.l   %d0, %d4, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L265                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L270                   | if same, add delta to weight
+        sub.l   %a3, %d4                | else subtract delta from weight
+        sub.l   %a3, %d4                | subtract again instead of branch
+.L270:  add.l   %a3, %d4                | add delta to weight
+
+.L265:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [0], store
+        move.l  %d2, (%a1)+
+
+.L269:  cmp.l   %a1, %d5                | loop if bptr < eptr
+        jbhi    term_18
+        bra     term_17_18_finish       | exit through common path
+
+.L260:  addq.l  #4, %a1                 | bump pointer and jump back into loop
+        bra     .L268
+
+.L261:  addq.l  #4, %a1                 | bump pointer and jump back into loop
+        bra     .L269
+
+term_17_18_finish:
+        move.l  -4(%a1), 40(%a2)        | restore dpp->samples_A [0-1], B [0-1]
+        move.l  -8(%a1), 8(%a2)         | (offset 40 = samples_B [0], 8 = A [0],
+        move.l  -12(%a1), 44(%a2)       |  44 = samples_B [1], 12 = A [1])
+        move.l  -16(%a1), 12(%a2)
+        jbra    finish_up
+
+|------------------------------------------------------------------------------
+| Loop to handle default terms (i.e. 1 - 8)
+|
+| a0 = tptr                     d0 = tptr [0]
+| a1 = bptr                     d1 = initial bptr [0]
+| a2 = dpp->                    d2 = updated bptr [0]
+| a3 = dpp->delta << 17         d3 = dpp->weight_A << 17
+| a4 =                          d4 = dpp->weight_B << 17
+| a5 =                          d5 = eptr
+| macsr = 0x20                  acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_default:
+        move.w  (%a2), %d0              | a0 = a1 - (dpp->term * 8)
+        ext.l   %d0
+        lsl.l   #3, %d0
+        move.l  %a1, %a0
+        sub.l   %d0, %a0
+
+term_default_loop:
+        move.l  (%a0)+, %d0             | d0 = tptr [0], skip ahead if zero
+        beq     .L271
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + (d0 << 4) * weight_A
+        mac.l   %d0, %d3, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L277                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L278                   | if same, add delta to weight
+        sub.l   %a3, %d3                | else subtract delta from weight
+        sub.l   %a3, %d3                | subtract again instead of branch
+.L278:  add.l   %a3, %d3                | add delta to weight
+
+.L277:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [0], store
+        move.l  %d2, (%a1)+
+
+.L275:  move.l  (%a0)+, %d0             | d0 = tptr [0], skip ahead if zero
+        beq     .L272
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + (d0 << 4) * weight_B
+        mac.l   %d0, %d4, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L276                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L281                   | if same, add delta to weight
+        sub.l   %a3, %d4                | else subtract delta from weight
+        sub.l   %a3, %d4                | subtract again instead of branch
+.L281:  add.l   %a3, %d4                | add delta to weight
+
+.L276:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [0], store
+        move.l  %d2, (%a1)+
+
+.L274:  cmp.l   %a1, %d5                | loop back if bptr < eptr
+        jbhi    term_default_loop
+        move.w  (%a2), %d0              | d0 = term (low word; masked below)
+        moveq.l #8, %d1                 | d1 = loop counter
+
+.L323:  subq.l  #1, %d0                 | back up & mask index
+        and.l   #7, %d0
+        move.l  -(%a1), 40(%a2,%d0.l*4) | store dpp->samples_B [d0]
+        move.l  -(%a1), 8(%a2,%d0.l*4)  | store dpp->samples_A [d0]
+        subq.l  #1, %d1                 | loop on count
+        jbne    .L323
+        jbra    finish_up
+
+.L271:  addq.l  #4, %a1                 | bump pointer and jump back into loop
+        bra     .L275
+
+.L272:  addq.l  #4, %a1                 | bump pointer and jump back into loop
+        bra     .L274
+
+
+|------------------------------------------------------------------------------
+| Loop to handle term = -1 condition
+|
+| a0 =                          d0 = decorrelation sample
+| a1 = bptr                     d1 = initial bptr [0]
+| a2 = dpp->                    d2 = updated bptr [0]
+| a3 = dpp->delta << 17         d3 = dpp->weight_A << 17
+| a4 =                          d4 = dpp->weight_B << 17
+| a5 =                          d5 = eptr
+| a6 =                          d6 = 1024 << 17
+| a7 =                          d7 = -1024 << 17
+| macsr = 0x20                  acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_minus_1:
+        move.l  -4(%a1), %d0            | d0 = bptr [-1]
+        beq     .L402                   | if zero, bptr [0] stays unchanged
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + ((d0 << 4) * weight_A)
+        mac.l   %d0, %d3, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L405                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L404                   | if same, add delta to weight
+        sub.l   %a3, %d3                | else subtract delta from weight
+        cmp.l   %d7, %d3                | check for negative clip limit
+        bge     .L405
+        move.l  %d7, %d3
+        bra     .L405
+
+.L404:  add.l   %a3, %d3                | add delta to weight
+        cmp.l   %d6, %d3                | check for positive clip limit
+        ble     .L405
+        move.l  %d6, %d3
+
+.L405:  move.l  %acc0, %d0              | d0 = rounded product
+        add.l   %d1, %d0                | add applied weight to bptr [0], store
+        move.l  %d0, (%a1)+
+        beq     .L401                   | if stored value is zero, skip bptr [1]
+
+.L410:  move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + ((d0 << 4) * weight_B)
+        mac.l   %d0, %d4, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [1]
+        beq     .L403                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L407                   | if same, add delta to weight
+        sub.l   %a3, %d4                | else subtract delta from weight
+        cmp.l   %d7, %d4                | check for negative clip limit
+        bge     .L403
+        move.l  %d7, %d4
+        bra     .L403
+
+.L407:  add.l   %a3, %d4                | add delta to weight
+        cmp.l   %d6, %d4                | check for positive clip limit
+        ble     .L403
+        move.l  %d6, %d4
+
+.L403:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [1], store
+        move.l  %d2, (%a1)+
+
+.L411:  cmp.l   %a1, %d5                | loop back if bptr < eptr
+        jbhi    term_minus_1
+        move.l  -4(%a1), 8(%a2)         | dpp->samples_A [0] = bptr [-1]
+        jbra    finish_up
+
+.L402:  move.l  (%a1)+, %d0             | d0 = unchanged bptr [0], bump pointer
+        bne     .L410                   | if nonzero, go process bptr [1]
+
+.L401:  addq.l  #4, %a1                 | skip bptr [1] and rejoin loop
+        bra     .L411
+
+
+|------------------------------------------------------------------------------
+| Loop to handle term = -2 condition
+|
+| a0 =                          d0 = decorrelation sample
+| a1 = bptr                     d1 = initial bptr [0]
+| a2 = dpp->                    d2 = updated bptr [0]
+| a3 = dpp->delta << 17         d3 = dpp->weight_A << 17
+| a4 =                          d4 = dpp->weight_B << 17
+| a5 =                          d5 = eptr
+| a6 =                          d6 = 1024 << 17
+| a7 =                          d7 = -1024 << 17
+| macsr = 0x20                  acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_minus_2:
+        move.l  -8(%a1), %d0            | d0 = bptr [-2]
+        beq     .L511                   | if zero, bptr [1] stays unchanged
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + ((d0 << 4) * weight_B)
+        mac.l   %d0, %d4, %acc0
+        move.l  4(%a1), %d1             | d1 = initial bptr [1]
+        beq     .L505                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L504                   | if same, add delta to weight
+        sub.l   %a3, %d4                | else subtract delta from weight
+        cmp.l   %d7, %d4                | check for negative clip limit
+        bge     .L505
+        move.l  %d7, %d4
+        bra     .L505
+
+.L504:  add.l   %a3, %d4                | add delta to weight
+        cmp.l   %d6, %d4                | check for positive clip limit
+        ble     .L505
+        move.l  %d6, %d4
+
+.L505:  move.l  %acc0, %d0              | d0 = rounded product
+        add.l   %d1, %d0                | add applied weight to bptr [1], store
+        move.l  %d0, 4(%a1)
+        beq     .L512                   | if stored value is zero, skip bptr [0]
+
+.L510:  move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + ((d0 << 4) * weight_A)
+        mac.l   %d0, %d3, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L503                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L507                   | if same, add delta to weight
+        sub.l   %a3, %d3                | else subtract delta from weight
+        cmp.l   %d7, %d3                | check for negative clip limit
+        bge     .L503
+        move.l  %d7, %d3
+        bra     .L503
+
+.L507:  add.l   %a3, %d3                | add delta to weight
+        cmp.l   %d6, %d3                | check for positive clip limit
+        ble     .L503
+        move.l  %d6, %d3
+
+.L503:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [0], store
+        move.l  %d2, (%a1)
+
+.L512:  addq.l  #8, %a1                 | advance bptr to next stereo pair
+        cmp.l   %a1, %d5                | loop if bptr < eptr
+        jbhi    term_minus_2
+        move.l  -8(%a1), 40(%a2)        | dpp->samples_B [0] = bptr [-2]
+        jbra    finish_up
+
+.L511:  move.l  4(%a1), %d0             | d0 = unchanged bptr [1]
+        beq     .L512                   | if zero too, nothing to do for pair
+        bra     .L510                   | else go process bptr [0]
+
+
+|------------------------------------------------------------------------------
+| Loop to handle term = -3 condition
+|
+| a0 =                          d0 = decorrelation sample
+| a1 = bptr                     d1 = initial bptr [0]
+| a2 = dpp->                    d2 = updated bptr [0]
+| a3 = dpp->delta << 17         d3 = dpp->weight_A << 17
+| a4 =                          d4 = dpp->weight_B << 17
+| a5 =                          d5 = eptr
+| a6 =                          d6 = 1024 << 17
+| a7 =                          d7 = -1024 << 17
+| macsr = 0x20                  acc1 = 0x00 0000 80
+|------------------------------------------------------------------------------
+
+term_minus_3:
+        move.l  -4(%a1), %d0            | d0 = bptr [-1]
+        beq     .L301                   | if zero, skip calculation
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + ((d0 << 4) * weight_A)
+        mac.l   %d0, %d3, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [0]
+        beq     .L320                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L319                   | if same, add delta to weight
+        sub.l   %a3, %d3                | else subtract delta from weight
+        cmp.l   %d7, %d3                | check for negative clip limit
+        bge     .L320
+        move.l  %d7, %d3
+        bra     .L320
+
+.L319:  add.l   %a3, %d3                | add delta to weight
+        cmp.l   %d6, %d3                | check for positive clip limit
+        ble     .L320
+        move.l  %d6, %d3
+
+.L320:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [0], store
+        move.l  %d2, (%a1)+
+
+.L330:  move.l  -12(%a1), %d0           | d0 = bptr [-2] (a1 already bumped by 4)
+        beq     .L302                   | if zero, skip calculation
+        move.l  %acc1, %acc0            | preload acc0 with rounding constant
+        asl.l   #4, %d0                 | acc0 = acc1 + ((d0 << 4) * weight_B)
+        mac.l   %d0, %d4, %acc0
+        move.l  (%a1), %d1              | d1 = initial bptr [1]
+        beq     .L318                   | if zero, leave weight unchanged
+        eor.l   %d1, %d0                | else compare signs
+        bge     .L322                   | if same, add delta to weight
+        sub.l   %a3, %d4                | else subtract delta from weight
+        cmp.l   %d7, %d4                | check for negative clip limit
+        bge     .L318
+        move.l  %d7, %d4
+        bra     .L318
+
+.L322:  add.l   %a3, %d4                | add delta to weight
+        cmp.l   %d6, %d4                | check for positive clip limit
+        ble     .L318
+        move.l  %d6, %d4
+
+.L318:  move.l  %acc0, %d2              | d2 = rounded product
+        add.l   %d1, %d2                | add applied weight to bptr [1], store
+        move.l  %d2, (%a1)+
+
+.L331:  cmp.l   %a1, %d5                | loop back if bptr < eptr
+        jbhi    term_minus_3
+        move.l  -4(%a1), 8(%a2)         | dpp->samples_A [0] = bptr [-1]
+        move.l  -8(%a1), 40(%a2)        | dpp->samples_B [0] = bptr [-2]
+        jbra    finish_up
+
+.L301:  addq.l  #4, %a1                 | bump pointer and jump back into loop
+        bra     .L330
+
+.L302:  addq.l  #4, %a1                 | bump pointer and jump back into loop
+        bra     .L331
+
+| finish and return
+
+finish_up:
+        moveq.l #17, %d0        | undo the 17-place left shift of the weights
+        asr.l   %d0, %d3
+        asr.l   %d0, %d4
+        move.w  %d3, 4(%a2)     | dpp->weight_A = weight_A
+        move.w  %d4, 6(%a2)     | dpp->weight_B = weight_B
+
+        clr.l   %d0             | clear up EMAC
+        move.l  %d0, %acc0
+        move.l  %d0, %acc1
+
+return_only:
+        movem.l (%sp), %d2-%d7/%a2-%a6  | restore d2-d7 and a2-a6
+        lea     (44,%sp), %sp           | release the 44-byte save area
+        rts
diff --git a/apps/codecs/libwavpack/unpack.c b/apps/codecs/libwavpack/unpack.c
index ae473787a7..5afaac3659 100644
--- a/apps/codecs/libwavpack/unpack.c
+++ b/apps/codecs/libwavpack/unpack.c
@@ -27,7 +27,11 @@ static void strcpy_loc (char *dst, char *src) { while (*src) *dst++ = *src++; *d
 // these macros implement the weight application and update operations
 // that are at the heart of the decorrelation loops
 
+#if 0	// PERFCOND
 #define apply_weight_i(weight, sample) ((weight * sample + 512) >> 10)
+#else
+#define apply_weight_i(weight, sample) ((((weight * sample) >> 8) + 2) >> 2)
+#endif
 
 #define apply_weight_f(weight, sample) (((((sample & 0xffff) * weight) >> 9) + \
     (((sample & ~0xffff) >> 9) * weight) + 1) >> 1)
@@ -39,7 +43,7 @@ static void strcpy_loc (char *dst, char *src) { while (*src) *dst++ = *src++; *d
 #define apply_weight(weight, sample) ((int32_t)((weight * (int64_t) sample + 512) >> 10))
 #endif
 
-#if 1	// PERFCOND
+#if 0	// PERFCOND
 #define update_weight(weight, delta, source, result) \
     if (source && result) weight -= ((((source ^ result) >> 30) & 2) - 1) * delta;
 #else
@@ -315,9 +319,14 @@ int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
 // samples unpacked, which can be less than the number requested if an error
 // occurs or the end of the block is reached.
 
+#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+extern void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, long *buffer, long sample_count);
+#else
+static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
+#endif
+
 static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count);
 static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long sample_count);
-static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count);
 static void fixup_samples (WavpackStream *wps, long *buffer, ulong sample_count);
 
 long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
@@ -372,7 +381,11 @@ long unpack_samples (WavpackContext *wpc, long *buffer, ulong sample_count)
 	else
 	    for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
 		decorr_stereo_pass (dpp, buffer, 8);
+#if CONFIG_CPU==MCF5249 && !defined(SIMULATOR)
+		decorr_stereo_pass_cont_mcf5249 (dpp, buffer + 16, sample_count - 8);
+#else
 		decorr_stereo_pass_cont (dpp, buffer + 16, sample_count - 8);
+#endif
 	    }
 
 	if (flags & JOINT_STEREO)
@@ -530,11 +543,13 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, long *buffer, long samp
     dpp->weight_B = weight_B;
 }
 
+#if CONFIG_CPU != MCF5249 || defined(SIMULATOR)
+
 static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long sample_count)
 {
     long delta = dpp->delta, weight_A = dpp->weight_A, weight_B = dpp->weight_B;
     long *bptr, *tptr, *eptr = buffer + (sample_count * 2), sam_A, sam_B;
-    int k;
+    int k, i;
 
     switch (dpp->term) {
 
@@ -581,23 +596,11 @@ static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long
 		update_weight (weight_B, delta, tptr [1], sam_A);
 	    }
 
-	    k = dpp->term;
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-1];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-2];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-3];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-4];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-5];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-6];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-7];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-8];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-9];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-10];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-11];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-12];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-13];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-14];
-	    dpp->samples_B [--k & (MAX_TERM - 1)] = bptr [-15];
-	    dpp->samples_A [  k & (MAX_TERM - 1)] = bptr [-16];
+	    for (k = dpp->term - 1, i = 8; i--; k--) {
+		dpp->samples_B [k & (MAX_TERM - 1)] = *--bptr;
+		dpp->samples_A [k & (MAX_TERM - 1)] = *--bptr;
+	    }
+
 	    break;
 
 	case -1:
@@ -639,6 +642,8 @@ static void decorr_stereo_pass_cont (struct decorr_pass *dpp, long *buffer, long
     dpp->weight_B = weight_B;
 }
 
+#endif
+
 static void decorr_mono_pass (struct decorr_pass *dpp, long *buffer, long sample_count)
 {
     long delta = dpp->delta, weight_A = dpp->weight_A;
diff --git a/apps/codecs/libwavpack/wputils.c b/apps/codecs/libwavpack/wputils.c
index 9227b66e46..8d58b3b4d7 100644
--- a/apps/codecs/libwavpack/wputils.c
+++ b/apps/codecs/libwavpack/wputils.c
@@ -45,7 +45,7 @@ static ulong read_next_header (read_stream infile, WavpackHeader *wphdr);
 // large integer or floating point files (but always provides at least 24 bits
 // of resolution).
 
-static WavpackContext wpc;
+static WavpackContext wpc IDATA_ATTR;
 
 WavpackContext *WavpackOpenFileInput (read_stream infile, char *error)
 {
diff --git a/apps/plugins/wv2wav.c b/apps/plugins/wv2wav.c
index c0bc05cf12..909a0c3c63 100644
--- a/apps/plugins/wv2wav.c
+++ b/apps/plugins/wv2wav.c
@@ -29,7 +29,7 @@
 
 static struct plugin_api* rb;
 static file_info_struct file_info;
-static long temp_buffer [BUFFER_SIZE];
+static long temp_buffer [BUFFER_SIZE] IDATA_ATTR;
 
 /* Reformat samples from longs in processor's native endian mode to
  little-endian data with 2 bytes / sample. */
-- 
cgit v1.2.3