From f40bfc9267b13b54e6379dfe7539447662879d24 Mon Sep 17 00:00:00 2001 From: Sean Bartell Date: Sat, 25 Jun 2011 21:32:25 -0400 Subject: Add codecs to librbcodec. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change-Id: Id7f4717d51ed02d67cb9f9cb3c0ada4a81843f97 Reviewed-on: http://gerrit.rockbox.org/137 Reviewed-by: Nils Wallménius Tested-by: Nils Wallménius --- lib/rbcodec/codecs/libwavpack/coldfire.S | 537 +++++++++++++++++++++++++++++++ 1 file changed, 537 insertions(+) create mode 100644 lib/rbcodec/codecs/libwavpack/coldfire.S (limited to 'lib/rbcodec/codecs/libwavpack/coldfire.S') diff --git a/lib/rbcodec/codecs/libwavpack/coldfire.S b/lib/rbcodec/codecs/libwavpack/coldfire.S new file mode 100644 index 0000000000..884a0ac90f --- /dev/null +++ b/lib/rbcodec/codecs/libwavpack/coldfire.S @@ -0,0 +1,537 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id$ + * + * Copyright (C) 2005 by David Bryant + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ + +/* This is an assembly optimized version of the following WavPack function: + * + * void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp, + * long *buffer, long sample_count); + * + * It performs a single pass of stereo decorrelation on the provided buffer. + * Note that this version of the function requires that the 8 previous stereo + * samples are visible and correct. In other words, it ignores the "samples_*" + * fields in the decorr_pass structure and gets the history data directly + * from the buffer. It does, however, return the appropriate history samples + * to the decorr_pass structure before returning. + * + * This is written to work on a MCF5249 processor, or any processor based on + * the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for + * the "apply_weight" function of WavPack decorrelation because it provides + * the requires 40-bit product. The fractional rounding mode of the EMAC is not + * configurable and uses "round to even" while WavPack uses "round to larger", + * so the rounding has to be done manually. + */ + + .text + .align 2 + .global decorr_stereo_pass_cont_mcf5249 + +decorr_stereo_pass_cont_mcf5249: + + lea (-44, %sp), %sp + movem.l %d2-%d7/%a2-%a6, (%sp) + move.l 44+4(%sp), %a2 | a2 = dpp-> + move.l 44+8(%sp), %a1 | a1 = bptr + move.w 2(%a2), %a3 | a3 = dpp->delta + move.w 4(%a2), %d3 | d3 = dpp->weight_A (sign extended) + ext.l %d3 + move.w 6(%a2), %d4 | d4 = dpp->weight_B (sign extended) + ext.l %d4 + move.l 44+12(%sp), %d0 | d0 = sample_count + jbeq return_only | if zero, nothing to do + + lsl.l #3, %d0 | d5 = bptr + (sample_count * 8) + move.l %d0, %d5 + add.l %a1, %d5 + + moveq.l #17, %d0 | left shift weights & delta 17 places + asl.l %d0, %d3 + asl.l %d0, %d4 + move.l %a3, %d1 + asl.l %d0, %d1 + move.l %d1, %a3 + + moveq.l #0x20, %d6 + move.l %d6, %macsr | set fractional mode for MAC + move.l #0x800000, %accext01 | acc1 = 0x00 0000 80 (for rounding) + + move.l #1024<<17, %d6 | d6 & d7 are weight clipping limits + move.l #-1024<<17, %d7 | (only used by negative terms) + + move.w (%a2), %d0 | d0 = term + ext.l %d0 + cmp.l #17, %d0 + jbeq term_17 | term = 17 + cmp.l #18, %d0 + jbeq term_18 | term = 18 + addq.l #1, %d0 + jbeq term_minus_1 | term = -1 + addq.l #1, %d0 + jbeq term_minus_2 | term = -2 + addq.l #1, %d0 + jbeq term_minus_3 | term = -3 + jbra term_default | default term = 1 - 8 + +|------------------------------------------------------------------------------ +| Loop to handle term = 17 condition +| +| a0 = d0 = (2 * bptr [-1]) - bptr [-2] +| a1 = bptr d1 = initial bptr [0] +| a2 = dpp-> d2 = updated bptr [0] +| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17 +| a4 = d4 = dpp->weight_B << 17 +| a5 = d5 = eptr +| macsr = 0x20 acc1 = 0x00 0000 80 +|------------------------------------------------------------------------------ + +term_17: + move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2] + add.l %d0, %d0 + sub.l -16(%a1), %d0 + beq .L251 | if zero, skip calculation + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A + mac.l %d0, %d3, %acc0 + move.l (%a1), %d1 + beq .L255 + eor.l %d1, %d0 | else compare signs + bge .L256 | if same, add delta to weight + sub.l %a3, %d3 | else subtract delta from weight + sub.l %a3, %d3 | subtract again instead of branch +.L256: add.l %a3, %d3 | add delta to weight + +.L255: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | update bptr [0] and store + move.l %d2, (%a1)+ + +.L253: move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2] + add.l %d0, %d0 + sub.l -16(%a1), %d0 + beq .L257 | if zero, skip calculations + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B + mac.l %d0, %d4, %acc0 + move.l (%a1), %d1 + beq .L254 + eor.l %d1, %d0 | else compare signs + bge .L259 | if same, add delta to weight + sub.l %a3, %d4 | else subtract delta from weight + sub.l %a3, %d4 | subtract again instead of branch +.L259: add.l %a3, %d4 | add delta to weight + +.L254: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | update bptr [0] and store + move.l %d2, (%a1)+ + +.L252: cmp.l %a1, %d5 | loop if bptr < eptr + jbhi term_17 + bra term_17_18_finish | exit through common path + +.L251: addq.l #4, %a1 | update point and jump back into loop + bra .L253 + +.L257: addq.l #4, %a1 | update point and jump back into loop + bra .L252 + +|------------------------------------------------------------------------------ +| Loop to handle term = 18 condition +| +| a0 = d0 = ((3 * bptr [-1]) - bptr [-2]) >> 1 +| a1 = bptr d1 = initial bptr [0] +| a2 = dpp-> d2 = updated bptr [0] +| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17 +| a4 = d4 = dpp->weight_B << 17 +| a5 = d5 = eptr +| macsr = 0x20 acc1 = 0x00 0000 80 +|------------------------------------------------------------------------------ + +term_18: + move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1 + lea (%a0,%a0.l*2), %a0 + move.l %a0, %d0 + sub.l -16(%a1), %d0 + asr.l #1, %d0 + beq .L260 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A + mac.l %d0, %d3, %acc0 + move.l (%a1), %d1 + beq .L266 + eor.l %d1, %d0 | else compare signs + bge .L267 | if same, add delta to weight + sub.l %a3, %d3 | else subtract delta from weight + sub.l %a3, %d3 | subtract again instead of branch +.L267: add.l %a3, %d3 | add delta to weight + +.L266: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [0], store + move.l %d2, (%a1)+ + +.L268: move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1 + lea (%a0,%a0.l*2), %a0 + move.l %a0, %d0 + sub.l -16(%a1), %d0 + asr.l #1, %d0 + beq .L261 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B + mac.l %d0, %d4, %acc0 + move.l (%a1), %d1 + beq .L265 + eor.l %d1, %d0 | else compare signs + bge .L270 | if same, add delta to weight + sub.l %a3, %d4 | else subtract delta from weight + sub.l %a3, %d4 | subtract again instead of branch +.L270: add.l %a3, %d4 | add delta to weight + +.L265: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [0], store + move.l %d2, (%a1)+ + +.L269: cmp.l %a1, %d5 | loop if bptr < eptr + jbhi term_18 + bra term_17_18_finish | exit through common path + +.L260: addq.l #4, %a1 | bump pointer and jump back into loop + bra .L268 + +.L261: addq.l #4, %a1 | bump pointer and jump back into loop + bra .L269 + +term_17_18_finish: + move.l -4(%a1), 40(%a2) | restore dpp->samples_A [0-1], B [0-1] + move.l -8(%a1), 8(%a2) + move.l -12(%a1), 44(%a2) + move.l -16(%a1), 12(%a2) + jbra finish_up + +|------------------------------------------------------------------------------ +| Loop to handle default terms (i.e. 1 - 8) +| +| a0 = tptr d0 = tptr [0] +| a1 = bptr d1 = initial bptr [0] +| a2 = dpp-> d2 = updated bptr [0] +| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17 +| a4 = d4 = dpp->weight_B << 17 +| a5 = d5 = eptr +| macsr = 0x20 acc1 = 0x00 0000 80 +|------------------------------------------------------------------------------ + +term_default: + move.w (%a2), %d0 | a0 = a1 - (dpp->term * 8) + ext.l %d0 + lsl.l #3, %d0 + move.l %a1, %a0 + sub.l %d0, %a0 + +term_default_loop: + move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero + beq .L271 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A + mac.l %d0, %d3, %acc0 + move.l (%a1), %d1 + beq .L277 + eor.l %d1, %d0 | else compare signs + bge .L278 | if same, add delta to weight + sub.l %a3, %d3 | else subtract delta from weight + sub.l %a3, %d3 | subtract again instead of branch +.L278: add.l %a3, %d3 | add delta to weight + +.L277: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [0], store + move.l %d2, (%a1)+ + +.L275: move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero + beq .L272 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B + mac.l %d0, %d4, %acc0 + move.l (%a1), %d1 + beq .L276 + eor.l %d1, %d0 | else compare signs + bge .L281 | if same, add delta to weight + sub.l %a3, %d4 | else subtract delta from weight + sub.l %a3, %d4 | subtract again instead of branch +.L281: add.l %a3, %d4 | add delta to weight + +.L276: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [0], store + move.l %d2, (%a1)+ + +.L274: cmp.l %a1, %d5 | loop back if bptr < eptr + jbhi term_default_loop + move.w (%a2), %d0 | d0 = term - 1 + moveq.l #8, %d1 | d1 = loop counter + +.L323: subq.l #1, %d0 | back up & mask index + and.l #7, %d0 + move.l -(%a1), 40(%a2,%d0.l*4) | store dpp->samples_B [d0] + move.l -(%a1), 8(%a2,%d0.l*4) | store dpp->samples_A [d0] + subq.l #1, %d1 | loop on count + jbne .L323 + jbra finish_up + +.L271: addq.l #4, %a1 | bump pointer and jump back into loop + bra .L275 + +.L272: addq.l #4, %a1 | bump pointer and jump back into loop + bra .L274 + + +|------------------------------------------------------------------------------ +| Loop to handle term = -1 condition +| +| a0 = d0 = decorrelation sample +| a1 = bptr d1 = initial bptr [0] +| a2 = dpp-> d2 = updated bptr [0] +| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17 +| a4 = d4 = dpp->weight_B << 17 +| a5 = d5 = eptr +| a6 = d6 = 1024 << 17 +| a7 = d7 = -1024 << 17 +| macsr = 0x20 acc1 = 0x00 0000 80 +|------------------------------------------------------------------------------ + +term_minus_1: + move.l -4(%a1), %d0 | d0 = bptr [-1] + beq .L402 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A) + mac.l %d0, %d3, %acc0 + move.l (%a1), %d1 + beq .L405 + eor.l %d1, %d0 | else compare signs + bge .L404 | if same, add delta to weight + sub.l %a3, %d3 | else subtract delta from weight + cmp.l %d7, %d3 | check for negative clip limit + bge .L405 + move.l %d7, %d3 + bra .L405 + +.L404: add.l %a3, %d3 | add delta to weight + cmp.l %d6, %d3 | check for positive clip limit + ble .L405 + move.l %d6, %d3 + +.L405: move.l %acc0, %d0 | d2 = rounded product + add.l %d1, %d0 | add applied weight to bptr [0], store + move.l %d0, (%a1)+ + beq .L401 + +.L410: move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B) + mac.l %d0, %d4, %acc0 + move.l (%a1), %d1 + beq .L403 + eor.l %d1, %d0 | else compare signs + bge .L407 | if same, add delta to weight + sub.l %a3, %d4 | else subtract delta from weight + cmp.l %d7, %d4 | check for negative clip limit + bge .L403 + move.l %d7, %d4 + bra .L403 + +.L407: add.l %a3, %d4 | add delta to weight + cmp.l %d6, %d4 | check for positive clip limit + ble .L403 + move.l %d6, %d4 + +.L403: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [1], store + move.l %d2, (%a1)+ + +.L411: cmp.l %a1, %d5 | loop back if bptr < eptr + jbhi term_minus_1 + move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1] + jbra finish_up + +.L402: move.l (%a1)+, %d0 + bne .L410 + +.L401: addq.l #4, %a1 + bra .L411 + + +|------------------------------------------------------------------------------ +| Loop to handle term = -2 condition +| +| a0 = d0 = decorrelation sample +| a1 = bptr d1 = initial bptr [0] +| a2 = dpp-> d2 = updated bptr [0] +| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17 +| a4 = d4 = dpp->weight_B << 17 +| a5 = d5 = eptr +| a6 = d6 = 1024 << 17 +| a7 = d7 = -1024 << 17 +| macsr = 0x20 acc1 = 0x00 0000 80 +|------------------------------------------------------------------------------ + +term_minus_2: + move.l -8(%a1), %d0 | d0 = bptr [-2] + beq .L511 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B) + mac.l %d0, %d4, %acc0 + move.l 4(%a1), %d1 + beq .L505 + eor.l %d1, %d0 | else compare signs + bge .L504 | if same, add delta to weight + sub.l %a3, %d4 | else subtract delta from weight + cmp.l %d7, %d4 | ckeck for negative clip limit + bge .L505 + move.l %d7, %d4 + bra .L505 + +.L504: add.l %a3, %d4 | add delta to weight + cmp.l %d6, %d4 | check for positive clip limit + ble .L505 + move.l %d6, %d4 + +.L505: move.l %acc0, %d0 | d2 = rounded product + add.l %d1, %d0 | add applied weight to bptr [0], store + move.l %d0, 4(%a1) + beq .L512 + +.L510: move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A) + mac.l %d0, %d3, %acc0 + move.l (%a1), %d1 + beq .L503 + eor.l %d1, %d0 | else compare signs + bge .L507 | if same, add delta to weight + sub.l %a3, %d3 | else subtract delta from weight + cmp.l %d7, %d3 | check for negative clip limit + bge .L503 + move.l %d7, %d3 + bra .L503 + +.L507: add.l %a3, %d3 | add delta to weight + cmp.l %d6, %d3 | check for negative clip limit + ble .L503 + move.l %d6, %d3 + +.L503: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [1], store + move.l %d2, (%a1) + +.L512: addq.l #8, %a1 + cmp.l %a1, %d5 | loop if bptr < eptr + jbhi term_minus_2 + move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-4] + jbra finish_up + +.L511: move.l 4(%a1), %d0 + beq .L512 + bra .L510 + + +|------------------------------------------------------------------------------ +| Loop to handle term = -3 condition +| +| a0 = d0 = decorrelation sample +| a1 = bptr d1 = initial bptr [0] +| a2 = dpp-> d2 = updated bptr [0] +| a3 = dpp->delta << 17 d3 = dpp->weight_A << 17 +| a4 = d4 = dpp->weight_B << 17 +| a5 = d5 = eptr +| a6 = d6 = 1024 << 17 +| a7 = d7 = -1024 << 17 +| macsr = 0x20 acc1 = 0x00 0000 80 +|------------------------------------------------------------------------------ + +term_minus_3: + move.l -4(%a1), %d0 | d0 = bptr [-1] + beq .L301 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A) + mac.l %d0, %d3, %acc0 + move.l (%a1), %d1 + beq .L320 + eor.l %d1, %d0 | else compare signs + bge .L319 | if same, add delta to weight + sub.l %a3, %d3 | else subtract delta from weight + cmp.l %d7, %d3 | check for negative clip limit + bge .L320 + move.l %d7, %d3 + bra .L320 + +.L319: add.l %a3, %d3 | add delta to weight + cmp.l %d6, %d3 | check for positive clip limit + ble .L320 + move.l %d6, %d3 + +.L320: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [0], store + move.l %d2, (%a1)+ + +.L330: move.l -12(%a1), %d0 | d0 = bptr [-2] + beq .L302 + move.l %acc1, %acc0 + asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B) + mac.l %d0, %d4, %acc0 + move.l (%a1), %d1 + beq .L318 + eor.l %d1, %d0 | else compare signs + bge .L322 | if same, add delta to weight + sub.l %a3, %d4 | else subtract delta from weight + cmp.l %d7, %d4 | check for negative clip limit + bge .L318 + move.l %d7, %d4 + bra .L318 + +.L322: add.l %a3, %d4 | add delta to weight + cmp.l %d6, %d4 | check for positive clip limit + ble .L318 + move.l %d6, %d4 + +.L318: move.l %acc0, %d2 | d2 = rounded product + add.l %d1, %d2 | add applied weight to bptr [1], store + move.l %d2, (%a1)+ + +.L331: cmp.l %a1, %d5 | bptr, eptr + jbhi term_minus_3 + move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1] + move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2] + jbra finish_up + +.L301: addq.l #4, %a1 + bra .L330 + +.L302: addq.l #4, %a1 + bra .L331 + +| finish and return + +finish_up: + moveq.l #17, %d0 + asr.l %d0, %d3 + asr.l %d0, %d4 + move.w %d3, 4(%a2) | weight_A, dpp->weight_A + move.w %d4, 6(%a2) | weight_B, dpp->weight_B + + clr.l %d0 | clear up EMAC + move.l %d0, %acc0 + move.l %d0, %acc1 + +return_only: + movem.l (%sp), %d2-%d7/%a2-%a6 + lea (44,%sp), %sp + rts -- cgit v1.2.3