From 0b38c7dcbe283ba7d13531831a5367afae668e69 Mon Sep 17 00:00:00 2001
From: Thom Johansen
Date: Thu, 27 Oct 2005 00:33:38 +0000
Subject: Assembler optimised LPC routines for Coldfire. Will enable them when
 codec has seen further testing.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@7657 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libffmpegFLAC/coldfire.S | 237 +++++++++++++++++++++++++++++++++++
 apps/codecs/libffmpegFLAC/coldfire.h |   8 ++
 2 files changed, 245 insertions(+)
 create mode 100644 apps/codecs/libffmpegFLAC/coldfire.S
 create mode 100644 apps/codecs/libffmpegFLAC/coldfire.h

diff --git a/apps/codecs/libffmpegFLAC/coldfire.S b/apps/codecs/libffmpegFLAC/coldfire.S
new file mode 100644
index 0000000000..7e19e4b695
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/coldfire.S
@@ -0,0 +1,237 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by Thom Johansen
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/* The following is an assembler optimised version of the LPC filtering
+   routines needed for FLAC decoding. It is optimised for use with the
+   MCF5249 processor, or any other similar ColdFire core with the EMAC unit.
+   All LPC filtering up to order 8 is done in specially optimised unrolled
+   loops, while every order above this is handled by a slower default routine.
+ */
+    .text
+    .global lpc_decode_emac
+    .align 2
+lpc_decode_emac:
+    lea.l (-40, %sp), %sp
+    movem.l %d2-%d7/%a2-%a5, (%sp)
+    movem.l (40+4, %sp), %d0-%d2/%a0-%a1
+    /* d0 = blocksize, d1 = qlevel, d2 = pred_order
+       a0 = data, a1 = coeffs
+     */
+
+    /* the history pointer always lags 'pred_order' samples behind the data
+       pointer. since we have one loop for each order, we can hard code this
+       offset and free a register by not keeping a separate data pointer.
+     */
+    move.l %d2, %d3
+    neg.l %d3
+    lea.l (%a0, %d3.l*4), %a0       | history
+    clr.l %d3
+    move.l %d3, %macsr              | we'll need integer mode for this
+    tst.l %d0
+    jeq .exit                       | zero samples to process, exit
+    moveq.l #8, %d3
+    cmp.l %d3, %d2
+    jgt .default                    | order is over 8, jump to default case
+    lea.l .jumptable, %a4
+    move.l (%a4, %d2.l*4), %a4
+    jmp (%a4)
+    .align 4                        | avoid unaligned fetch
+.jumptable:
+    .long .exit
+    .long .order1
+    .long .order2
+    .long .order3
+    .long .order4
+    .long .order5
+    .long .order6
+    .long .order7
+    .long .order8
+
+.order8:
+    movem.l (%a1), %d3-%d7/%a2-%a4  | load lpc coefs
+    move.l (%a0)+, %a5              | load first history sample
+.loop8:
+    mac.l %a5, %a4, (%a0)+, %a5, %acc0
+    mac.l %a5, %a3, (%a0)+, %a5, %acc0
+    mac.l %a5, %a2, (%a0)+, %a5, %acc0
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration
+    movclr.l %acc0, %d2             | get sum
+    asr.l %d1, %d2                  | shift sum by lp_quantization bits
+    add.l %d2, (%a0)                | add residual and save
+    lea.l (-6*4, %a0), %a0          | history pointer points at second element
+    subq.l #1, %d0                  | decrement counter
+    jne .loop8                      | are we done?
+    jra .exit
+
+.order7:
+    movem.l (%a1), %d3-%d7/%a2-%a3
+    move.l (%a0)+, %a5
+.loop7:
+    mac.l %a5, %a3, (%a0)+, %a5, %acc0
+    mac.l %a5, %a2, (%a0)+, %a5, %acc0
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-6*4, %a0), %a5, %acc0
+    movclr.l %acc0, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    lea.l (-5*4, %a0), %a0
+    subq.l #1, %d0
+    jne .loop7
+    jra .exit
+
+.order6:
+    movem.l (%a1), %d3-%d7/%a2
+    move.l (%a0)+, %a5
+.loop6:
+    mac.l %a5, %a2, (%a0)+, %a5, %acc0
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-5*4, %a0), %a5, %acc0
+    movclr.l %acc0, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    lea.l (-4*4, %a0), %a0
+    subq.l #1, %d0
+    jne .loop6
+    jra .exit
+
+.order5:
+    movem.l (%a1), %d3-%d7
+    move.l (%a0)+, %a5
+.loop5:
+    mac.l %a5, %d7, (%a0)+, %a5, %acc0
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-4*4, %a0), %a5, %acc0
+    movclr.l %acc0, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    lea.l (-3*4, %a0), %a0
+    subq.l #1, %d0
+    jne .loop5
+    jra .exit
+
+.order4:
+    movem.l (%a1), %d3-%d6
+    move.l (%a0)+, %a5
+.loop4:
+    mac.l %a5, %d6, (%a0)+, %a5, %acc0
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-3*4, %a0), %a5, %acc0
+    movclr.l %acc0, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    subq.l #8, %a0
+    subq.l #1, %d0
+    jne .loop4
+    jra .exit
+
+.order3:
+    movem.l (%a1), %d3-%d5
+    move.l (%a0)+, %a5
+.loop3:
+    mac.l %a5, %d5, (%a0)+, %a5, %acc0
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, (-2*4, %a0), %a5, %acc0
+    movclr.l %acc0, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    subq.l #4, %a0
+    subq.l #1, %d0
+    jne .loop3
+    jra .exit
+
+.order2:
+    movem.l (%a1), %d3-%d4
+    move.l (%a0)+, %a5
+.loop2:
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3, %acc0           | data for next iteration is already loaded
+    movclr.l %acc0, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    subq.l #1, %d0
+    jne .loop2
+    jra .exit
+
+.order1:
+    | no point in using mac here
+    move.l (%a1), %d3
+.loop1:
+    move.l %d3, %d2
+    muls.l (%a0)+, %d2
+    asr.l %d1, %d2
+    add.l %d2, (%a0)
+    subq.l #1, %d0
+    jne .loop1
+    jra .exit
+
+.default:
+    /* we do the filtering in a loop unrolled by four as far as we can, and
+       then do the rest in an ordinary one-sample-per-iteration loop.
+     */
+    lea.l (%a1, %d2.l*4), %a2       | need to start at the other end of coefs
+    move.l %a0, %a3                 | working copy of history pointer
+    move.l %d2, %d3
+    lsr.l #2, %d3                   | pred_order/4, iterations of next loop
+    move.l (%a3)+, %a5              | preload data for loop
+.dloop1:
+    lea.l (-4*4, %a2), %a2          | move lpc coef pointer four coefs backwards
+    movem.l (%a2), %d4-%d7          | load four coefs
+    mac.l %a5, %d7, (%a3)+, %a5, %acc0
+    mac.l %a5, %d6, (%a3)+, %a5, %acc0
+    mac.l %a5, %d5, (%a3)+, %a5, %acc0
+    mac.l %a5, %d4, (%a3)+, %a5, %acc0
+    subq.l #1, %d3                  | any more unrolled loop operations left?
+    jne .dloop1
+
+    move.l %d2, %d3
+    moveq.l #3, %d4                 | mask 0x00000003
+    and.l %d4, %d3                  | get the remaining samples to be filtered
+    jeq .dsave                      | no remaining samples
+.dloop2:
+    move.l -(%a2), %d4              | get lpc coef
+    mac.l %a5, %d4, (%a3)+, %a5, %acc0
+    subq.l #1, %d3                  | any more iterations left?
+    jne .dloop2
+.dsave:
+    movclr.l %acc0, %d3             | get result
+    asr.l %d1, %d3                  | shift lp_quantization bits right
+    subq.l #4, %a3                  | we're one past the save location
+    add.l %d3, (%a3)                | add residual and save
+    addq.l #4, %a0                  | increment history pointer
+    subq.l #1, %d0                  | decrement data_len
+    jne .default                    | are we done?
+                                    | if so, fall through to exit
+
+.exit:
+    movem.l (%sp), %d2-%d7/%a2-%a5
+    lea.l (40, %sp), %sp
+    rts
diff --git a/apps/codecs/libffmpegFLAC/coldfire.h b/apps/codecs/libffmpegFLAC/coldfire.h
new file mode 100644
index 0000000000..5493f549f7
--- /dev/null
+++ b/apps/codecs/libffmpegFLAC/coldfire.h
@@ -0,0 +1,8 @@
+#ifndef _FLAC_COLDFIRE_H
+#define _FLAC_COLDFIRE_H
+
+#include "bitstream.h"
+
+void lpc_decode_emac(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
+
+#endif
--
cgit v1.2.3
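
For reference, every routine in the patch computes the same FLAC LPC synthesis
step; they differ only in how far the loop is unrolled. The plain-C sketch
below shows the intended arithmetic. The function name lpc_decode_ref is
hypothetical (the real entry point is lpc_decode_emac, declared in coldfire.h),
and the 32-bit sum mirrors the single 32-bit result the assembler fetches from
the EMAC accumulator with movclr.l.

#include <stdint.h>

/* Hypothetical plain-C model of what lpc_decode_emac computes. On entry,
   data[0..blocksize-1] hold residuals and data[-pred_order..-1] hold the
   already-decoded history samples, which is why the assembler rewinds its
   pointer to data - pred_order. Assumes the accumulated sum fits in 32 bits
   (as the movclr.l fetch also relies on) and that >> on a negative value is
   an arithmetic shift, matching asr.l. */
static void lpc_decode_ref(int blocksize, int qlevel, int pred_order,
                           int32_t *data, int *coeffs)
{
    for (int i = 0; i < blocksize; i++) {
        int32_t sum = 0;
        for (int j = 0; j < pred_order; j++)
            sum += coeffs[j] * data[i - j - 1];  /* newest sample gets coeffs[0] */
        data[i] += sum >> qlevel;                /* add prediction to residual in place */
    }
}

The speed of the unrolled orders comes from the parallel-load form of mac.l:
each multiply-accumulate also fetches the next history sample into %a5, so the
inner loops contain no separate load instructions and each sample is read
exactly once per output.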