From e78186f4cfe0adae87cf499b73a75807d019d9bc Mon Sep 17 00:00:00 2001 From: Thom Johansen Date: Sun, 6 Mar 2005 22:13:44 +0000 Subject: Added asm optimized short block IMDCT and windowing. Removed a warning in synth.c. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6159 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libmad/layer3.c | 112 ++++++++++++++++++++++++++++++++++++++++++++ apps/codecs/libmad/synth.c | 7 +-- 2 files changed, 116 insertions(+), 3 deletions(-) (limited to 'apps/codecs') diff --git a/apps/codecs/libmad/layer3.c b/apps/codecs/libmad/layer3.c index b1a9919af0..27c8d18430 100644 --- a/apps/codecs/libmad/layer3.c +++ b/apps/codecs/libmad/layer3.c @@ -2144,6 +2144,116 @@ void III_imdct_l(mad_fixed_t const X[18], mad_fixed_t z[36], * NAME: III_imdct_s() * DESCRIPTION: perform IMDCT and windowing for short blocks */ + +# if CONFIG_CPU==MCF5249 && !defined(SIMULATOR) +/* this should probably be stuffed in a .S file somewhere, it's almost + 100% asm as it is. + */ +static +void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36]) +{ + mad_fixed_t y[36], *yptr; + mad_fixed_t const *wptr; + + /* IMDCT */ + yptr = &y[0]; + + /* if additional precision is needed in this block, it is possible to + * get more low bits out of the accext01 register _before_ doing the + * movclrs. + */ + asm volatile ( + "move.l #0x000000b0, %%macsr\n\t" /* frac. mode, saturation, rounding */ + "suba.l %%a0, %%a0\n\t" /* clear loop variable */ + ".align 2\n\t.imdctloop:\n\t" /* outer loop label */ + "lea.l imdct_s, %%a1\n\t" /* load pointer to imdct coefs in a1 */ + "movem.l (%[X]), %%d0-%%d5\n\t" /* load input data in d0-d5 */ + + "clr.l %%d7\n\t" /* init loop variable */ + "move.l (%%a1)+, %%a5\n\t" /* load imdct coef in a5 */ + ".align 2\n\t.macloop:\n\t" /* inner loop label */ + "mac.l %%d0, %%a5, (%%a1)+, %%a5, %%acc0\n\t" /* mac sequence */ + "mac.l %%d1, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "movclr.l %%acc0, %%d6\n\t" /* get result, left shifted once */ + "asl.l #3, %%d6\n\t" /* got one shift free, shift three more */ + "mov.l %%d6, (%[yptr], %%d7.l*4)\n\t" /* yptr[i] = result */ + "neg.l %%d6\n\t" + "neg.l %%d7\n\t" + "mov.l %%d6, (5*4, %[yptr], %%d7.l*4)\n\t" /* yptr[5 - 1] = -result */ + "mac.l %%d0, %%a5, (%%a1)+, %%a5, %%acc0\n\t" /* mac sequence */ + "mac.l %%d1, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d4, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "mac.l %%d5, %%a5, (%%a1)+, %%a5, %%acc0\n\t" + "movclr.l %%acc0, %%d6\n\t" /* get result */ + "asl.l #3, %%d6\n\t" + "mov.l %%d6, (11*4, %[yptr], %%d7.l*4)\n\t" /* yptr[11 - i] = result*/ + "neg.l %%d7\n\t" + "mov.l %%d6, (6*4, %[yptr], %%d7.l*4)\n\t" /* yptr[i + 6] = result */ + "addq.l #1, %%d7\n\t" /* increment inner loop variable */ + "cmp.l #3, %%d7\n\t" /* we do three inner loop iterations */ + "jne .macloop\n\t" + + "adda.l #48, %[yptr]\n\t" /* add pointer increment */ + "adda.l #24, %[X]\n\t" + "addq.l #1, %%a0\n\t" /* increment outer loop variable */ + "cmpa.l #3, %%a0\n\t" /* we do three outer loop iterations */ + "jne .imdctloop\n\t" + : [X] "+a" (X), [yptr] "+a" (yptr) + : : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "a0", "a1", "a5"); + + /* windowing, overlapping and concatenation */ + + yptr = &y[0]; + wptr = &window_s[0]; + + asm volatile ( + "clr.l %%d7\n\t" + ".align 2\n\t.overlaploop:\n\t" + "clr.l (%[z], %%d7.l*4)\n\t" /* z[i + 0] = 0 */ + "move.l (%[wptr]), %%d0\n\t" + "move.l (%[yptr]), %%d2\n\t" + "mac.l %%d0, %%d2, 24(%[wptr]), %%d1, %%acc0\n\t" + "movclr.l %%acc0, %%d6\n\t" + "asl.l #3, %%d6\n\t" + "move.l %%d6, (6*4, %[z], %%d7.l*4)\n\t" /* z[i + 6] = result */ + + "move.l 24(%[yptr]), %%d2\n\t" + "mac.l %%d1, %%d2, 48(%[yptr]), %%d2, %%acc0\n\t" + "mac.l %%d0, %%d2, 72(%[yptr]), %%d2, %%acc0\n\t" + "movclr.l %%acc0, %%d6\n\t" + "asl.l #3, %%d6\n\t" + "move.l %%d6, (12*4, %[z], %%d7.l*4)\n\t" /* z[i + 12] = result */ + + "mac.l %%d1, %%d2, (24*4, %[yptr]), %%d2, %%acc0\n\t" + "mac.l %%d0, %%d2, (30*4, %[yptr]), %%d2, %%acc0\n\t" + "movclr.l %%acc0, %%d6\n\t" + "asl.l #3, %%d6\n\t" + "move.l %%d6, (18*4, %[z], %%d7.l*4)\n\t" /* z[i + 18] = result */ + + "mac.l %%d1, %%d2, %%acc0\n\t" + "movclr.l %%acc0, %%d6\n\t" + "asl.l #3, %%d6\n\t" + "move.l %%d6, (24*4, %[z], %%d7.l*4)\n\t" /* z[i + 24] = result */ + + "clr.l (30*4, %[z], %%d7.l*4)\n\t" /* z[i + 30] = 0 */ + "addq.l #1, %%d7\n\t" + "addq.l #4, %[yptr]\n\t" + "addq.l #4, %[wptr]\n\t" + "cmp.l #6, %%d7\n\t" /* six iterations */ + "jne .overlaploop\n\t" + : [yptr] "+a" (yptr), [wptr] "+a" (wptr) + : [z] "a" (z) + : "d7"); +} + +#else + static void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36]) { @@ -2219,6 +2329,8 @@ void III_imdct_s(mad_fixed_t const X[18], mad_fixed_t z[36]) } } +#endif + /* * NAME: III_overlap() * DESCRIPTION: perform overlap-add of windowed IMDCT outputs diff --git a/apps/codecs/libmad/synth.c b/apps/codecs/libmad/synth.c index 530f33cdf6..2d212c091f 100644 --- a/apps/codecs/libmad/synth.c +++ b/apps/codecs/libmad/synth.c @@ -24,6 +24,7 @@ # endif # include "global.h" + # include "fixed.h" # include "frame.h" # include "synth.h" @@ -100,6 +101,7 @@ void mad_synth_mute(struct mad_synth *synth) # endif /* possible DCT speed optimization */ + # if defined(OPT_SPEED) && defined(MAD_F_MLX) # define OPT_DCTO # define MUL(x, y) \ @@ -112,6 +114,7 @@ void mad_synth_mute(struct mad_synth *synth) # undef OPT_DCTO # define MUL(x, y) mad_f_mul((x), (y)) # endif + /* * NAME: dct32() * DESCRIPTION: perform fast in[32]->out[32] DCT @@ -547,7 +550,6 @@ mad_fixed_t const D[17][32] __attribute__ ((section(".idata"))) = { void synth_full(struct mad_synth *, struct mad_frame const *, unsigned int, unsigned int); # else - /* * NAME: synth->full() * DESCRIPTION: perform full frequency PCM synthesis @@ -563,7 +565,7 @@ void synth_full(struct mad_synth *synth, struct mad_frame const *frame, mad_fixed_t *pcm1, *pcm2, (*filter)[2][2][16][8]; mad_fixed_t const (*sbsample)[36][32]; mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; - mad_fixed_t const (*Dptr)[32], *ptr; + mad_fixed_t const (*Dptr)[32]; mad_fixed64hi_t hi = 0; mad_fixed64lo_t lo; @@ -1010,4 +1012,3 @@ void mad_synth_frame(struct mad_synth *synth, struct mad_frame const *frame) synth->phase = (synth->phase + ns) % 16; } - -- cgit v1.2.3