From 2dbb424eb934bfb5516aef96d541098348c058a0 Mon Sep 17 00:00:00 2001 From: Andree Buschmann Date: Sun, 30 Aug 2009 14:14:22 +0000 Subject: Further performance optimization of the atrac3 decoder. Rework the internal sample representation and usage of dsp routines. For now a quick and dirty solution is used to add a fract part of 2 bits. Through this several buffers and functions as well as copy loops could be removed. Furthermore add some ASM for coldfire and place some additional data in IRAM on PP5022/24 and X5/M5. Speedup on ARM: +3%, speedup on Coldfire: +639%. Both ARM and Coldfire can decode in realtime now. git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22561 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libatrac/fixp_math.h | 58 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) (limited to 'apps/codecs/libatrac/fixp_math.h') diff --git a/apps/codecs/libatrac/fixp_math.h b/apps/codecs/libatrac/fixp_math.h index 88cb5e4b66..5174cc7cc6 100644 --- a/apps/codecs/libatrac/fixp_math.h +++ b/apps/codecs/libatrac/fixp_math.h @@ -36,17 +36,38 @@ : "r"(X),"r"(Y)); \ low; \ }) - - #define fixmul32(X,Y) \ - ({ \ - int32_t low; \ - int32_t high; \ - asm volatile ( /* calculates: result = (X*Y)>>32 */ \ - "smull %0,%1,%2,%3 \n\t" /* 64 = 32x32 multiply */ \ - : "=&r"(low), "=&r" (high) \ - : "r"(X),"r"(Y)); \ - high; \ - }) +#elif defined(CPU_COLDFIRE) + #define fixmul16(X,Y) \ + ({ \ + int32_t t1, t2; \ + asm volatile ( \ + "mac.l %[x],%[y],%%acc0\n\t" /* multiply */ \ + "mulu.l %[y],%[x] \n\t" /* get lower half, avoid emac stall */ \ + "movclr.l %%acc0,%[t1] \n\t" /* get higher half */ \ + "moveq.l #15,%[t2] \n\t" \ + "asl.l %[t2],%[t1] \n\t" /* hi <<= 15, plus one free */ \ + "moveq.l #16,%[t2] \n\t" \ + "lsr.l %[t2],%[x] \n\t" /* (unsigned)lo >>= 16 */ \ + "or.l %[x],%[t1] \n\t" /* combine result */ \ + : /* outputs */ \ + [t1]"=&d"(t1), \ + [t2]"=&d"(t2) \ + : /* inputs */ \ + [x] "d" ((X)), \ + [y] "d" ((Y))); \ + t1; \ + }) + + #define fixmul31(X,Y) \ + ({ \ + int32_t t; \ + asm volatile ( \ + "mac.l %[x], %[y], %%acc0\n\t" /* multiply */ \ + "movclr.l %%acc0, %[t]\n\t" /* get higher half as result */ \ + : [t] "=d" (t) \ + : [x] "r" ((X)), [y] "r" ((Y))); \ + t; \ + }) #else static inline int32_t fixmul16(int32_t x, int32_t y) { @@ -69,17 +90,6 @@ return (int32_t)temp; } - - static inline int32_t fixmul32(int32_t x, int32_t y) - { - int64_t temp; - temp = x; - temp *= y; - - temp >>= 32; //16+31-16 = 31 bits - - return (int32_t)temp; - } #endif static inline int32_t fixdiv16(int32_t x, int32_t y) @@ -104,13 +114,13 @@ static inline int32_t fastSqrt(int32_t n) /* * Logically, these are unsigned. * We need the sign bit to test - * whether (op - res - one) underflowed. + * whether (op - res - one) underflowed. */ int32_t op, res, one; op = n; res = 0; /* "one" starts at the highest power of four <= than the argument. */ - one = 1 << 30; /* second-to-top bit set */ + one = 1 << 30; /* second-to-top bit set */ while (one > op) one >>= 2; while (one != 0) { -- cgit v1.2.3