From 601ede7f9cc88cc40e074cc9d9cfdc2c0ba46d4c Mon Sep 17 00:00:00 2001 From: Dave Chapman Date: Sun, 10 Jun 2007 08:55:16 +0000 Subject: C optimisations to the predictor decoding - create a single function for decoding stereo streams, and reorganise to minimise the number of variables used. My -c1000 test track now decodes at 93% realtime on PortalPlayer (was 78%), 187% on Coldfire (was 170%) and 447% on Gigabeat (was 408%). git-svn-id: svn://svn.rockbox.org/rockbox/trunk@13608 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/demac/libdemac/decoder.c | 6 +- apps/codecs/demac/libdemac/parser.h | 31 ++-- apps/codecs/demac/libdemac/predictor.c | 288 ++++++++++++++++++++------------- apps/codecs/demac/libdemac/predictor.h | 8 +- 4 files changed, 203 insertions(+), 130 deletions(-) (limited to 'apps') diff --git a/apps/codecs/demac/libdemac/decoder.c b/apps/codecs/demac/libdemac/decoder.c index ba8c393a67..4f4a583d00 100644 --- a/apps/codecs/demac/libdemac/decoder.c +++ b/apps/codecs/demac/libdemac/decoder.c @@ -47,7 +47,7 @@ void init_frame_decoder(struct ape_ctx_t* ape_ctx, //printf("CRC=0x%08x\n",ape_ctx->CRC); //printf("Flags=0x%08x\n",ape_ctx->frameflags); - init_predictor_decoder(ape_ctx); + init_predictor_decoder(&ape_ctx->predictor); switch (ape_ctx->compressiontype) { @@ -117,7 +117,7 @@ int decode_chunk(struct ape_ctx_t* ape_ctx, } /* Now apply the predictor decoding */ - predictor_decode_mono(ape_ctx,decoded0,count); + predictor_decode_mono(&ape_ctx->predictor,decoded0,count); if (ape_ctx->channels==2) { /* Pseudo-stereo - just copy left channel to right channel */ @@ -163,7 +163,7 @@ int decode_chunk(struct ape_ctx_t* ape_ctx, } /* Now apply the predictor decoding */ - predictor_decode_stereo(ape_ctx,decoded0,decoded1,count); + predictor_decode_stereo(&ape_ctx->predictor,decoded0,decoded1,count); if (ape_ctx->bps == 8) { /* TODO: Handle 8-bit streams */ diff --git a/apps/codecs/demac/libdemac/parser.h b/apps/codecs/demac/libdemac/parser.h index 0e35425315..301cf4a5e1 100644 --- a/apps/codecs/demac/libdemac/parser.h +++ b/apps/codecs/demac/libdemac/parser.h @@ -68,24 +68,28 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #define HISTORY_SIZE 512 #define PREDICTOR_ORDER 8 +/* Total size of all predictor histories - 50 * sizeof(int32_t) */ +#define PREDICTOR_SIZE 50 struct predictor_t { - /* Adaption co-efficients */ - int32_t coeffsA[4]; - int32_t coeffsB[5]; - /* Filter histories */ - int32_t historybuffer[HISTORY_SIZE + PREDICTOR_ORDER * 4]; - int32_t* delayA; - int32_t* delayB; - int32_t* adaptcoeffsA; - int32_t* adaptcoeffsB; + int32_t* buf; - int32_t lastA; + int32_t YlastA; + int32_t XlastA; - int32_t filterA; - int32_t filterB; + int32_t YfilterA; + int32_t XfilterA; + int32_t YfilterB; + int32_t XfilterB; + + /* Adaption co-efficients */ + int32_t YcoeffsA[4]; + int32_t XcoeffsA[4]; + int32_t YcoeffsB[5]; + int32_t XcoeffsB[5]; + int32_t historybuffer[HISTORY_SIZE + PREDICTOR_SIZE]; }; struct ape_ctx_t @@ -129,8 +133,7 @@ struct ape_ctx_t int frameflags; int currentframeblocks; int blocksdecoded; - struct predictor_t predictorY; - struct predictor_t predictorX; + struct predictor_t predictor; }; int ape_parseheader(int fd, struct ape_ctx_t* ape_ctx); diff --git a/apps/codecs/demac/libdemac/predictor.c b/apps/codecs/demac/libdemac/predictor.c index 9531786fd1..a7210bf014 100644 --- a/apps/codecs/demac/libdemac/predictor.c +++ b/apps/codecs/demac/libdemac/predictor.c @@ -37,160 +37,230 @@ static const int32_t initial_coeffs[4] = { 360, 317, -109, 98 }; -static void init_predictor(struct predictor_t* p) +#define YDELAYA (18 + PREDICTOR_ORDER*4) +#define YDELAYB (18 + PREDICTOR_ORDER*3) +#define XDELAYA (18 + PREDICTOR_ORDER*2) +#define XDELAYB (18 + PREDICTOR_ORDER) + +#define YADAPTCOEFFSA (18) +#define XADAPTCOEFFSA (14) +#define YADAPTCOEFFSB (10) +#define XADAPTCOEFFSB (5) + +void init_predictor_decoder(struct predictor_t* p) { /* Zero the history buffers */ - memset(p->historybuffer, 0, (PREDICTOR_ORDER*4) * sizeof(int32_t)); - p->delayA = p->historybuffer + PREDICTOR_ORDER*4; - p->delayB = p->historybuffer + PREDICTOR_ORDER*3; - p->adaptcoeffsA = p->historybuffer + PREDICTOR_ORDER*2; - p->adaptcoeffsB = p->historybuffer + PREDICTOR_ORDER; + memset(p->historybuffer, 0, PREDICTOR_SIZE * sizeof(int32_t)); + p->buf = p->historybuffer; /* Initialise and zero the co-efficients */ - memcpy(p->coeffsA, initial_coeffs, sizeof(initial_coeffs)); - memset(p->coeffsB, 0, sizeof(p->coeffsB)); - - p->filterA = 0; - p->filterB = 0; - - p->lastA = 0; + memcpy(p->YcoeffsA, initial_coeffs, sizeof(initial_coeffs)); + memcpy(p->XcoeffsA, initial_coeffs, sizeof(initial_coeffs)); + memset(p->YcoeffsB, 0, sizeof(p->YcoeffsB)); + memset(p->XcoeffsB, 0, sizeof(p->XcoeffsB)); + + p->YfilterA = 0; + p->YfilterB = 0; + p->YlastA = 0; + + p->XfilterA = 0; + p->XfilterB = 0; + p->XlastA = 0; } -static int do_predictor_decode(struct predictor_t* p, int32_t A, int32_t B) -{ - int32_t predictionA, predictionB, currentA; - - p->delayA[0] = p->lastA; - p->delayA[-1] = p->delayA[0] - p->delayA[-1]; - - predictionA = scalarproduct4_rev32(p->coeffsA,p->delayA); - - /* Apply a scaled first-order filter compression */ - p->delayB[0] = B - ((p->filterB * 31) >> 5); - p->filterB = B; - - p->delayB[-1] = p->delayB[0] - p->delayB[-1]; - - predictionB = scalarproduct5_rev32(p->coeffsB,p->delayB); +#ifdef CPU_COLDFIRE +/* Putting this in IRAM makes a small speedup (e.g. 186% -> 187% + realtime for a -c1000 file on Coldfire, but is slower on PP. */ +int predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0, int32_t* decoded1, int count) ICODE_ATTR; +#endif - currentA = A + ((predictionA + (predictionB >> 1)) >> 10); - - p->adaptcoeffsA[0] = SIGN(p->delayA[0]); - p->adaptcoeffsA[-1] = SIGN(p->delayA[-1]); - - p->adaptcoeffsB[0] = SIGN(p->delayB[0]); - p->adaptcoeffsB[-1] = SIGN(p->delayB[-1]); +int predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0, int32_t* decoded1, int count) +{ + int32_t predictionA, predictionB; - if (A > 0) - { - vector_sub4_rev32(p->coeffsA, p->adaptcoeffsA); - vector_sub5_rev32(p->coeffsB, p->adaptcoeffsB); - } - else if (A < 0) + while (count--) { - vector_add4_rev32(p->coeffsA, p->adaptcoeffsA); - vector_add5_rev32(p->coeffsB, p->adaptcoeffsB); - } - - p->delayA++; - p->delayB++; - p->adaptcoeffsA++; - p->adaptcoeffsB++; - - /* Have we filled the history buffer? */ - if (p->delayA == p->historybuffer + HISTORY_SIZE + (PREDICTOR_ORDER*4)) { - memmove(p->historybuffer, p->delayA - (PREDICTOR_ORDER*4), - (PREDICTOR_ORDER*4) * sizeof(int32_t)); - p->delayA = p->historybuffer + PREDICTOR_ORDER*4; - p->delayB = p->historybuffer + PREDICTOR_ORDER*3; - p->adaptcoeffsA = p->historybuffer + PREDICTOR_ORDER*2; - p->adaptcoeffsB = p->historybuffer + PREDICTOR_ORDER; - } + /* Predictor Y */ + p->buf[YDELAYA] = p->YlastA; + p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]); + + p->buf[YDELAYA-1] = p->buf[YDELAYA] - p->buf[YDELAYA-1]; + p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]); + + predictionA = (p->buf[YDELAYA] * p->YcoeffsA[0]) + + (p->buf[YDELAYA-1] * p->YcoeffsA[1]) + + (p->buf[YDELAYA-2] * p->YcoeffsA[2]) + + (p->buf[YDELAYA-3] * p->YcoeffsA[3]); + + /* Apply a scaled first-order filter compression */ + p->buf[YDELAYB] = p->XfilterA - ((p->YfilterB * 31) >> 5); + p->buf[YADAPTCOEFFSB] = SIGN(p->buf[YDELAYB]); + p->YfilterB = p->XfilterA; + + p->buf[YDELAYB-1] = p->buf[YDELAYB] - p->buf[YDELAYB-1]; + p->buf[YADAPTCOEFFSB-1] = SIGN(p->buf[YDELAYB-1]); + + predictionB = (p->buf[YDELAYB] * p->YcoeffsB[0]) + + (p->buf[YDELAYB-1] * p->YcoeffsB[1]) + + (p->buf[YDELAYB-2] * p->YcoeffsB[2]) + + (p->buf[YDELAYB-3] * p->YcoeffsB[3]) + + (p->buf[YDELAYB-4] * p->YcoeffsB[4]); + + p->YlastA = *decoded0 + ((predictionA + (predictionB >> 1)) >> 10); + p->YfilterA = p->YlastA + ((p->YfilterA * 31) >> 5); + + /* Predictor X */ + + p->buf[XDELAYA] = p->XlastA; + p->buf[XADAPTCOEFFSA] = SIGN(p->buf[XDELAYA]); + p->buf[XDELAYA-1] = p->buf[XDELAYA] - p->buf[XDELAYA-1]; + p->buf[XADAPTCOEFFSA-1] = SIGN(p->buf[XDELAYA-1]); + + predictionA = (p->buf[XDELAYA] * p->XcoeffsA[0]) + + (p->buf[XDELAYA-1] * p->XcoeffsA[1]) + + (p->buf[XDELAYA-2] * p->XcoeffsA[2]) + + (p->buf[XDELAYA-3] * p->XcoeffsA[3]); + + /* Apply a scaled first-order filter compression */ + p->buf[XDELAYB] = p->YfilterA - ((p->XfilterB * 31) >> 5); + p->buf[XADAPTCOEFFSB] = SIGN(p->buf[XDELAYB]); + p->XfilterB = p->YfilterA; + p->buf[XDELAYB-1] = p->buf[XDELAYB] - p->buf[XDELAYB-1]; + p->buf[XADAPTCOEFFSB-1] = SIGN(p->buf[XDELAYB-1]); + + predictionB = (p->buf[XDELAYB] * p->XcoeffsB[0]) + + (p->buf[XDELAYB-1] * p->XcoeffsB[1]) + + (p->buf[XDELAYB-2] * p->XcoeffsB[2]) + + (p->buf[XDELAYB-3] * p->XcoeffsB[3]) + + (p->buf[XDELAYB-4] * p->XcoeffsB[4]); + + p->XlastA = *decoded1 + ((predictionA + (predictionB >> 1)) >> 10); + p->XfilterA = p->XlastA + ((p->XfilterA * 31) >> 5); + + if (*decoded0 > 0) + { + p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3]; + + p->YcoeffsB[0] -= p->buf[YADAPTCOEFFSB]; + p->YcoeffsB[1] -= p->buf[YADAPTCOEFFSB-1]; + p->YcoeffsB[2] -= p->buf[YADAPTCOEFFSB-2]; + p->YcoeffsB[3] -= p->buf[YADAPTCOEFFSB-3]; + p->YcoeffsB[4] -= p->buf[YADAPTCOEFFSB-4]; + } + else if (*decoded0 < 0) + { + p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3]; + + p->YcoeffsB[0] += p->buf[YADAPTCOEFFSB]; + p->YcoeffsB[1] += p->buf[YADAPTCOEFFSB-1]; + p->YcoeffsB[2] += p->buf[YADAPTCOEFFSB-2]; + p->YcoeffsB[3] += p->buf[YADAPTCOEFFSB-3]; + p->YcoeffsB[4] += p->buf[YADAPTCOEFFSB-4]; + } - p->lastA = currentA; - p->filterA = currentA + ((p->filterA * 31) >> 5); + *(decoded0++) = p->YfilterA; - return p->filterA; -} - -static int32_t X; + if (*decoded1 > 0) + { + p->XcoeffsA[0] -= p->buf[XADAPTCOEFFSA]; + p->XcoeffsA[1] -= p->buf[XADAPTCOEFFSA-1]; + p->XcoeffsA[2] -= p->buf[XADAPTCOEFFSA-2]; + p->XcoeffsA[3] -= p->buf[XADAPTCOEFFSA-3]; + + p->XcoeffsB[0] -= p->buf[XADAPTCOEFFSB]; + p->XcoeffsB[1] -= p->buf[XADAPTCOEFFSB-1]; + p->XcoeffsB[2] -= p->buf[XADAPTCOEFFSB-2]; + p->XcoeffsB[3] -= p->buf[XADAPTCOEFFSB-3]; + p->XcoeffsB[4] -= p->buf[XADAPTCOEFFSB-4]; + } + else if (*decoded1 < 0) + { + p->XcoeffsA[0] += p->buf[XADAPTCOEFFSA]; + p->XcoeffsA[1] += p->buf[XADAPTCOEFFSA-1]; + p->XcoeffsA[2] += p->buf[XADAPTCOEFFSA-2]; + p->XcoeffsA[3] += p->buf[XADAPTCOEFFSA-3]; + + p->XcoeffsB[0] += p->buf[XADAPTCOEFFSB]; + p->XcoeffsB[1] += p->buf[XADAPTCOEFFSB-1]; + p->XcoeffsB[2] += p->buf[XADAPTCOEFFSB-2]; + p->XcoeffsB[3] += p->buf[XADAPTCOEFFSB-3]; + p->XcoeffsB[4] += p->buf[XADAPTCOEFFSB-4]; + } -void init_predictor_decoder(struct ape_ctx_t* ape_ctx) -{ - X = 0; + *(decoded1++) = p->XfilterA; - init_predictor(&ape_ctx->predictorY); - init_predictor(&ape_ctx->predictorX); -} + /* Combined */ + p->buf++; -int predictor_decode_stereo(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int32_t* decoded1, int count) ICODE_ATTR; -int predictor_decode_stereo(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int32_t* decoded1, int count) -{ - while (count--) - { - *decoded0 = do_predictor_decode(&ape_ctx->predictorY, *decoded0, X); - X = do_predictor_decode(&ape_ctx->predictorX, *decoded1, *(decoded0)++); - *(decoded1++) = X; + /* Have we filled the history buffer? */ + if (p->buf == p->historybuffer + HISTORY_SIZE) { + memmove(p->historybuffer, p->buf, + PREDICTOR_SIZE * sizeof(int32_t)); + p->buf = p->historybuffer; + } } return 0; } -int predictor_decode_mono(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int count) +int predictor_decode_mono(struct predictor_t* p, int32_t* decoded0, int count) { - struct predictor_t* p = &ape_ctx->predictorY; int32_t predictionA, currentA, A; - currentA = p->lastA; + currentA = p->YlastA; while (count--) { A = *decoded0; - p->delayA[0] = currentA; - p->delayA[-1] = p->delayA[0] - p->delayA[-1]; + p->buf[YDELAYA] = currentA; + p->buf[YDELAYA-1] = p->buf[YDELAYA] - p->buf[YDELAYA-1]; - predictionA = (p->delayA[0] * p->coeffsA[0]) + - (p->delayA[-1] * p->coeffsA[1]) + - (p->delayA[-2] * p->coeffsA[2]) + - (p->delayA[-3] * p->coeffsA[3]); + predictionA = (p->buf[YDELAYA] * p->YcoeffsA[0]) + + (p->buf[YDELAYA-1] * p->YcoeffsA[1]) + + (p->buf[YDELAYA-2] * p->YcoeffsA[2]) + + (p->buf[YDELAYA-3] * p->YcoeffsA[3]); currentA = A + (predictionA >> 10); - p->adaptcoeffsA[0] = SIGN(p->delayA[0]); - p->adaptcoeffsA[-1] = SIGN(p->delayA[-1]); + p->buf[YADAPTCOEFFSA] = SIGN(p->buf[YDELAYA]); + p->buf[YADAPTCOEFFSA-1] = SIGN(p->buf[YDELAYA-1]); if (A > 0) { - p->coeffsA[0] -= p->adaptcoeffsA[0]; - p->coeffsA[1] -= p->adaptcoeffsA[-1]; - p->coeffsA[2] -= p->adaptcoeffsA[-2]; - p->coeffsA[3] -= p->adaptcoeffsA[-3]; + p->YcoeffsA[0] -= p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] -= p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] -= p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] -= p->buf[YADAPTCOEFFSA-3]; } else if (A < 0) { - p->coeffsA[0] += p->adaptcoeffsA[0]; - p->coeffsA[1] += p->adaptcoeffsA[-1]; - p->coeffsA[2] += p->adaptcoeffsA[-2]; - p->coeffsA[3] += p->adaptcoeffsA[-3]; + p->YcoeffsA[0] += p->buf[YADAPTCOEFFSA]; + p->YcoeffsA[1] += p->buf[YADAPTCOEFFSA-1]; + p->YcoeffsA[2] += p->buf[YADAPTCOEFFSA-2]; + p->YcoeffsA[3] += p->buf[YADAPTCOEFFSA-3]; } - p->delayA++; - p->adaptcoeffsA++; + p->buf++; /* Have we filled the history buffer? */ - if (p->delayA == p->historybuffer + HISTORY_SIZE + (PREDICTOR_ORDER*4)) { - memmove(p->historybuffer, p->delayA - (PREDICTOR_ORDER*4), - (PREDICTOR_ORDER*4) * sizeof(int32_t)); - p->delayA = p->historybuffer + PREDICTOR_ORDER*4; - p->adaptcoeffsA = p->historybuffer + PREDICTOR_ORDER*2; + if (p->buf == p->historybuffer + HISTORY_SIZE) { + memmove(p->historybuffer, p->buf, + PREDICTOR_SIZE * sizeof(int32_t)); + p->buf = p->historybuffer; } - p->filterA = currentA + ((p->filterA * 31) >> 5); - *(decoded0++) = p->filterA; + p->YfilterA = currentA + ((p->YfilterA * 31) >> 5); + *(decoded0++) = p->YfilterA; } - p->lastA = currentA; + p->YlastA = currentA; return 0; } diff --git a/apps/codecs/demac/libdemac/predictor.h b/apps/codecs/demac/libdemac/predictor.h index 3c023c8188..df2ba629e6 100644 --- a/apps/codecs/demac/libdemac/predictor.h +++ b/apps/codecs/demac/libdemac/predictor.h @@ -2,7 +2,7 @@ libdemac - A Monkey's Audio decoder -$Id:$ +$Id$ Copyright (C) Dave Chapman 2007 @@ -29,8 +29,8 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA #include "parser.h" #include "filter.h" -void init_predictor_decoder(struct ape_ctx_t* ape_ctx); -int predictor_decode_stereo(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int32_t* decoded1, int count); -int predictor_decode_mono(struct ape_ctx_t* ape_ctx, int32_t* decoded0, int count); +void init_predictor_decoder(struct predictor_t* p); +int predictor_decode_stereo(struct predictor_t* p, int32_t* decoded0, int32_t* decoded1, int count); +int predictor_decode_mono(struct predictor_t* p, int32_t* decoded0, int count); #endif -- cgit v1.2.3