From 8e46ab85a9a1c50589920897763ce53e593c9369 Mon Sep 17 00:00:00 2001 From: Dave Chapman Date: Tue, 7 Feb 2006 22:16:35 +0000 Subject: Patch #1426489 - Shorten codec optimisations from Mark Arigo git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8615 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libffmpegFLAC/shndec.c | 190 +++++++++++++++++++++++++++---------- apps/codecs/libffmpegFLAC/shndec.h | 44 ++++++--- 2 files changed, 172 insertions(+), 62 deletions(-) (limited to 'apps/codecs/libffmpegFLAC') diff --git a/apps/codecs/libffmpegFLAC/shndec.c b/apps/codecs/libffmpegFLAC/shndec.c index 6dde8f7a70..d7fc6a15a6 100644 --- a/apps/codecs/libffmpegFLAC/shndec.c +++ b/apps/codecs/libffmpegFLAC/shndec.c @@ -28,12 +28,6 @@ #include "golomb.h" #include "shndec.h" -/* These seem reasonable from my test files. - Does MAX_HEADER_SIZE really need to be 16384? */ -#define MAX_PRED_ORDER 16 -#define MAX_HEADER_SIZE DEFAULT_BLOCK_SIZE*4 -//#define MAX_HEADER_SIZE 16384 - #define ULONGSIZE 2 #define WAVE_FORMAT_PCM 0x0001 @@ -54,16 +48,6 @@ #define V2LPCQOFFSET (1 << LPCQUANT) #define FNSIZE 2 -#define FN_DIFF0 0 -#define FN_DIFF1 1 -#define FN_DIFF2 2 -#define FN_DIFF3 3 -#define FN_QUIT 4 -#define FN_BLOCKSIZE 5 -#define FN_BITSHIFT 6 -#define FN_QLPC 7 -#define FN_ZERO 8 -#define FN_VERBATIM 9 #define VERBATIM_CKSIZE_SIZE 5 #define VERBATIM_BYTE_SIZE 8 @@ -76,22 +60,21 @@ #define get_le16(gb) bswap_16(get_bits_long(gb, 16)) #define get_le32(gb) bswap_32(get_bits_long(gb, 32)) -static inline uint32_t bswap_32(uint32_t x){ +static uint32_t bswap_32(uint32_t x){ x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); return (x>>16) | (x<<16); } -static inline uint16_t bswap_16(uint16_t x){ +static uint16_t bswap_16(uint16_t x){ return (x>>8) | (x<<8); } /* converts fourcc string to int */ -static inline int ff_get_fourcc(const char *s){ +static int ff_get_fourcc(const char *s){ //assert( strlen(s)==4 ); return (s[0]) + (s[1]<<8) + (s[2]<<16) + (s[3]<<24); } -static unsigned int get_uint(ShortenContext *s, int k) ICODE_ATTR; static unsigned int get_uint(ShortenContext *s, int k) { if (s->version != 0) @@ -99,10 +82,77 @@ static unsigned int get_uint(ShortenContext *s, int k) return get_ur_golomb_shorten(&s->gb, k); } -static void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, - int residual_size, int pred_order) ICODE_ATTR; -static void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, - int residual_size, int pred_order) +#if defined(CPU_COLDFIRE) && !defined(SIMULATOR) +static void coldfire_lshift_samples(int n, int shift, int32_t *samples) ICODE_ATTR; +static void coldfire_lshift_samples(int n, int shift, int32_t *samples) +{ +/* + for (i = 0; i < n; i++) + samples[i] =<< shift; +*/ + asm volatile ( + "move.l %[n], %%d0 \n" /* d0 = loop counter */ + "asr.l #2, %%d0 \n" + "beq l1_shift \n" + "l2_shift:" /* main loop (unroll by 4) */ + "movem.l (%[x]), %%d4-%%d7 \n" + "asl.l %[s], %%d4 \n" + "asl.l %[s], %%d5 \n" + "asl.l %[s], %%d6 \n" + "asl.l %[s], %%d7 \n" + "movem.l %%d4-%%d7, (%[x]) \n" + "add.l #16, %[x] \n" + + "subq.l #1, %%d0 \n" + "bne l2_shift \n" + "l1_shift:" /* any loops left? */ + "and.l #3, %[n] \n" + "beq l4_shift \n" + "l3_shift:" /* remaining loops */ + "move.l (%[x]), %%d4 \n" + "asl.l %[s], %%d4 \n" + "move.l %%d4, (%[x])+ \n" + + "subq.l #1, %[n] \n" + "bne l3_shift \n" + "l4_shift:" /* exit */ + : [n] "+d" (n), /* d1 */ + [s] "+d" (shift), /* d2 */ + [x] "+a" (samples) /* a0 */ + : + : "%d0", "%d4", "%d5", "%d6", "%d7" + ); +} +#endif + +static inline void fix_bitshift(ShortenContext *s, int32_t *samples) +{ + int i; + + /* Wrapped samples don't get bitshifted, so we'll do them during + the next iteration. */ + if (s->bitshift != 0) { +#if defined(CPU_COLDFIRE) && !defined(SIMULATOR) + coldfire_lshift_samples(s->blocksize, s->bitshift, samples - s->nwrap); +#else + for (i = -s->nwrap; i < (s->blocksize - s->nwrap); i++) + samples[i] <<= s->bitshift; +#endif + } + + /* Also, when we have to remember to fix the wrapped samples when + the bitshift changes.*/ + if (s->bitshift != s->last_bitshift) { + if (s->last_bitshift != 0) + for (i = -s->nwrap; i < 0; i++) + samples[i] <<= s->last_bitshift; + + s->last_bitshift = s->bitshift; + } +} + +static inline void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, + int residual_size, int pred_order) { int sum, i, j; int coeffs[MAX_PRED_ORDER]; @@ -121,18 +171,12 @@ static void decode_subframe_lpc(ShortenContext *s, int32_t *decoded, } } -int shorten_decode_frame(ShortenContext *s, - int32_t *decoded, - int32_t *offset, - uint8_t *buf, - int buf_size) +static inline int shorten_decode_frame(ShortenContext *s, int32_t *decoded, + int32_t *offset) { int i; int32_t sum; - init_get_bits(&s->gb, buf, buf_size*8); - get_bits(&s->gb, s->bitindex); - int cmd = get_ur_golomb_shorten(&s->gb, FNSIZE); switch (cmd) { case FN_ZERO: @@ -201,10 +245,6 @@ int shorten_decode_frame(ShortenContext *s, case FN_QLPC: { int pred_order = get_ur_golomb_shorten(&s->gb, LPCQSIZE); - if (pred_order > MAX_PRED_ORDER) { - return -2; - } - for (i=0; inwrap; i<0; i++) - decoded[i] = decoded[i + s->blocksize]; - - int scale = s->bitshift + SHN_OUTPUT_DEPTH - s->bits_per_sample; - for (i = 0; i < s->blocksize; i++) - decoded[i] <<= scale; + fix_bitshift(s, decoded); break; } @@ -244,29 +279,88 @@ int shorten_decode_frame(ShortenContext *s, i = get_ur_golomb_shorten(&s->gb, VERBATIM_CKSIZE_SIZE); while (i--) get_ur_golomb_shorten(&s->gb, VERBATIM_BYTE_SIZE); - return 4; break; case FN_BITSHIFT: s->bitshift = get_ur_golomb_shorten(&s->gb, BITSHIFTSIZE); - return 3; break; case FN_BLOCKSIZE: s->blocksize = get_uint(s, av_log2(s->blocksize)); - return 2; break; case FN_QUIT: - return 1; break; default: - return -1; + return FN_ERROR; break; } - return 0; + return cmd; +} + +int shorten_decode_frames(ShortenContext *s, int *nsamples, + int32_t *decoded0, int32_t *decoded1, + int32_t *offset0, int32_t *offset1, + uint8_t *buf, int buf_size, + void (*yield)(void)) +{ + int32_t *decoded, *offset; + int cmd; + + *nsamples = 0; + + init_get_bits(&s->gb, buf, buf_size*8); + get_bits(&s->gb, s->bitindex); + + int n = 0; + while (n < NUM_DEC_LOOPS) { + int chan = n%2; + if (chan == 0) { + decoded = decoded0 + s->nwrap + *nsamples; + offset = offset0; + } else { + decoded = decoded1 + s->nwrap + *nsamples; + offset = offset1; + } + + yield(); + + cmd = shorten_decode_frame(s, decoded, offset); + + if (cmd == FN_VERBATIM || cmd == FN_BITSHIFT || cmd == FN_BLOCKSIZE) { + continue; + } else if (cmd == FN_QUIT || cmd == FN_ERROR) { + break; + } + + *nsamples += chan * s->blocksize; + n++; + } + + if (*nsamples) { + /* Wrap the samples for the next loop */ + int i; + for (i = 0; i < s->nwrap; i++) { + decoded0[i] = decoded0[*nsamples + i]; + decoded1[i] = decoded1[*nsamples + i]; + } + + /* Scale the samples for the pcmbuf */ + int scale = SHN_OUTPUT_DEPTH - s->bits_per_sample; +#if defined(CPU_COLDFIRE) && !defined(SIMULATOR) + coldfire_lshift_samples(*nsamples, scale, decoded0 + s->nwrap); + coldfire_lshift_samples(*nsamples, scale, decoded1 + s->nwrap); +#else + for (i = 0; i < *nsamples; i++) { + decoded0[i + s->nwrap] <<= scale; + decoded1[i + s->nwrap] <<= scale; + } +#endif + } + + return cmd; } static int decode_wave_header(ShortenContext *s, diff --git a/apps/codecs/libffmpegFLAC/shndec.h b/apps/codecs/libffmpegFLAC/shndec.h index 6b830dcafa..713a5b56dc 100644 --- a/apps/codecs/libffmpegFLAC/shndec.h +++ b/apps/codecs/libffmpegFLAC/shndec.h @@ -1,11 +1,31 @@ #include "bitstream.h" #define SHN_OUTPUT_DEPTH 28 + +#define MAX_CHANNELS 2 +#define MAX_PRED_ORDER 16 +#define MAX_NWRAP MAX_PRED_ORDER +#define MAX_NMEAN 4 + +/* NUM_DEC_LOOPS should be even number */ +#define NUM_DEC_LOOPS 26 #define DEFAULT_BLOCK_SIZE 256 -#define MAX_FRAMESIZE 1024 -#define MAX_CHANNELS 2 -#define MAX_NWRAP 3 -#define MAX_NMEAN 4 +#define MAX_HEADER_SIZE DEFAULT_BLOCK_SIZE*4 +#define MAX_BUFFER_SIZE 2*DEFAULT_BLOCK_SIZE*NUM_DEC_LOOPS +#define MAX_DECODE_SIZE ((DEFAULT_BLOCK_SIZE*NUM_DEC_LOOPS/2) + MAX_NWRAP) +#define MAX_OFFSET_SIZE MAX_NMEAN + +#define FN_DIFF0 0 +#define FN_DIFF1 1 +#define FN_DIFF2 2 +#define FN_DIFF3 3 +#define FN_QUIT 4 +#define FN_BLOCKSIZE 5 +#define FN_BITSHIFT 6 +#define FN_QLPC 7 +#define FN_ZERO 8 +#define FN_VERBATIM 9 +#define FN_ERROR 10 typedef struct ShortenContext { GetBitContext gb; @@ -17,20 +37,16 @@ typedef struct ShortenContext { int bits_per_sample; int version; int bitshift; + int last_bitshift; int nmean; int nwrap; int blocksize; int bitindex; -/* Not needed... - int bit_rate; - int block_align; - int chunk_size; -*/ } ShortenContext; int shorten_init(ShortenContext* s, uint8_t *buf, int buf_size); -int shorten_decode_frame(ShortenContext *s, - int32_t *decoded, - int32_t *offset, - uint8_t *buf, - int buf_size) ICODE_ATTR; +int shorten_decode_frames(ShortenContext *s, int *nsamples, + int32_t *decoded0, int32_t *decoded1, + int32_t *offset0, int32_t *offset1, + uint8_t *buf, int buf_size, + void (*yield)(void)) ICODE_ATTR; -- cgit v1.2.3