From a8d1cfdec8f62f976ba03713da07b88bd927fce5 Mon Sep 17 00:00:00 2001 From: Dave Hooper Date: Sun, 26 Jul 2009 19:06:36 +0000 Subject: Approx 10% speedup in cook on files tested: Remove some inner loops in favour of memcpy/memset/vect_add calls; remove multiplication from index arithmetic in loops in favour of pointer arithmetic; make use of the MULT31, MULT31_SHIFT15 and CLIP_TO_15 implementations from codelib instead of having their own implementations in cook git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22055 a1c6a512-1295-4272-9138-f99709370657 --- apps/codecs/libcook/cook.c | 79 +++++++++++++++------------ apps/codecs/libcook/cook_fixpoint.h | 104 +++++++++++++----------------------- 2 files changed, 84 insertions(+), 99 deletions(-) (limited to 'apps/codecs') diff --git a/apps/codecs/libcook/cook.c b/apps/codecs/libcook/cook.c index 524f5e1ff8..7ad994926e 100644 --- a/apps/codecs/libcook/cook.c +++ b/apps/codecs/libcook/cook.c @@ -328,13 +328,8 @@ static void categorize(COOKContext *q, int* quant_index_table, --exp_index2[index]; } } - - for(i=0 ; itotal_subbands ; i++) - category[i] = exp_index2[i]; - - for(i=0 ; inumvector_size-1 ; i++) - category_index[i] = tmp_categorize_array[tmp_categorize_array2_idx++]; - + memcpy(category, exp_index2, sizeof(int) * q->total_subbands ); + memcpy(category_index, tmp_categorize_array+tmp_categorize_array2_idx, sizeof(int) * (q->numvector_size-1) ); } @@ -370,27 +365,38 @@ static int unpack_SQVH(COOKContext *q, int category, int* subband_coef_index, vd = vd_tab[category]; result = 0; - for(i=0 ; igb, q->sqvh[category].table, q->sqvh[category].bits, 3); - if (q->bits_per_subpacket < get_bits_count(&q->gb)){ + if (q->bits_per_subpacket < get_bits_count(&q->gb)) + { vlc = 0; result = 1; + memset(subband_coef_index, 0, sizeof(int)*vd); + memset(subband_coef_sign, 0, sizeof(int)*vd); + subband_coef_index+=vd; + subband_coef_sign+=vd; } - for(j=vd-1 ; j>=0 ; j--){ - tmp = (vlc * invradix_tab[category])/0x100000; - subband_coef_index[vd*i+j] = vlc - tmp * (kmax_tab[category]+1); - vlc = tmp; - } - for(j=0 ; jgb) < q->bits_per_subpacket){ - subband_coef_sign[i*vd+j] = get_bits1(&q->gb); + else + { + for(j=vd-1 ; j>=0 ; j--){ + tmp = (vlc * invradix_tab[category])/0x100000; + subband_coef_index[j] = vlc - tmp * (kmax_tab[category]+1); + vlc = tmp; + } + + for(j=0 ; jgb) < q->bits_per_subpacket) { + *subband_coef_sign++ = get_bits1(&q->gb); + } else { + result=1; + *subband_coef_sign++=0; + } } else { - result=1; - subband_coef_sign[i*vd+j]=0; + *subband_coef_sign++=0; } - } else { - subband_coef_sign[i*vd+j]=0; } } } @@ -505,7 +511,7 @@ static void decouple_info(COOKContext *q, int* decouple_tab){ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1, REAL_T* mlt_buffer2) { - int i,j; + int i; int decouple_tab[SUBBAND_SIZE]; REAL_T *decode_buffer = q->decode_buffer_0; int idx; @@ -520,11 +526,14 @@ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1, mono_decode(q, decode_buffer); /* The two channels are stored interleaved in decode_buffer. */ - for (i=0 ; ijs_subband_start ; i++) { - for (j=0 ; jjs_subband_start*SUBBAND_SIZE); + while(mlt_buffer1 < mlt_buffer1_end) + { + memcpy(mlt_buffer1,decode_buffer,sizeof(REAL_T)*SUBBAND_SIZE); + memcpy(mlt_buffer2,decode_buffer+20,sizeof(REAL_T)*SUBBAND_SIZE); + mlt_buffer1 += 20; + mlt_buffer2 += 20; + decode_buffer += 40; } /* When we reach js_subband_start (the higher frequencies) @@ -533,11 +542,15 @@ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1, for (i=q->js_subband_start ; isubbands ; i++) { int i1 = decouple_tab[cplband[i]]; int i2 = idx - i1 - 1; - for (j=0 ; jjs_subband_start + i)*20)+j]; - mlt_buffer1[20*i+j] = cplscale_math(x, q->js_vlc_bits, i1); - mlt_buffer2[20*i+j] = cplscale_math(x, q->js_vlc_bits, i2); + mlt_buffer1_end = mlt_buffer1 + SUBBAND_SIZE; + while(mlt_buffer1 < mlt_buffer1_end) + { + *mlt_buffer1++ = cplscale_math(*decode_buffer, q->js_vlc_bits, i1); + *mlt_buffer2++ = cplscale_math(*decode_buffer++, q->js_vlc_bits, i2); } + mlt_buffer1 += (20-SUBBAND_SIZE); + mlt_buffer2 += (20-SUBBAND_SIZE); + decode_buffer += (20-SUBBAND_SIZE); } } @@ -581,7 +594,7 @@ decode_bytes_and_gain(COOKContext *q, const uint8_t *inbuffer, * @param chan 0: left or single channel, 1: right channel */ -static inline void +static void mlt_compensate_output(COOKContext *q, REAL_T *decode_buffer, cook_gains *gains, REAL_T *previous_buffer, int16_t *out, int chan) diff --git a/apps/codecs/libcook/cook_fixpoint.h b/apps/codecs/libcook/cook_fixpoint.h index 32d8a81cc2..f92d717f20 100644 --- a/apps/codecs/libcook/cook_fixpoint.h +++ b/apps/codecs/libcook/cook_fixpoint.h @@ -35,8 +35,13 @@ * in C using two 32 bit integer multiplications. */ +/* get definitions of MULT31, MULT31_SHIFT15, CLIP_TO_15, vect_add, from codelib */ +#include "asm_arm.h" +#include "asm_mcf5249.h" +#include "codeclib_misc.h" + /* The following table is taken from libavutil/mathematics.c */ -const uint8_t ff_log2_tab[256]={ +const uint8_t ff_log2_tab[256] ={ 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, @@ -67,6 +72,11 @@ static inline FIXP fixp_pow2(FIXP x, int i) return x << i; /* no check for overflow */ } +static inline FIXP fixp_pow2_neg(FIXP x, int i) +{ + return (x >> i) + ((x >> (i-1)) & 1); +} + /** * Fixed point multiply by fraction. * @@ -74,53 +84,10 @@ static inline FIXP fixp_pow2(FIXP x, int i) * @param b fix point fraction, 0 <= b < 1 */ -static inline FIXP fixp_mult_su(FIXP a, FIXPU b) -{ - - int32_t hb = (a >> 16) * b; - uint32_t lb = (a & 0xffff) * b; - - return hb + (lb >> 16) + ((lb & 0x8000) >> 15); -} +#define fixp_mult_su(x,y) (MULT31_SHIFT15(x,y)) /* Faster version of the above using 32x32=64 bit multiply */ -#ifdef CPU_ARM -#define fixmul31(x, y) \ - ({ int32_t __hi; \ - uint32_t __lo; \ - int32_t __result; \ - asm ("smull %0, %1, %3, %4\n\t" \ - "movs %2, %1, lsl #1" \ - : "=&r" (__lo), "=&r" (__hi), "=r" (__result) \ - : "%r" (x), "r" (y) \ - : "cc"); \ - __result; \ - }) - -#elif defined(CPU_COLDFIRE) -static inline int32_t fixmul31(int32_t x, int32_t y) -{ - asm ( - "mac.l %[x], %[y], %%acc0 \n" /* multiply */ - "movclr.l %%acc0, %[x] \n" /* get higher half */ - : [x] "+d" (x) - : [y] "d" (y) - ); - return x; -} -#else -static inline int32_t fixmul31(int32_t x, int32_t y) -{ - int64_t temp; - - temp = x; - temp *= y; - - temp >>= 31; //16+31-16 = 31 bits - - return (int32_t)temp; -} -#endif +#define fixmul31(x,y) (MULT31(x,y)) /* math functions taken from libavutil/common.h */ @@ -169,13 +136,13 @@ static void scalar_dequant_math(COOKContext *q, int index, int* subband_coef_sign, REAL_T *mlt_p) { /* Num. half bits to right shift */ - const int s = 33 - quant_index + av_log2(q->samples_per_channel); + const int s = (33 - quant_index + av_log2(q->samples_per_channel)) >> 1; const FIXP *table = quant_tables[s & 1][index]; FIXP f; int i; - if(s >= 64) + if(s >= 32) memset(mlt_p, 0, sizeof(REAL_T)*SUBBAND_SIZE); else { @@ -186,7 +153,7 @@ static void scalar_dequant_math(COOKContext *q, int index, ((subband_coef_index[i] != 0) && subband_coef_sign[i])) f = -f; - mlt_p[i] =fixp_pow2(f, -(s/2)); + *mlt_p++ = fixp_pow2_neg(f, s); } } } @@ -274,10 +241,9 @@ static inline void imlt_math(COOKContext *q, FIXP *in) static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[]) { int i; - if(LIKELY(gain == 0)){ - for(i=0 ; isamples_per_channel ; i++) { - q->mono_mdct_output[i] += buffer[i]; - } + if(LIKELY(gain == 0)) + { + vect_add(q->mono_mdct_output, buffer, q->samples_per_channel); } else if (gain > 0){ for(i=0 ; isamples_per_channel ; i++) { @@ -301,7 +267,7 @@ static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[]) * @param gain_index_next index for the next block multiplier */ static inline void -interpolate_math(COOKContext *q, FIXP* buffer, +interpolate_math(COOKContext *q, register FIXP* buffer, int gain_index, int gain_index_next) { int i; @@ -315,14 +281,17 @@ interpolate_math(COOKContext *q, FIXP* buffer, int step = (gain_index_next - gain_index) << (7 - av_log2(gain_size_factor)); int x = 0; - - for(i = 0; i < gain_size_factor; i++) { - buffer[i] = fixp_mult_su(buffer[i], pow128_tab[x]); - buffer[i] = fixp_pow2(buffer[i], gain_index+1); + register FIXP* bufferend = buffer+gain_size_factor; + while(buffer < bufferend ) + { + *buffer = fixp_pow2( + fixp_mult_su(*buffer, pow128_tab[x]), + gain_index+1); + buffer++; x += step; - gain_index += (x + 128) / 128 - 1; - x = (x + 128) % 128; + gain_index += ( (x + 128) >> 7 ) - 1; + x = ( (x + 128) & 127 ); } } } @@ -349,12 +318,15 @@ static inline FIXP cplscale_math(FIXP x, int table, int i) * @param out pointer to the output buffer * @param chan 0: left or single channel, 1: right channel */ -static inline void output_math(COOKContext *q, int16_t *out, int chan) +static inline void output_math(COOKContext *q, register int16_t *out, int chan) { - int j; - - for (j = 0; j < q->samples_per_channel; j++) { - out[chan + q->nb_channels * j] = - av_clip(fixp_pow2(q->mono_mdct_output[j], -11), -32768, 32767); + register REAL_T * mono_output_ptr = q->mono_mdct_output; + register REAL_T * mono_output_end = mono_output_ptr + q->samples_per_channel; + out += chan; + const int STEP = q->nb_channels; + while( mono_output_ptr < mono_output_end ) + { + *out = CLIP_TO_15(fixp_pow2_neg(*mono_output_ptr++, 11)); + out += STEP; } } -- cgit v1.2.3