Approx 10% speedup in cook on files tested: Remove some inner loops in favour of memcpy/memset/vect_add calls; remove multiplication from index arithmetic in loops in favour of pointer arithmetic; make use of the MULT31, MULT31_SHIFT15 and CLIP_TO_15 implementations from codeclib instead of cook having its own implementations

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22055 a1c6a512-1295-4272-9138-f99709370657
author: Dave Hooper <dave@beermex.com> 2009-07-26 19:06:36 +0000
committer: Dave Hooper <dave@beermex.com> 2009-07-26 19:06:36 +0000
commit: a8d1cfdec8f62f976ba03713da07b88bd927fce5 (patch)
tree: d9a5177cbab89abdd1e9ae4c0e6820a9187f36c0
parent: cece75eb42ca7e294fb423ff64c0d664cb374ec6 (diff)
download: rockbox-a8d1cfdec8f62f976ba03713da07b88bd927fce5.tar.gz
rockbox-a8d1cfdec8f62f976ba03713da07b88bd927fce5.zip
2 files changed, 84 insertions, 99 deletions
diff --git a/apps/codecs/libcook/cook.c b/apps/codecs/libcook/cook.c
index 524f5e1ff8..7ad994926e 100644
--- a/apps/codecs/libcook/cook.c
+++ b/apps/codecs/libcook/cook.c
@@ -328,13 +328,8 @@ static void categorize(COOKContext *q, int* quant_index_table,
            --exp_index2[index];
        }
    }
+    memcpy(category, exp_index2, sizeof(int) * q->total_subbands );
-    for(i=0 ; i<q->total_subbands ; i++)
+    memcpy(category_index, tmp_categorize_array+tmp_categorize_array2_idx, sizeof(int) * (q->numvector_size-1) );
-        category[i] = exp_index2[i];
-    for(i=0 ; i<q->numvector_size-1 ; i++)
-        category_index[i] = tmp_categorize_array[tmp_categorize_array2_idx++];
 }
@@ -370,27 +365,38 @@ static int unpack_SQVH(COOKContext *q, int category, int* subband_coef_index,
    vd = vd_tab[category];
    result = 0;
-    for(i=0 ; i<vpr_tab[category] ; i++){
+    for(i=0 ; i<vpr_tab[category] ; i++)
+    {
        vlc = get_vlc2(&q->gb, q->sqvh[category].table, q->sqvh[category].bits, 3);
-        if (q->bits_per_subpacket < get_bits_count(&q->gb)){
+        if (q->bits_per_subpacket < get_bits_count(&q->gb))
+        {
            vlc = 0;
            result = 1;
+            memset(subband_coef_index, 0, sizeof(int)*vd);
+            memset(subband_coef_sign, 0, sizeof(int)*vd);
+            subband_coef_index+=vd;
+            subband_coef_sign+=vd;
        }
-        for(j=vd-1 ; j>=0 ; j--){
+        else
-            tmp = (vlc * invradix_tab[category])/0x100000;
+        {
-            subband_coef_index[vd*i+j] = vlc - tmp * (kmax_tab[category]+1);
+            for(j=vd-1 ; j>=0 ; j--){
-            vlc = tmp;
+                tmp = (vlc * invradix_tab[category])/0x100000;
-        }
+                subband_coef_index[j] = vlc - tmp * (kmax_tab[category]+1);
-        for(j=0 ; j<vd ; j++){
+                vlc = tmp;
-            if (subband_coef_index[i*vd + j]) {
+            }
-                if(get_bits_count(&q->gb) < q->bits_per_subpacket){
-                    subband_coef_sign[i*vd+j] = get_bits1(&q->gb);
+            for(j=0 ; j<vd ; j++)
+            {
+                if (*subband_coef_index++) {
+                    if(get_bits_count(&q->gb) < q->bits_per_subpacket) {
+                        *subband_coef_sign++ = get_bits1(&q->gb);
+                    } else {
+                        result=1;
+                        *subband_coef_sign++=0;
+                    }
                } else {
-                    result=1;
+                    *subband_coef_sign++=0;
-                    subband_coef_sign[i*vd+j]=0;
                }
-            } else {
-                subband_coef_sign[i*vd+j]=0;
            }
        }
    }
@@ -505,7 +511,7 @@ static void decouple_info(COOKContext *q, int* decouple_tab){
 static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,
                         REAL_T* mlt_buffer2) {
-    int i,j;
+    int i;
    int decouple_tab[SUBBAND_SIZE];
    REAL_T *decode_buffer = q->decode_buffer_0;
    int idx;
@@ -520,11 +526,14 @@ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,
    mono_decode(q, decode_buffer);
    /* The two channels are stored interleaved in decode_buffer. */
-    for (i=0 ; i<q->js_subband_start ; i++) {
+    REAL_T * mlt_buffer1_end = mlt_buffer1 + (q->js_subband_start*SUBBAND_SIZE);
-        for (j=0 ; j<SUBBAND_SIZE ; j++) {
+    while(mlt_buffer1 < mlt_buffer1_end)
-            mlt_buffer1[i*20+j] = decode_buffer[i*40+j];
+    {
-            mlt_buffer2[i*20+j] = decode_buffer[i*40+20+j];
+        memcpy(mlt_buffer1,decode_buffer,sizeof(REAL_T)*SUBBAND_SIZE);
-        }
+        memcpy(mlt_buffer2,decode_buffer+20,sizeof(REAL_T)*SUBBAND_SIZE);
+        mlt_buffer1 += 20;
+        mlt_buffer2 += 20;
+        decode_buffer += 40;
    }
    /* When we reach js_subband_start (the higher frequencies)
@@ -533,11 +542,15 @@ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,
    for (i=q->js_subband_start ; i<q->subbands ; i++) {
        int i1 = decouple_tab[cplband[i]];
        int i2 = idx - i1 - 1;
-        for (j=0 ; j<SUBBAND_SIZE ; j++) {
+        mlt_buffer1_end = mlt_buffer1 + SUBBAND_SIZE;
-            REAL_T x = decode_buffer[((q->js_subband_start + i)*20)+j];
+        while(mlt_buffer1 < mlt_buffer1_end)
-            mlt_buffer1[20*i+j] = cplscale_math(x, q->js_vlc_bits, i1);
+        {
-            mlt_buffer2[20*i+j] = cplscale_math(x, q->js_vlc_bits, i2);
+            *mlt_buffer1++ = cplscale_math(*decode_buffer, q->js_vlc_bits, i1);
+            *mlt_buffer2++ = cplscale_math(*decode_buffer++, q->js_vlc_bits, i2);
        }
+        mlt_buffer1 += (20-SUBBAND_SIZE);
+        mlt_buffer2 += (20-SUBBAND_SIZE);
+        decode_buffer += (20-SUBBAND_SIZE);
    }
 }
@@ -581,7 +594,7 @@ decode_bytes_and_gain(COOKContext *q, const uint8_t *inbuffer,
 * @param chan              0: left or single channel, 1: right channel
 */
-static inline void
+static void
 mlt_compensate_output(COOKContext *q, REAL_T *decode_buffer,
                      cook_gains *gains, REAL_T *previous_buffer,
                      int16_t *out, int chan)
diff --git a/apps/codecs/libcook/cook_fixpoint.h b/apps/codecs/libcook/cook_fixpoint.h
index 32d8a81cc2..f92d717f20 100644
--- a/apps/codecs/libcook/cook_fixpoint.h
+++ b/apps/codecs/libcook/cook_fixpoint.h
@@ -35,8 +35,13 @@
 *    in C using two 32 bit integer multiplications.
 */
+/* get definitions of MULT31, MULT31_SHIFT15, CLIP_TO_15, vect_add, from codelib */
+#include "asm_arm.h"
+#include "asm_mcf5249.h"
+#include "codeclib_misc.h"
 /* The following table is taken from libavutil/mathematics.c */
-const uint8_t ff_log2_tab[256]={
+const uint8_t ff_log2_tab[256] ={
        0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
        5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -67,6 +72,11 @@ static inline FIXP fixp_pow2(FIXP x, int i)
    return x << i;              /* no check for overflow */
 }
+static inline FIXP fixp_pow2_neg(FIXP x, int i)
+{
+  return (x >> i) + ((x >> (i-1)) & 1);
+}
 /**
 * Fixed point multiply by fraction.
 *
@@ -74,53 +84,10 @@ static inline FIXP fixp_pow2(FIXP x, int i)
 * @param b                     fix point fraction, 0 <= b < 1
 */
-static inline FIXP fixp_mult_su(FIXP a, FIXPU b)
+#define fixp_mult_su(x,y) (MULT31_SHIFT15(x,y))
-{
- 
-    int32_t hb = (a >> 16) * b;
-    uint32_t lb = (a & 0xffff) * b;
-    return hb + (lb >> 16) + ((lb & 0x8000) >> 15);
-}
 /* Faster version of the above using 32x32=64 bit multiply */
-#ifdef CPU_ARM
+#define fixmul31(x,y) (MULT31(x,y))
-#define fixmul31(x, y)  \
-    ({ int32_t __hi;  \
-       uint32_t __lo;  \
-       int32_t __result;  \
-       asm ("smull   %0, %1, %3, %4\n\t"  \
-            "movs    %2, %1, lsl #1"  \
-            : "=&r" (__lo), "=&r" (__hi), "=r" (__result)  \
-            : "%r" (x), "r" (y)  \
-            : "cc");  \
-       __result;  \
-    })
-#elif defined(CPU_COLDFIRE)
-static inline int32_t fixmul31(int32_t x, int32_t y)
-{
-    asm (
-        "mac.l   %[x], %[y], %%acc0  \n" /* multiply */
-        "movclr.l %%acc0, %[x]  \n"     /* get higher half */
-        : [x] "+d" (x)
-        : [y] "d"  (y)
-    );
-    return x;
-}
-#else
-static inline int32_t fixmul31(int32_t x, int32_t y)
-{
-    int64_t temp;
-    temp = x;
-    temp *= y;
-    temp >>= 31;        //16+31-16 = 31 bits
-    return (int32_t)temp;
-}
-#endif
 /* math functions taken from libavutil/common.h */
@@ -169,13 +136,13 @@ static void scalar_dequant_math(COOKContext *q, int index,
                                int* subband_coef_sign, REAL_T *mlt_p)
 {
    /* Num. half bits to right shift */
-    const int s = 33 - quant_index + av_log2(q->samples_per_channel);
+    const int s = (33 - quant_index + av_log2(q->samples_per_channel)) >> 1;
    const FIXP *table = quant_tables[s & 1][index];
    FIXP f;
    int i;
-    if(s >= 64)
+    if(s >= 32)
        memset(mlt_p, 0, sizeof(REAL_T)*SUBBAND_SIZE);
    else 
    {
@@ -186,7 +153,7 @@ static void scalar_dequant_math(COOKContext *q, int index,
                ((subband_coef_index[i] != 0) && subband_coef_sign[i]))
                f = -f;
-            mlt_p[i] =fixp_pow2(f, -(s/2));
+            *mlt_p++ = fixp_pow2_neg(f, s);
        }
    }
 }
@@ -274,10 +241,9 @@ static inline void imlt_math(COOKContext *q, FIXP *in)
 static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[])
 {
    int i;
-    if(LIKELY(gain == 0)){
+    if(LIKELY(gain == 0))
-        for(i=0 ; i<q->samples_per_channel ; i++) {
+    {
-            q->mono_mdct_output[i] += buffer[i];
+        vect_add(q->mono_mdct_output, buffer, q->samples_per_channel);
-        }        
        
    } else if (gain > 0){
        for(i=0 ; i<q->samples_per_channel ; i++) {
@@ -301,7 +267,7 @@ static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[])
 * @param gain_index_next   index for the next block multiplier
 */
 static inline void
-interpolate_math(COOKContext *q, FIXP* buffer,
+interpolate_math(COOKContext *q, register FIXP* buffer,
                 int gain_index, int gain_index_next)
 {
    int i;
@@ -315,14 +281,17 @@ interpolate_math(COOKContext *q, FIXP* buffer,
        int step = (gain_index_next - gain_index)
                   << (7 - av_log2(gain_size_factor));
        int x = 0;
+        register FIXP* bufferend = buffer+gain_size_factor;
-        for(i = 0; i < gain_size_factor; i++) {
+        while(buffer < bufferend )
-            buffer[i] = fixp_mult_su(buffer[i], pow128_tab[x]);
+        {
-            buffer[i] = fixp_pow2(buffer[i], gain_index+1);
+            *buffer = fixp_pow2(
+                          fixp_mult_su(*buffer, pow128_tab[x]),
+                          gain_index+1);
+            buffer++;
            x += step;
-            gain_index += (x + 128) / 128 - 1;
+            gain_index += ( (x + 128) >> 7 ) - 1;
-            x = (x + 128) % 128;
+            x = ( (x + 128) & 127 );
        }
    }
 }
@@ -349,12 +318,15 @@ static inline FIXP cplscale_math(FIXP x, int table, int i)
 * @param out               pointer to the output buffer
 * @param chan              0: left or single channel, 1: right channel
 */
-static inline void output_math(COOKContext *q, int16_t *out, int chan)
+static inline void output_math(COOKContext *q, register int16_t *out, int chan)
 {
-    int j;
+    register REAL_T * mono_output_ptr = q->mono_mdct_output;
+    register REAL_T * mono_output_end = mono_output_ptr + q->samples_per_channel;
-    for (j = 0; j < q->samples_per_channel; j++) {
+    out += chan;
-        out[chan + q->nb_channels * j] =
+    const int STEP = q->nb_channels;
-          av_clip(fixp_pow2(q->mono_mdct_output[j], -11), -32768, 32767);
+    while( mono_output_ptr < mono_output_end )
+    {
+      *out = CLIP_TO_15(fixp_pow2_neg(*mono_output_ptr++, 11));
+      out += STEP;
    }
 }
author	Dave Hooper <dave@beermex.com>	2009-07-26 19:06:36 +0000
committer	Dave Hooper <dave@beermex.com>	2009-07-26 19:06:36 +0000
commit	a8d1cfdec8f62f976ba03713da07b88bd927fce5 (patch)
tree	d9a5177cbab89abdd1e9ae4c0e6820a9187f36c0
parent	cece75eb42ca7e294fb423ff64c0d664cb374ec6 (diff)
download	rockbox-a8d1cfdec8f62f976ba03713da07b88bd927fce5.tar.gz rockbox-a8d1cfdec8f62f976ba03713da07b88bd927fce5.zip

diff --git a/apps/codecs/libcook/cook.c b/apps/codecs/libcook/cook.c index 524f5e1ff8..7ad994926e 100644 --- a/apps/codecs/libcook/cook.c +++ b/apps/codecs/libcook/cook.c
@@ -328,13 +328,8 @@ static void categorize(COOKContext *q, int* quant_index_table,
328	--exp_index2[index];	328	--exp_index2[index];
329	}	329	}
330	}	330	}
331		331	memcpy(category, exp_index2, sizeof(int) * q->total_subbands );
332	for(i=0 ; i<q->total_subbands ; i++)	332	memcpy(category_index, tmp_categorize_array+tmp_categorize_array2_idx, sizeof(int) * (q->numvector_size-1) );
333	category[i] = exp_index2[i];
334
335	for(i=0 ; i<q->numvector_size-1 ; i++)
336	category_index[i] = tmp_categorize_array[tmp_categorize_array2_idx++];
337
338	}	333	}
339		334
340		335
@@ -370,27 +365,38 @@ static int unpack_SQVH(COOKContext *q, int category, int* subband_coef_index,
370		365
371	vd = vd_tab[category];	366	vd = vd_tab[category];
372	result = 0;	367	result = 0;
373	for(i=0 ; i<vpr_tab[category] ; i++){	368	for(i=0 ; i<vpr_tab[category] ; i++)
		369	{
374	vlc = get_vlc2(&q->gb, q->sqvh[category].table, q->sqvh[category].bits, 3);	370	vlc = get_vlc2(&q->gb, q->sqvh[category].table, q->sqvh[category].bits, 3);
375	if (q->bits_per_subpacket < get_bits_count(&q->gb)){	371	if (q->bits_per_subpacket < get_bits_count(&q->gb))
		372	{
376	vlc = 0;	373	vlc = 0;
377	result = 1;	374	result = 1;
		375	memset(subband_coef_index, 0, sizeof(int)*vd);
		376	memset(subband_coef_sign, 0, sizeof(int)*vd);
		377	subband_coef_index+=vd;
		378	subband_coef_sign+=vd;
378	}	379	}
379	for(j=vd-1 ; j>=0 ; j--){	380	else
380	tmp = (vlc * invradix_tab[category])/0x100000;	381	{
381	subband_coef_index[vd*i+j] = vlc - tmp * (kmax_tab[category]+1);	382	for(j=vd-1 ; j>=0 ; j--){
382	vlc = tmp;	383	tmp = (vlc * invradix_tab[category])/0x100000;
383	}	384	subband_coef_index[j] = vlc - tmp * (kmax_tab[category]+1);
384	for(j=0 ; j<vd ; j++){	385	vlc = tmp;
385	if (subband_coef_index[i*vd + j]) {	386	}
386	if(get_bits_count(&q->gb) < q->bits_per_subpacket){	387
387	subband_coef_sign[i*vd+j] = get_bits1(&q->gb);	388	for(j=0 ; j<vd ; j++)
		389	{
		390	if (*subband_coef_index++) {
		391	if(get_bits_count(&q->gb) < q->bits_per_subpacket) {
		392	*subband_coef_sign++ = get_bits1(&q->gb);
		393	} else {
		394	result=1;
		395	*subband_coef_sign++=0;
		396	}
388	} else {	397	} else {
389	result=1;	398	*subband_coef_sign++=0;
390	subband_coef_sign[i*vd+j]=0;
391	}	399	}
392	} else {
393	subband_coef_sign[i*vd+j]=0;
394	}	400	}
395	}	401	}
396	}	402	}
@@ -505,7 +511,7 @@ static void decouple_info(COOKContext *q, int* decouple_tab){
505		511
506	static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,	512	static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,
507	REAL_T* mlt_buffer2) {	513	REAL_T* mlt_buffer2) {
508	int i,j;	514	int i;
509	int decouple_tab[SUBBAND_SIZE];	515	int decouple_tab[SUBBAND_SIZE];
510	REAL_T *decode_buffer = q->decode_buffer_0;	516	REAL_T *decode_buffer = q->decode_buffer_0;
511	int idx;	517	int idx;
@@ -520,11 +526,14 @@ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,
520	mono_decode(q, decode_buffer);	526	mono_decode(q, decode_buffer);
521		527
522	/* The two channels are stored interleaved in decode_buffer. */	528	/* The two channels are stored interleaved in decode_buffer. */
523	for (i=0 ; i<q->js_subband_start ; i++) {	529	REAL_T * mlt_buffer1_end = mlt_buffer1 + (q->js_subband_start*SUBBAND_SIZE);
524	for (j=0 ; j<SUBBAND_SIZE ; j++) {	530	while(mlt_buffer1 < mlt_buffer1_end)
525	mlt_buffer1[i*20+j] = decode_buffer[i*40+j];	531	{
526	mlt_buffer2[i*20+j] = decode_buffer[i*40+20+j];	532	memcpy(mlt_buffer1,decode_buffer,sizeof(REAL_T)*SUBBAND_SIZE);
527	}	533	memcpy(mlt_buffer2,decode_buffer+20,sizeof(REAL_T)*SUBBAND_SIZE);
		534	mlt_buffer1 += 20;
		535	mlt_buffer2 += 20;
		536	decode_buffer += 40;
528	}	537	}
529		538
530	/* When we reach js_subband_start (the higher frequencies)	539	/* When we reach js_subband_start (the higher frequencies)
@@ -533,11 +542,15 @@ static void joint_decode(COOKContext *q, REAL_T* mlt_buffer1,
533	for (i=q->js_subband_start ; i<q->subbands ; i++) {	542	for (i=q->js_subband_start ; i<q->subbands ; i++) {
534	int i1 = decouple_tab[cplband[i]];	543	int i1 = decouple_tab[cplband[i]];
535	int i2 = idx - i1 - 1;	544	int i2 = idx - i1 - 1;
536	for (j=0 ; j<SUBBAND_SIZE ; j++) {	545	mlt_buffer1_end = mlt_buffer1 + SUBBAND_SIZE;
537	REAL_T x = decode_buffer[((q->js_subband_start + i)*20)+j];	546	while(mlt_buffer1 < mlt_buffer1_end)
538	mlt_buffer1[20*i+j] = cplscale_math(x, q->js_vlc_bits, i1);	547	{
539	mlt_buffer2[20*i+j] = cplscale_math(x, q->js_vlc_bits, i2);	548	*mlt_buffer1++ = cplscale_math(*decode_buffer, q->js_vlc_bits, i1);
		549	*mlt_buffer2++ = cplscale_math(*decode_buffer++, q->js_vlc_bits, i2);
540	}	550	}
		551	mlt_buffer1 += (20-SUBBAND_SIZE);
		552	mlt_buffer2 += (20-SUBBAND_SIZE);
		553	decode_buffer += (20-SUBBAND_SIZE);
541	}	554	}
542	}	555	}
543		556
@@ -581,7 +594,7 @@ decode_bytes_and_gain(COOKContext *q, const uint8_t *inbuffer,
581	* @param chan 0: left or single channel, 1: right channel	594	* @param chan 0: left or single channel, 1: right channel
582	*/	595	*/
583		596
584	static inline void	597	static void
585	mlt_compensate_output(COOKContext *q, REAL_T *decode_buffer,	598	mlt_compensate_output(COOKContext *q, REAL_T *decode_buffer,
586	cook_gains *gains, REAL_T *previous_buffer,	599	cook_gains *gains, REAL_T *previous_buffer,
587	int16_t *out, int chan)	600	int16_t *out, int chan)


diff --git a/apps/codecs/libcook/cook_fixpoint.h b/apps/codecs/libcook/cook_fixpoint.h index 32d8a81cc2..f92d717f20 100644 --- a/apps/codecs/libcook/cook_fixpoint.h +++ b/apps/codecs/libcook/cook_fixpoint.h
@@ -35,8 +35,13 @@
35	* in C using two 32 bit integer multiplications.	35	* in C using two 32 bit integer multiplications.
36	*/	36	*/
37		37
		38	/* get definitions of MULT31, MULT31_SHIFT15, CLIP_TO_15, vect_add, from codelib */
		39	#include "asm_arm.h"
		40	#include "asm_mcf5249.h"
		41	#include "codeclib_misc.h"
		42
38	/* The following table is taken from libavutil/mathematics.c */	43	/* The following table is taken from libavutil/mathematics.c */
39	const uint8_t ff_log2_tab[256]={	44	const uint8_t ff_log2_tab[256] ={
40	0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,	45	0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
41	5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,	46	5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
42	6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,	47	6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -67,6 +72,11 @@ static inline FIXP fixp_pow2(FIXP x, int i)
67	return x << i; /* no check for overflow */	72	return x << i; /* no check for overflow */
68	}	73	}
69		74
		75	static inline FIXP fixp_pow2_neg(FIXP x, int i)
		76	{
		77	return (x >> i) + ((x >> (i-1)) & 1);
		78	}
		79
70	/**	80	/**
71	* Fixed point multiply by fraction.	81	* Fixed point multiply by fraction.
72	*	82	*
@@ -74,53 +84,10 @@ static inline FIXP fixp_pow2(FIXP x, int i)
74	* @param b fix point fraction, 0 <= b < 1	84	* @param b fix point fraction, 0 <= b < 1
75	*/	85	*/
76		86
77	static inline FIXP fixp_mult_su(FIXP a, FIXPU b)	87	#define fixp_mult_su(x,y) (MULT31_SHIFT15(x,y))
78	{
79
80	int32_t hb = (a >> 16) * b;
81	uint32_t lb = (a & 0xffff) * b;
82
83	return hb + (lb >> 16) + ((lb & 0x8000) >> 15);
84	}
85		88
86	/* Faster version of the above using 32x32=64 bit multiply */	89	/* Faster version of the above using 32x32=64 bit multiply */
87	#ifdef CPU_ARM	90	#define fixmul31(x,y) (MULT31(x,y))
88	#define fixmul31(x, y) \
89	({ int32_t __hi; \
90	uint32_t __lo; \
91	int32_t __result; \
92	asm ("smull %0, %1, %3, %4\n\t" \
93	"movs %2, %1, lsl #1" \
94	: "=&r" (__lo), "=&r" (__hi), "=r" (__result) \
95	: "%r" (x), "r" (y) \
96	: "cc"); \
97	__result; \
98	})
99
100	#elif defined(CPU_COLDFIRE)
101	static inline int32_t fixmul31(int32_t x, int32_t y)
102	{
103	asm (
104	"mac.l %[x], %[y], %%acc0 \n" /* multiply */
105	"movclr.l %%acc0, %[x] \n" /* get higher half */
106	: [x] "+d" (x)
107	: [y] "d" (y)
108	);
109	return x;
110	}
111	#else
112	static inline int32_t fixmul31(int32_t x, int32_t y)
113	{
114	int64_t temp;
115
116	temp = x;
117	temp *= y;
118
119	temp >>= 31; //16+31-16 = 31 bits
120
121	return (int32_t)temp;
122	}
123	#endif
124		91
125	/* math functions taken from libavutil/common.h */	92	/* math functions taken from libavutil/common.h */
126		93
@@ -169,13 +136,13 @@ static void scalar_dequant_math(COOKContext *q, int index,
169	int* subband_coef_sign, REAL_T *mlt_p)	136	int* subband_coef_sign, REAL_T *mlt_p)
170	{	137	{
171	/* Num. half bits to right shift */	138	/* Num. half bits to right shift */
172	const int s = 33 - quant_index + av_log2(q->samples_per_channel);	139	const int s = (33 - quant_index + av_log2(q->samples_per_channel)) >> 1;
173	const FIXP *table = quant_tables[s & 1][index];	140	const FIXP *table = quant_tables[s & 1][index];
174	FIXP f;	141	FIXP f;
175	int i;	142	int i;
176		143
177		144
178	if(s >= 64)	145	if(s >= 32)
179	memset(mlt_p, 0, sizeof(REAL_T)*SUBBAND_SIZE);	146	memset(mlt_p, 0, sizeof(REAL_T)*SUBBAND_SIZE);
180	else	147	else
181	{	148	{
@@ -186,7 +153,7 @@ static void scalar_dequant_math(COOKContext *q, int index,
186	((subband_coef_index[i] != 0) && subband_coef_sign[i]))	153	((subband_coef_index[i] != 0) && subband_coef_sign[i]))
187	f = -f;	154	f = -f;
188		155
189	mlt_p[i] =fixp_pow2(f, -(s/2));	156	*mlt_p++ = fixp_pow2_neg(f, s);
190	}	157	}
191	}	158	}
192	}	159	}
@@ -274,10 +241,9 @@ static inline void imlt_math(COOKContext *q, FIXP *in)
274	static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[])	241	static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[])
275	{	242	{
276	int i;	243	int i;
277	if(LIKELY(gain == 0)){	244	if(LIKELY(gain == 0))
278	for(i=0 ; i<q->samples_per_channel ; i++) {	245	{
279	q->mono_mdct_output[i] += buffer[i];	246	vect_add(q->mono_mdct_output, buffer, q->samples_per_channel);
280	}
281		247
282	} else if (gain > 0){	248	} else if (gain > 0){
283	for(i=0 ; i<q->samples_per_channel ; i++) {	249	for(i=0 ; i<q->samples_per_channel ; i++) {
@@ -301,7 +267,7 @@ static inline void overlap_math(COOKContext *q, int gain, FIXP buffer[])
301	* @param gain_index_next index for the next block multiplier	267	* @param gain_index_next index for the next block multiplier
302	*/	268	*/
303	static inline void	269	static inline void
304	interpolate_math(COOKContext *q, FIXP* buffer,	270	interpolate_math(COOKContext *q, register FIXP* buffer,
305	int gain_index, int gain_index_next)	271	int gain_index, int gain_index_next)
306	{	272	{
307	int i;	273	int i;
@@ -315,14 +281,17 @@ interpolate_math(COOKContext *q, FIXP* buffer,
315	int step = (gain_index_next - gain_index)	281	int step = (gain_index_next - gain_index)
316	<< (7 - av_log2(gain_size_factor));	282	<< (7 - av_log2(gain_size_factor));
317	int x = 0;	283	int x = 0;
318		284	register FIXP* bufferend = buffer+gain_size_factor;
319	for(i = 0; i < gain_size_factor; i++) {	285	while(buffer < bufferend )
320	buffer[i] = fixp_mult_su(buffer[i], pow128_tab[x]);	286	{
321	buffer[i] = fixp_pow2(buffer[i], gain_index+1);	287	*buffer = fixp_pow2(
		288	fixp_mult_su(*buffer, pow128_tab[x]),
		289	gain_index+1);
		290	buffer++;
322		291
323	x += step;	292	x += step;
324	gain_index += (x + 128) / 128 - 1;	293	gain_index += ( (x + 128) >> 7 ) - 1;
325	x = (x + 128) % 128;	294	x = ( (x + 128) & 127 );
326	}	295	}
327	}	296	}
328	}	297	}
@@ -349,12 +318,15 @@ static inline FIXP cplscale_math(FIXP x, int table, int i)
349	* @param out pointer to the output buffer	318	* @param out pointer to the output buffer
350	* @param chan 0: left or single channel, 1: right channel	319	* @param chan 0: left or single channel, 1: right channel
351	*/	320	*/
352	static inline void output_math(COOKContext *q, int16_t *out, int chan)	321	static inline void output_math(COOKContext *q, register int16_t *out, int chan)
353	{	322	{
354	int j;	323	register REAL_T * mono_output_ptr = q->mono_mdct_output;
355		324	register REAL_T * mono_output_end = mono_output_ptr + q->samples_per_channel;
356	for (j = 0; j < q->samples_per_channel; j++) {	325	out += chan;
357	out[chan + q->nb_channels * j] =	326	const int STEP = q->nb_channels;
358	av_clip(fixp_pow2(q->mono_mdct_output[j], -11), -32768, 32767);	327	while( mono_output_ptr < mono_output_end )
		328	{
		329	*out = CLIP_TO_15(fixp_pow2_neg(*mono_output_ptr++, 11));
		330	out += STEP;
359	}	331	}
360	}	332	}