From 51a8be1a0f11d329af99bf9c33a61af11d7a43ed Mon Sep 17 00:00:00 2001
From: Andree Buschmann <AndreeBuschmann@t-online.de>
Date: Mon, 15 Feb 2010 16:11:49 +0000
Subject: Further optimization of atrac3 codec. Refacturate
 gainCompensateAndOverlap(), avoid multiplication if not needed, unroll loops.
 Speeds up codec by 1.1 MHz (+2%) on ARM.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@24668 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libatrac/atrac3.c | 183 +++++++++++++++++++++++++++++++++---------
 1 file changed, 145 insertions(+), 38 deletions(-)

(limited to 'apps/codecs')

diff --git a/apps/codecs/libatrac/atrac3.c b/apps/codecs/libatrac/atrac3.c
index 18a4f120e1..ff6e639a3b 100644
--- a/apps/codecs/libatrac/atrac3.c
+++ b/apps/codecs/libatrac/atrac3.c
@@ -59,6 +59,7 @@ static int32_t          qmf_window[48] IBSS_ATTR;
 static VLC              spectral_coeff_tab[7];
 static channel_unit     channel_units[2] IBSS_ATTR_LARGE_IRAM;
 
+
 /**
  * Matrixing within quadrature mirror synthesis filter.
  *
@@ -91,6 +92,7 @@ static channel_unit     channel_units[2] IBSS_ATTR_LARGE_IRAM;
     }
 #endif
 
+
 /**
  * Matrixing within quadrature mirror synthesis filter.
  *
@@ -195,6 +197,7 @@ static channel_unit     channel_units[2] IBSS_ATTR_LARGE_IRAM;
     }
 #endif
 
+
 /**
  * IMDCT windowing.
  *
@@ -214,6 +217,7 @@ atrac3_imdct_windowing(int32_t *buffer,
     }
 }
 
+
 /**
  * Quadrature mirror synthesis filter.
  *
@@ -240,12 +244,13 @@ static void iqmf (int32_t *inlo, int32_t *inhi, unsigned int nIn, int32_t *pOut,
     memcpy(delayBuf, temp + (nIn << 1), 46*sizeof(int32_t));
 }
 
+
 /**
  * Regular 512 points IMDCT without overlapping, with the exception of the swapping of odd bands
  * caused by the reverse spectra of the QMF.
  *
- * @param pInput    float input
- * @param pOutput   float output
+ * @param pInput    input
+ * @param pOutput   output
  * @param odd_band  1 if the band is an odd band
  */
 
@@ -274,7 +279,7 @@ static int decode_bytes(const uint8_t* inbuffer, uint8_t* out, int bytes){
     uint32_t* obuf = (uint32_t*) out;
 
 #if ((defined(TEST) || defined(SIMULATOR)) && !defined(CPU_ARM))
-    off = 0; //no check for memory alignment of inbuffer
+    off = 0; /* no check for memory alignment of inbuffer */
 #else
     off = (intptr_t)inbuffer & 3;
 #endif /* TEST */
@@ -306,6 +311,7 @@ static void init_atrac3_transforms(void) {
     }
 }
 
+
 /**
  * Mantissa decoding
  *
@@ -338,7 +344,7 @@ static void readQuantSpectralCoeffs (GetBitContext *gb, int selector, int coding
         } else {
             for (cnt = 0; cnt < numCodes; cnt++) {
                 if (numBits)
-                    code = get_bits(gb, numBits); //numBits is always 4 in this case
+                    code = get_bits(gb, numBits); /* numBits is always 4 in this case */
                 else
                     code = 0;
                 mantissas[cnt*2] = seTab_0[code >> 2];
@@ -366,6 +372,7 @@ static void readQuantSpectralCoeffs (GetBitContext *gb, int selector, int coding
     }
 }
 
+
 /**
  * Restore the quantized band spectrum coefficients
  *
@@ -382,8 +389,8 @@ int decodeSpectrum (GetBitContext *gb, int32_t *pOut)
     int   mantissas[128];
     int32_t SF;
 
-    numSubbands = get_bits(gb, 5); // number of coded subbands
-    codingMode = get_bits1(gb); // coding Mode: 0 - VLC/ 1-CLC
+    numSubbands = get_bits(gb, 5); /* number of coded subbands */
+    codingMode = get_bits1(gb); /* coding Mode: 0 - VLC/ 1-CLC */
 
     /* Get the VLC selector table for the subbands, 0 means not coded. */
     for (cnt = 0; cnt <= numSubbands; cnt++)
@@ -437,6 +444,7 @@ int decodeSpectrum (GetBitContext *gb, int32_t *pOut)
     return numSubbands;
 }
 
+
 /**
  * Restore the quantized tonal components
  *
@@ -517,6 +525,7 @@ static int decodeTonalComponents (GetBitContext *gb, tonal_component *pComponent
     return component_count;
 }
 
+
 /**
  * Decode gain parameters for the coded bands
  *
@@ -554,21 +563,125 @@ static int decodeGainControl (GetBitContext *gb, gain_block *pGb, int numBands)
     return 0;
 }
 
+
+/**
+ * Apply fix (constant) gain and overlap for sample[start...255].
+ *
+ * @param pIn           input buffer
+ * @param pPrev         previous buffer to perform overlap against
+ * @param pOut          output buffer
+ * @param start         index to start with (always a multiple of 8)
+ * @param gain          gain to apply
+ */
+ 
+static void applyFixGain (int32_t *pIn, int32_t *pPrev, int32_t *pOut, 
+                          int32_t start, int32_t gain)
+{
+    int32_t i = start;
+    
+    /* start is always a multiple of 8 and therefore allows us to unroll the 
+     * loop to 8 calculation per loop 
+     */
+    if (ONE_16 == gain) {
+        /* gain1 = 1.0 -> no multiplication needed, just adding */
+        /* Remark: This path is called >90%. */
+        do {
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+            pOut[i] = pIn[i] + pPrev[i]; i++;
+        } while (i<256);
+    } else {
+        /* gain1 != 1.0 -> we need to do a multiplication */
+        /* Remark: This path is called seldom. */
+        do {
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+            pOut[i] = fixmul16(pIn[i], gain) + pPrev[i]; i++;
+        } while (i<256);
+    }
+}
+
+
+/**
+ * Apply variable gain and overlap. Returns sample index after applying gain,
+ * resulting sample index is always a multiple of 8.
+ *
+ * @param pIn           input buffer
+ * @param pPrev         previous buffer to perform overlap against
+ * @param pOut          output buffer
+ * @param start         index to start with (always a multiple of 8)
+ * @param end           end index for first loop (always a multiple of 8)
+ * @param gain1         current bands gain to apply
+ * @param gain2         next bands gain to apply
+ * @param gain_inc      stepwise adaption from gain1 to gain2
+ */
+ 
+static int applyVariableGain (int32_t *pIn, int32_t *pPrev, int32_t *pOut, 
+                              int32_t start, int32_t end, 
+                              int32_t gain1, int32_t gain2, int32_t gain_inc)
+{
+    int32_t i = start;
+    
+    /* Apply fix gains until end index is reached */
+    do {
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+        pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    } while (i < end);
+
+    /* Interpolation is done over next eight samples */
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    pOut[i] = fixmul16((fixmul16(pIn[i], gain1) + pPrev[i]), gain2); i++;
+    gain2 = fixmul16(gain2, gain_inc);
+    
+    return i;
+}
+
+
 /**
  * Apply gain parameters and perform the MDCT overlapping part
  *
- * @param pIn           input float buffer
- * @param pPrev         previous float buffer to perform overlap against
- * @param pOut          output float buffer
+ * @param pIn           input buffer
+ * @param pPrev         previous buffer to perform overlap against
+ * @param pOut          output buffer
  * @param pGain1        current band gain info
  * @param pGain2        next band gain info
  */
 
-static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOut, gain_info *pGain1, gain_info *pGain2)
+static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOut, 
+                                      gain_info *pGain1, gain_info *pGain2)
 {
     /* gain compensation function */
     int32_t  gain1, gain2, gain_inc;
-    int   cnt, numdata, nsample, startLoc, endLoc;
+    int   cnt, numdata, nsample, startLoc;
 
     if (pGain2->num_gain_data == 0)
         gain1 = ONE_16;
@@ -576,41 +689,35 @@ static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOu
         gain1 = gain_tab1[pGain2->levcode[0]];
 
     if (pGain1->num_gain_data == 0) {
-        for (cnt = 0; cnt < 256; cnt++)
-            pOut[cnt] = fixmul16(pIn[cnt], gain1) + pPrev[cnt];
+        /* Remark: This path is called >90%. */
+        /* Apply gain for all samples from 0...255 */
+        applyFixGain(pIn, pPrev, pOut, 0, gain1);
     } else {
+        /* Remark: This path is called seldom. */
         numdata = pGain1->num_gain_data;
         pGain1->loccode[numdata] = 32;
         pGain1->levcode[numdata] = 4;
-
-        nsample = 0; // current sample = 0
+        
+        nsample = 0; /* starting loop with =0 */
 
         for (cnt = 0; cnt < numdata; cnt++) {
             startLoc = pGain1->loccode[cnt] * 8;
-            endLoc = startLoc + 8;
 
-            gain2 = gain_tab1[pGain1->levcode[cnt]];
+            gain2    = gain_tab1[pGain1->levcode[cnt]];
             gain_inc = gain_tab2[(pGain1->levcode[cnt+1] - pGain1->levcode[cnt])+15];
 
-            /* interpolate */
-            for (; nsample < startLoc; nsample++)
-                pOut[nsample] = fixmul16((fixmul16(pIn[nsample], gain1) + pPrev[nsample]), gain2);
-
-            /* interpolation is done over eight samples */
-            for (; nsample < endLoc; nsample++) {
-                pOut[nsample] = fixmul16((fixmul16(pIn[nsample], gain1) + pPrev[nsample]),gain2);
-                gain2 = fixmul16(gain2, gain_inc);
-            }
+            /* Apply variable gain (gain1 -> gain2) to samples */
+            nsample  = applyVariableGain(pIn, pPrev, pOut, nsample, startLoc, gain1, gain2, gain_inc);
         }
-
-        for (; nsample < 256; nsample++)
-            pOut[nsample] = fixmul16(pIn[nsample], gain1) + pPrev[nsample];
+        /* Apply gain for the residual samples from nsample...255 */
+        applyFixGain(pIn, pPrev, pOut, nsample, gain1);
     }
 
     /* Delay for the overlapping part. */
     memcpy(pPrev, &pIn[256], 256*sizeof(int32_t));
 }
 
+
 /**
  * Combine the tonal band spectrum and regular band spectrum
  * Return position of the last tonal coefficient
@@ -639,6 +746,7 @@ static int addTonalComponents (int32_t *pSpectrum, int numComponents, tonal_comp
     return lastPos;
 }
 
+
 /**
  * Linear equidistant interpolation between two points x and y. 7 interpolation
  * points can be calculated. Result is scaled by <<16.
@@ -712,7 +820,7 @@ static void reverseMatrixing(int32_t *su1, int32_t *su2, int *pPrevCode, int *pC
                 }
                 break;
             default:
-                //assert(0);
+                /* assert(0) */;
                 break;
         }
     }
@@ -755,18 +863,16 @@ static void channelWeighting (int32_t *su1, int32_t *su2, int *p3)
     }
 }
 
-
 /**
  * Decode a Sound Unit
  *
  * @param gb            the GetBit context
  * @param pSnd          the channel unit to be used
- * @param pOut          the decoded samples before IQMF in float representation
+ * @param pOut          the decoded samples before IQMF
  * @param channelNum    channel number
  * @param codingMode    the coding mode (JOINT_STEREO or regular stereo/mono)
  */
 
-
 static int decodeChannelSoundUnit (GetBitContext *gb, channel_unit *pSnd, int32_t *pOut, int channelNum, int codingMode)
 {
     int   band, result=0, numSubbands, lastTonal, numBands;
@@ -807,8 +913,9 @@ static int decodeChannelSoundUnit (GetBitContext *gb, channel_unit *pSnd, int32_
         /* Perform the IMDCT step without overlapping. */
         if (band <= numBands) {
             IMLT(&(pSnd->spectrum[band*256]), pSnd->IMDCT_buf);
-        } else
+        } else {
             memset(pSnd->IMDCT_buf, 0, 512 * sizeof(int32_t));
+        }
 
         /* gain compensation and overlapping */
         gainCompensateAndOverlap (pSnd->IMDCT_buf, &(pSnd->prevFrame[band*256]), &(pOut[band*256]),
@@ -982,12 +1089,12 @@ int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx)
     /* Take care of the codec-specific extradata. */
     if (rmctx->extradata_size == 14) {
         /* Parse the extradata, WAV format */
-        DEBUGF("[0-1] %d\n",rm_get_uint16le(&edata_ptr[0]));  //Unknown value always 1
+        DEBUGF("[0-1] %d\n",rm_get_uint16le(&edata_ptr[0]));    /* Unknown value always 1 */
         q->samples_per_channel = rm_get_uint32le(&edata_ptr[2]);
         q->codingMode = rm_get_uint16le(&edata_ptr[6]);
-        DEBUGF("[8-9] %d\n",rm_get_uint16le(&edata_ptr[8]));  //Dupe of coding mode
-        q->frame_factor = rm_get_uint16le(&edata_ptr[10]);  //Unknown always 1
-        DEBUGF("[12-13] %d\n",rm_get_uint16le(&edata_ptr[12]));  //Unknown always 0
+        DEBUGF("[8-9] %d\n",rm_get_uint16le(&edata_ptr[8]));    /* Dupe of coding mode */
+        q->frame_factor = rm_get_uint16le(&edata_ptr[10]);      /* Unknown always 1 */
+        DEBUGF("[12-13] %d\n",rm_get_uint16le(&edata_ptr[12])); /* Unknown always 0 */
 
         /* setup */
         q->samples_per_frame = 1024 * q->channels;
-- 
cgit v1.2.3