summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andree Buschmann <AndreeBuschmann@t-online.de> 2010-07-09 18:32:37 +0000
committer Andree Buschmann <AndreeBuschmann@t-online.de> 2010-07-09 18:32:37 +0000
commit 811af5968ae3015af04804aa774728f90a897c05 (patch)
tree 18aadf8c4a535fef2de069e968fcb23824fdd20a
parent f3e0207384af483d31e43e3d1f326a138007faf8 (diff)
download rockbox-811af5968ae3015af04804aa774728f90a897c05.tar.gz
rockbox-811af5968ae3015af04804aa774728f90a897c05.zip
Submit FS#11461. Major speedup for the AAC-HE profile (PP5002 +20%, PP5020 +15%, PP5022 +19%, MCF5249 +35%, MCF5250 +80%), though it is still not realtime on most targets. This change refactors the SBR filters and the DCT extensively, switches to our optimized codeclib FFT, and tweaks IRAM usage.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@27358 a1c6a512-1295-4272-9138-f99709370657
-rw-r--r-- apps/codecs/libfaad/common.c 2
-rw-r--r-- apps/codecs/libfaad/sbr_dct.c 345
-rw-r--r-- apps/codecs/libfaad/sbr_dct.h 2
-rw-r--r-- apps/codecs/libfaad/sbr_dec.c 6
-rw-r--r-- apps/codecs/libfaad/sbr_qmf.c 371
-rw-r--r-- apps/codecs/libfaad/sbr_qmf_c.h 2
-rw-r--r-- apps/codecs/libfaad/specrec.c 18
7 files changed, 202 insertions, 544 deletions
diff --git a/apps/codecs/libfaad/common.c b/apps/codecs/libfaad/common.c
index 025c8f8c5b..c838c88d33 100644
--- a/apps/codecs/libfaad/common.c
+++ b/apps/codecs/libfaad/common.c
@@ -319,7 +319,7 @@ static const uint32_t pow2_tab[] ICONST_ATTR = {
319 UFIX_CONST(2.000000000000000,POWTBL_PRECIS) 319 UFIX_CONST(2.000000000000000,POWTBL_PRECIS)
320}; 320};
321 321
322static const real_t log2_tab[] ICONST_ATTR = { 322static const real_t log2_tab[] ICONST_ATTR_FAAD_LARGE_IRAM = {
323 REAL_CONST(0.000000000000000), REAL_CONST(0.022367813028455), REAL_CONST(0.044394119358453), 323 REAL_CONST(0.000000000000000), REAL_CONST(0.022367813028455), REAL_CONST(0.044394119358453),
324 REAL_CONST(0.066089190457772), REAL_CONST(0.087462841250339), REAL_CONST(0.108524456778169), 324 REAL_CONST(0.066089190457772), REAL_CONST(0.087462841250339), REAL_CONST(0.108524456778169),
325 REAL_CONST(0.129283016944966), REAL_CONST(0.149747119504682), REAL_CONST(0.169925001442312), 325 REAL_CONST(0.129283016944966), REAL_CONST(0.149747119504682), REAL_CONST(0.169925001442312),
diff --git a/apps/codecs/libfaad/sbr_dct.c b/apps/codecs/libfaad/sbr_dct.c
index c916a82a61..123514f226 100644
--- a/apps/codecs/libfaad/sbr_dct.c
+++ b/apps/codecs/libfaad/sbr_dct.c
@@ -26,6 +26,9 @@
26**/ 26**/
27 27
28#include "common.h" 28#include "common.h"
29#include "../lib/fft.h"
30#include "../lib/mdct_lookup.h"
31
29 32
30#ifdef SBR_DEC 33#ifdef SBR_DEC
31 34
@@ -1447,267 +1450,9 @@ void DCT2_32_unscaled(real_t *y, real_t *x)
1447 y[17] = f286 - f285; 1450 y[17] = f286 - f285;
1448} 1451}
1449 1452
1450#else 1453#else /* #ifdef SBR_LOW_POWER */
1451
1452
1453#define n 32
1454#define log2n 5
1455
1456// w_array_real[i] = cos(2*M_PI*i/32)
1457static const real_t w_array_real[] = {
1458 FRAC_CONST(1.000000000000000), FRAC_CONST(0.980785279337272),
1459 FRAC_CONST(0.923879528329380), FRAC_CONST(0.831469603195765),
1460 FRAC_CONST(0.707106765732237), FRAC_CONST(0.555570210304169),
1461 FRAC_CONST(0.382683402077046), FRAC_CONST(0.195090284503576),
1462 FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090370246552),
1463 FRAC_CONST(-0.382683482845162), FRAC_CONST(-0.555570282993553),
1464 FRAC_CONST(-0.707106827549476), FRAC_CONST(-0.831469651765257),
1465 FRAC_CONST(-0.923879561784627), FRAC_CONST(-0.980785296392607)
1466};
1467
1468// w_array_imag[i] = sin(-2*M_PI*i/32)
1469static const real_t w_array_imag[] = {
1470 FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090327375064),
1471 FRAC_CONST(-0.382683442461104), FRAC_CONST(-0.555570246648862),
1472 FRAC_CONST(-0.707106796640858), FRAC_CONST(-0.831469627480512),
1473 FRAC_CONST(-0.923879545057005), FRAC_CONST(-0.980785287864940),
1474 FRAC_CONST(-1.000000000000000), FRAC_CONST(-0.980785270809601),
1475 FRAC_CONST(-0.923879511601754), FRAC_CONST(-0.831469578911016),
1476 FRAC_CONST(-0.707106734823616), FRAC_CONST(-0.555570173959476),
1477 FRAC_CONST(-0.382683361692986), FRAC_CONST(-0.195090241632088)
1478};
1479
1480// FFT decimation in frequency
1481// 4*16*2+16=128+16=144 multiplications
1482// 6*16*2+10*8+4*16*2=192+80+128=400 additions
1483static void fft_dif(real_t * Real, real_t * Imag)
1484{
1485 real_t w_real, w_imag; // For faster access
1486 real_t point1_real, point1_imag, point2_real, point2_imag; // For faster access
1487 uint32_t j, i, i2, w_index; // Counters
1488
1489 // First 2 stages of 32 point FFT decimation in frequency
1490 // 4*16*2=64*2=128 multiplications
1491 // 6*16*2=96*2=192 additions
1492 // Stage 1 of 32 point FFT decimation in frequency
1493 for (i = 0; i < 16; i++)
1494 {
1495 point1_real = Real[i];
1496 point1_imag = Imag[i];
1497 i2 = i+16;
1498 point2_real = Real[i2];
1499 point2_imag = Imag[i2];
1500
1501 w_real = w_array_real[i];
1502 w_imag = w_array_imag[i];
1503
1504 // temp1 = x[i] - x[i2]
1505 point1_real -= point2_real;
1506 point1_imag -= point2_imag;
1507
1508 // x[i1] = x[i] + x[i2]
1509 Real[i] += point2_real;
1510 Imag[i] += point2_imag;
1511
1512 // x[i2] = (x[i] - x[i2]) * w
1513 Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
1514 Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
1515 }
1516 // Stage 2 of 32 point FFT decimation in frequency
1517 for (j = 0, w_index = 0; j < 8; j++, w_index += 2)
1518 {
1519 w_real = w_array_real[w_index];
1520 w_imag = w_array_imag[w_index];
1521
1522 i = j;
1523 point1_real = Real[i];
1524 point1_imag = Imag[i];
1525 i2 = i+8;
1526 point2_real = Real[i2];
1527 point2_imag = Imag[i2];
1528
1529 // temp1 = x[i] - x[i2]
1530 point1_real -= point2_real;
1531 point1_imag -= point2_imag;
1532
1533 // x[i1] = x[i] + x[i2]
1534 Real[i] += point2_real;
1535 Imag[i] += point2_imag;
1536 1454
1537 // x[i2] = (x[i] - x[i2]) * w 1455static const real_t dct4_64_tab[] ICONST_ATTR = {
1538 Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
1539 Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
1540
1541 i = j+16;
1542 point1_real = Real[i];
1543 point1_imag = Imag[i];
1544 i2 = i+8;
1545 point2_real = Real[i2];
1546 point2_imag = Imag[i2];
1547
1548 // temp1 = x[i] - x[i2]
1549 point1_real -= point2_real;
1550 point1_imag -= point2_imag;
1551
1552 // x[i1] = x[i] + x[i2]
1553 Real[i] += point2_real;
1554 Imag[i] += point2_imag;
1555
1556 // x[i2] = (x[i] - x[i2]) * w
1557 Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag));
1558 Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real));
1559 }
1560
1561 // Stage 3 of 32 point FFT decimation in frequency
1562 // 2*4*2=16 multiplications
1563 // 4*4*2+6*4*2=10*8=80 additions
1564 for (i = 0; i < n; i += 8)
1565 {
1566 i2 = i+4;
1567 point1_real = Real[i];
1568 point1_imag = Imag[i];
1569
1570 point2_real = Real[i2];
1571 point2_imag = Imag[i2];
1572
1573 // out[i1] = point1 + point2
1574 Real[i] += point2_real;
1575 Imag[i] += point2_imag;
1576
1577 // out[i2] = point1 - point2
1578 Real[i2] = point1_real - point2_real;
1579 Imag[i2] = point1_imag - point2_imag;
1580 }
1581 w_real = w_array_real[4]; // = sqrt(2)/2
1582 // w_imag = -w_real; // = w_array_imag[4]; // = -sqrt(2)/2
1583 for (i = 1; i < n; i += 8)
1584 {
1585 i2 = i+4;
1586 point1_real = Real[i];
1587 point1_imag = Imag[i];
1588
1589 point2_real = Real[i2];
1590 point2_imag = Imag[i2];
1591
1592 // temp1 = x[i] - x[i2]
1593 point1_real -= point2_real;
1594 point1_imag -= point2_imag;
1595
1596 // x[i1] = x[i] + x[i2]
1597 Real[i] += point2_real;
1598 Imag[i] += point2_imag;
1599
1600 // x[i2] = (x[i] - x[i2]) * w
1601 Real[i2] = MUL_F(point1_real+point1_imag, w_real);
1602 Imag[i2] = MUL_F(point1_imag-point1_real, w_real);
1603 }
1604 for (i = 2; i < n; i += 8)
1605 {
1606 i2 = i+4;
1607 point1_real = Real[i];
1608 point1_imag = Imag[i];
1609
1610 point2_real = Real[i2];
1611 point2_imag = Imag[i2];
1612
1613 // x[i] = x[i] + x[i2]
1614 Real[i] += point2_real;
1615 Imag[i] += point2_imag;
1616
1617 // x[i2] = (x[i] - x[i2]) * (-i)
1618 Real[i2] = point1_imag - point2_imag;
1619 Imag[i2] = point2_real - point1_real;
1620 }
1621 w_real = w_array_real[12]; // = -sqrt(2)/2
1622 // w_imag = w_real; // = w_array_imag[12]; // = -sqrt(2)/2
1623 for (i = 3; i < n; i += 8)
1624 {
1625 i2 = i+4;
1626 point1_real = Real[i];
1627 point1_imag = Imag[i];
1628
1629 point2_real = Real[i2];
1630 point2_imag = Imag[i2];
1631
1632 // temp1 = x[i] - x[i2]
1633 point1_real -= point2_real;
1634 point1_imag -= point2_imag;
1635
1636 // x[i1] = x[i] + x[i2]
1637 Real[i] += point2_real;
1638 Imag[i] += point2_imag;
1639
1640 // x[i2] = (x[i] - x[i2]) * w
1641 Real[i2] = MUL_F(point1_real-point1_imag, w_real);
1642 Imag[i2] = MUL_F(point1_real+point1_imag, w_real);
1643 }
1644
1645
1646 // Stage 4 of 32 point FFT decimation in frequency (no multiplications)
1647 // 16*4=64 additions
1648 for (i = 0; i < n; i += 4)
1649 {
1650 i2 = i+2;
1651 point1_real = Real[i];
1652 point1_imag = Imag[i];
1653
1654 point2_real = Real[i2];
1655 point2_imag = Imag[i2];
1656
1657 // x[i1] = x[i] + x[i2]
1658 Real[i] += point2_real;
1659 Imag[i] += point2_imag;
1660
1661 // x[i2] = x[i] - x[i2]
1662 Real[i2] = point1_real - point2_real;
1663 Imag[i2] = point1_imag - point2_imag;
1664 }
1665 for (i = 1; i < n; i += 4)
1666 {
1667 i2 = i+2;
1668 point1_real = Real[i];
1669 point1_imag = Imag[i];
1670
1671 point2_real = Real[i2];
1672 point2_imag = Imag[i2];
1673
1674 // x[i] = x[i] + x[i2]
1675 Real[i] += point2_real;
1676 Imag[i] += point2_imag;
1677
1678 // x[i2] = (x[i] - x[i2]) * (-i)
1679 Real[i2] = point1_imag - point2_imag;
1680 Imag[i2] = point2_real - point1_real;
1681 }
1682
1683 // Stage 5 of 32 point FFT decimation in frequency (no multiplications)
1684 // 16*4=64 additions
1685 for (i = 0; i < n; i += 2)
1686 {
1687 i2 = i+1;
1688 point1_real = Real[i];
1689 point1_imag = Imag[i];
1690
1691 point2_real = Real[i2];
1692 point2_imag = Imag[i2];
1693
1694 // out[i1] = point1 + point2
1695 Real[i] += point2_real;
1696 Imag[i] += point2_imag;
1697
1698 // out[i2] = point1 - point2
1699 Real[i2] = point1_real - point2_real;
1700 Imag[i2] = point1_imag - point2_imag;
1701 }
1702
1703#ifdef REORDER_IN_FFT
1704 FFTReorder(Real, Imag);
1705#endif // #ifdef REORDER_IN_FFT
1706}
1707#undef n
1708#undef log2n
1709
1710static const real_t dct4_64_tab[] = {
1711 COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507), 1456 COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507),
1712 COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537), 1457 COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537),
1713 COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708), 1458 COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708),
@@ -1806,57 +1551,65 @@ static const real_t dct4_64_tab[] = {
1806 COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382) 1551 COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382)
1807}; 1552};
1808 1553
1554// Table adapted from codeclib to fit into IRAM
1555const uint32_t dct4_revtab[32] ICONST_ATTR = {
1556 0, 24, 12, 22, 6, 30, 11, 19, 3, 27, 15, 21, 5, 29, 9, 17,
1557 1, 25, 13, 23, 7, 31, 10, 18, 2, 26, 14, 20, 4, 28, 8, 16};
1558
1809/* size 64 only! */ 1559/* size 64 only! */
1810void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag) 1560void dct4_kernel(real_t *real, real_t *imag)
1811{ 1561{
1812 // Tables with bit reverse values for 5 bits, bit reverse of i at i-th position 1562 uint32_t i, idx;
1813 const uint8_t bit_rev_tab[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 }; 1563 real_t x_re, x_im, tmp;
1814 uint16_t i, i_rev; 1564 FFTComplex xc[32]; /* used for calling codeclib's fft implementation */
1815 1565
1816 /* Step 2: modulate */ 1566 /* Step 2: modulate and pre-rotate for codeclib's fft implementation */
1817 // 3*32=96 multiplications 1567 // 3*32=96 multiplications
1818 // 3*32=96 additions 1568 // 3*32=96 additions
1819 for (i = 0; i < 32; i++) 1569 for (i = 0; i < 32; i++)
1820 { 1570 {
1821 real_t x_re, x_im, tmp; 1571 idx = dct4_revtab[i];
1822 x_re = in_real[i]; 1572 x_re = real[i];
1823 x_im = in_imag[i]; 1573 x_im = imag[i];
1824 tmp = MUL_C(x_re + x_im, dct4_64_tab[i]); 1574 tmp = MUL_C(x_re + x_im, dct4_64_tab[i ]);
1825 in_real[i] = MUL_C(x_im, dct4_64_tab[i + 64]) + tmp; 1575 xc[idx].re = MUL_C(x_im , dct4_64_tab[i + 64]) + tmp;
1826 in_imag[i] = MUL_C(x_re, dct4_64_tab[i + 32]) + tmp; 1576 xc[idx].im = MUL_C(x_re , dct4_64_tab[i + 32]) + tmp;
1827 } 1577 }
1828 1578
1829 /* Step 3: FFT, but with output in bit reverse order */ 1579 /* Step 3: FFT (codeclib's implementation) */
1830 fft_dif(in_real, in_imag); 1580 ff_fft_calc_c(5, xc);
1831 1581
1832 /* Step 4: modulate + bitreverse reordering */ 1582 /* Step 4: modulate + reordering */
1833 // 3*31+2=95 multiplications 1583 // 3*31+2=95 multiplications
1834 // 3*31+2=95 additions 1584 // 3*31+2=95 additions
1835 for (i = 0; i < 16; i++) 1585 x_re = xc[0].re;
1586 x_im = xc[0].im;
1587 tmp = MUL_C(x_re + x_im, dct4_64_tab[0 + 3*32]);
1588 real[0] = MUL_C(x_im , dct4_64_tab[0 + 5*32]) + tmp;
1589 imag[0] = MUL_C(x_re , dct4_64_tab[0 + 4*32]) + tmp;
1590 for (i = 1; i < 16; i++)
1836 { 1591 {
1837 real_t x_re, x_im, tmp; 1592 idx = 32-i;
1838 i_rev = bit_rev_tab[i]; 1593 x_re = xc[idx].re;
1839 x_re = in_real[i_rev]; 1594 x_im = xc[idx].im;
1840 x_im = in_imag[i_rev]; 1595 tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
1841 1596 real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp;
1842 tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); 1597 imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp;
1843 out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp;
1844 out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
1845 } 1598 }
1846 // i = 16, i_rev = 1 = rev(16); 1599 // i = 16, idx = 16 = reorder_tab[16];
1847 out_imag[16] = MUL_C(in_imag[1] - in_real[1], dct4_64_tab[16 + 3*32]); 1600 x_re = xc[16].re;
1848 out_real[16] = MUL_C(in_real[1] + in_imag[1], dct4_64_tab[16 + 3*32]); 1601 x_im = xc[16].im;
1602 imag[16] = MUL_C(x_im - x_re, dct4_64_tab[16 + 3*32]);
1603 real[16] = MUL_C(x_re + x_im, dct4_64_tab[16 + 3*32]);
1849 for (i = 17; i < 32; i++) 1604 for (i = 17; i < 32; i++)
1850 { 1605 {
1851 real_t x_re, x_im, tmp; 1606 idx = 32-i;
1852 i_rev = bit_rev_tab[i]; 1607 x_re = xc[idx].re;
1853 x_re = in_real[i_rev]; 1608 x_im = xc[idx].im;
1854 x_im = in_imag[i_rev]; 1609 tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]);
1855 tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); 1610 real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp;
1856 out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp; 1611 imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp;
1857 out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp;
1858 } 1612 }
1859
1860} 1613}
1861 1614
1862void DST4_32(real_t *y, real_t *x) 1615void DST4_32(real_t *y, real_t *x)
@@ -2266,6 +2019,6 @@ void DST4_32(real_t *y, real_t *x)
2266 y[0] = MUL_R(REAL_CONST(20.3738781672314530), f304); 2019 y[0] = MUL_R(REAL_CONST(20.3738781672314530), f304);
2267} 2020}
2268 2021
2269#endif 2022#endif /* #ifdef SBR_LOW_POWER */
2270 2023
2271#endif 2024#endif /* #ifdef SBR_DEC */
diff --git a/apps/codecs/libfaad/sbr_dct.h b/apps/codecs/libfaad/sbr_dct.h
index 124f159d5b..95394df307 100644
--- a/apps/codecs/libfaad/sbr_dct.h
+++ b/apps/codecs/libfaad/sbr_dct.h
@@ -32,7 +32,7 @@
32extern "C" { 32extern "C" {
33#endif 33#endif
34 34
35void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag); 35void dct4_kernel(real_t *real, real_t *imag);
36 36
37void DCT3_32_unscaled(real_t *y, real_t *x); 37void DCT3_32_unscaled(real_t *y, real_t *x);
38void DCT4_32(real_t *y, real_t *x); 38void DCT4_32(real_t *y, real_t *x);
diff --git a/apps/codecs/libfaad/sbr_dec.c b/apps/codecs/libfaad/sbr_dec.c
index 97f1d9b647..60bb2a6bdb 100644
--- a/apps/codecs/libfaad/sbr_dec.c
+++ b/apps/codecs/libfaad/sbr_dec.c
@@ -454,6 +454,7 @@ uint8_t sbrDecodeCoupleFrame(sbr_info *sbr, real_t *left_chan, real_t *right_cha
454} 454}
455 455
456ALIGN qmf_t X[MAX_NTSR][64]; 456ALIGN qmf_t X[MAX_NTSR][64];
457
457uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel, 458uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel,
458 const uint8_t just_seeked, const uint8_t downSampledSBR) 459 const uint8_t just_seeked, const uint8_t downSampledSBR)
459{ 460{
@@ -520,9 +521,8 @@ uint8_t sbrDecodeSingleFrame(sbr_info *sbr, real_t *channel,
520 return 0; 521 return 0;
521} 522}
522 523
523 524ALIGN qmf_t X_left[MAX_NTSRHFG][64];// = {{0}};
524ALIGN qmf_t X_left[38][64];// = {{0}}; 525ALIGN qmf_t X_right[MAX_NTSRHFG][64];// = {{0}}; /* must set this to 0 */
525ALIGN qmf_t X_right[38][64];// = {{0}}; /* must set this to 0 */
526 526
527#if (defined(PS_DEC) || defined(DRM_PS)) 527#if (defined(PS_DEC) || defined(DRM_PS))
528uint8_t sbrDecodeSingleFramePS(sbr_info *sbr, real_t *left_channel, real_t *right_channel, 528uint8_t sbrDecodeSingleFramePS(sbr_info *sbr, real_t *left_channel, real_t *right_channel,
diff --git a/apps/codecs/libfaad/sbr_qmf.c b/apps/codecs/libfaad/sbr_qmf.c
index 5486cd283d..7b70cc6a5e 100644
--- a/apps/codecs/libfaad/sbr_qmf.c
+++ b/apps/codecs/libfaad/sbr_qmf.c
@@ -38,6 +38,16 @@
38#include "sbr_qmf_c.h" 38#include "sbr_qmf_c.h"
39#include "sbr_syntax.h" 39#include "sbr_syntax.h"
40 40
41#ifdef FIXED_POINT
42 #define FAAD_SYNTHESIS_SCALE(X) ((X)>>1)
43 #define FAAD_ANALYSIS_SCALE1(X) ((X)>>4)
44 #define FAAD_ANALYSIS_SCALE2(X) ((X))
45#else
46 #define FAAD_SYNTHESIS_SCALE(X) ((X)*scale)
47 #define FAAD_ANALYSIS_SCALE1(X) ((X))
48 #define FAAD_ANALYSIS_SCALE2(X) (2.*(X))
49#endif
50
41qmfa_info *qmfa_init(uint8_t channels) 51qmfa_info *qmfa_init(uint8_t channels)
42{ 52{
43 qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info)); 53 qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info));
@@ -68,40 +78,44 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
68{ 78{
69 ALIGN real_t u[64]; 79 ALIGN real_t u[64];
70#ifndef SBR_LOW_POWER 80#ifndef SBR_LOW_POWER
71 static ALIGN real_t in_real[32], in_imag[32], out_real[32], out_imag[32]; 81 ALIGN real_t real[32];
82 ALIGN real_t imag[32];
72#else 83#else
73 ALIGN real_t y[32]; 84 ALIGN real_t y[32];
74#endif 85#endif
75 uint16_t in = 0; 86 qmf_t *pX;
76 uint8_t l; 87 uint32_t in = 0;
88 uint32_t l, idx0, idx1;
77 89
78 /* qmf subsample l */ 90 /* qmf subsample l */
79 for (l = 0; l < sbr->numTimeSlotsRate; l++) 91 for (l = 0; l < sbr->numTimeSlotsRate; l++)
80 { 92 {
81 int16_t n; 93 int32_t n;
82 94
83 /* shift input buffer x */ 95 /* shift input buffer x */
84 /* input buffer is not shifted anymore, x is implemented as double ringbuffer */ 96 /* input buffer is not shifted anymore, x is implemented as double ringbuffer */
85 //memmove(qmfa->x + 32, qmfa->x, (320-32)*sizeof(real_t)); 97 //memmove(qmfa->x + 32, qmfa->x, (320-32)*sizeof(real_t));
86 98
87 /* add new samples to input buffer x */ 99 /* add new samples to input buffer x */
88 for (n = 32 - 1; n >= 0; n--) 100 idx0 = qmfa->x_index + 31; idx1 = idx0 + 320;
101 for (n = 32 - 1; n >= 0; n-=4)
89 { 102 {
90#ifdef FIXED_POINT 103 qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
91 qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = (input[in++]) >> 4; 104 qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
92#else 105 qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
93 qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = input[in++]; 106 qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]);
94#endif
95 } 107 }
96 108
97 /* window and summation to create array u */ 109 /* window and summation to create array u */
98 for (n = 0; n < 64; n++) 110 for (n = 0; n < 64; n++)
99 { 111 {
100 u[n] = MUL_F(qmfa->x[qmfa->x_index + n], qmf_c[2*n]) + 112 idx0 = qmfa->x_index + n; idx1 = n * 2;
101 MUL_F(qmfa->x[qmfa->x_index + n + 64], qmf_c[2*(n + 64)]) + 113 u[n] = FAAD_ANALYSIS_SCALE1(
102 MUL_F(qmfa->x[qmfa->x_index + n + 128], qmf_c[2*(n + 128)]) + 114 MUL_F(qmfa->x[idx0 ], qmf_c[idx1]) +
103 MUL_F(qmfa->x[qmfa->x_index + n + 192], qmf_c[2*(n + 192)]) + 115 MUL_F(qmfa->x[idx0 + 64], qmf_c[idx1 + 2 * 64]) +
104 MUL_F(qmfa->x[qmfa->x_index + n + 256], qmf_c[2*(n + 256)]); 116 MUL_F(qmfa->x[idx0 + 128], qmf_c[idx1 + 2 * 128]) +
117 MUL_F(qmfa->x[idx0 + 192], qmf_c[idx1 + 2 * 192]) +
118 MUL_F(qmfa->x[idx0 + 256], qmf_c[idx1 + 2 * 256]));
105 } 119 }
106 120
107 /* update ringbuffer index */ 121 /* update ringbuffer index */
@@ -123,64 +137,52 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
123 { 137 {
124 if (n < kx) 138 if (n < kx)
125 { 139 {
126#ifdef FIXED_POINT 140 QMF_RE(X[l + offset][n]) = FAAD_ANALYSIS_SCALE2(u[n]);
127 QMF_RE(X[l + offset][n]) = u[n] /*<< 1*/;
128#else
129 QMF_RE(X[l + offset][n]) = 2. * u[n];
130#endif
131 } else { 141 } else {
132 QMF_RE(X[l + offset][n]) = 0; 142 QMF_RE(X[l + offset][n]) = 0;
133 } 143 }
134 } 144 }
135#else 145#else /* #ifdef SBR_LOW_POWER */
136 146
137 // Reordering of data moved from DCT_IV to here 147 // Reordering of data moved from DCT_IV to here
138 in_imag[31] = u[1]; 148 idx0 = 30; idx1 = 63;
139 in_real[0] = u[0]; 149 imag[31] = u[ 1]; real[ 0] = u[ 0];
140 for (n = 1; n < 31; n++) 150 for (n = 1; n < 31; n+=3)
141 { 151 {
142 in_imag[31 - n] = u[n+1]; 152 imag[idx0--] = u[n+1]; real[n ] = -u[idx1--];
143 in_real[n] = -u[64-n]; 153 imag[idx0--] = u[n+2]; real[n+1] = -u[idx1--];
154 imag[idx0--] = u[n+3]; real[n+2] = -u[idx1--];
144 } 155 }
145 in_imag[0] = u[32]; 156 imag[ 0] = u[32]; real[31] = -u[33];
146 in_real[31] = -u[33];
147 157
148 // dct4_kernel is DCT_IV without reordering which is done before and after FFT 158 // dct4_kernel is DCT_IV without reordering which is done before and after FFT
149 dct4_kernel(in_real, in_imag, out_real, out_imag); 159 dct4_kernel(real, imag);
150 160
151 // Reordering of data moved from DCT_IV to here 161 // Reordering of data moved from DCT_IV to here
152 for (n = 0; n < 16; n++) { 162 /* Step 1: Calculate all non-zero pairs */
153 if (2*n+1 < kx) { 163 pX = X[l + offset];
154#ifdef FIXED_POINT 164 for (n = 0; n < kx/2; n++) {
155 QMF_RE(X[l + offset][2*n]) = out_real[n]; 165 idx0 = 2*n; idx1 = idx0 + 1;
156 QMF_IM(X[l + offset][2*n]) = out_imag[n]; 166 QMF_RE(pX[idx0]) = FAAD_ANALYSIS_SCALE2( real[n ]);
157 QMF_RE(X[l + offset][2*n+1]) = -out_imag[31-n]; 167 QMF_IM(pX[idx0]) = FAAD_ANALYSIS_SCALE2( imag[n ]);
158 QMF_IM(X[l + offset][2*n+1]) = -out_real[31-n]; 168 QMF_RE(pX[idx1]) = FAAD_ANALYSIS_SCALE2(-imag[31-n]);
159#else 169 QMF_IM(pX[idx1]) = FAAD_ANALYSIS_SCALE2(-real[31-n]);
160 QMF_RE(X[l + offset][2*n]) = 2. * out_real[n];
161 QMF_IM(X[l + offset][2*n]) = 2. * out_imag[n];
162 QMF_RE(X[l + offset][2*n+1]) = -2. * out_imag[31-n];
163 QMF_IM(X[l + offset][2*n+1]) = -2. * out_real[31-n];
164#endif
165 } else {
166 if (2*n < kx) {
167#ifdef FIXED_POINT
168 QMF_RE(X[l + offset][2*n]) = out_real[n];
169 QMF_IM(X[l + offset][2*n]) = out_imag[n];
170#else
171 QMF_RE(X[l + offset][2*n]) = 2. * out_real[n];
172 QMF_IM(X[l + offset][2*n]) = 2. * out_imag[n];
173#endif
174 }
175 else {
176 QMF_RE(X[l + offset][2*n]) = 0;
177 QMF_IM(X[l + offset][2*n]) = 0;
178 }
179 QMF_RE(X[l + offset][2*n+1]) = 0;
180 QMF_IM(X[l + offset][2*n+1]) = 0;
181 }
182 } 170 }
183#endif 171 /* Step 2: Calculate a single pair with half zero'ed */
172 if (kx&1) {
173 idx0 = 2*n; idx1 = idx0 + 1;
174 QMF_RE(pX[idx0]) = FAAD_ANALYSIS_SCALE2( real[n]);
175 QMF_IM(pX[idx0]) = FAAD_ANALYSIS_SCALE2( imag[n]);
176 QMF_RE(pX[idx1]) = QMF_IM(pX[idx1]) = 0;
177 n++;
178 }
179 /* Step 3: All other are zero'ed */
180 for (; n < 16; n++) {
181 idx0 = 2*n; idx1 = idx0 + 1;
182 QMF_RE(pX[idx0]) = QMF_IM(pX[idx0]) = 0;
183 QMF_RE(pX[idx1]) = QMF_IM(pX[idx1]) = 0;
184 }
185#endif /* #ifdef SBR_LOW_POWER */
184 } 186 }
185} 187}
186 188
@@ -297,7 +299,7 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
297 for (k = 0; k < 32; k++) 299 for (k = 0; k < 32; k++)
298 { 300 {
299 output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) + 301 output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) +
300 MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[64 + 2*k]) + 302 MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[ 64 + 2*k]) +
301 MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) + 303 MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) +
302 MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) + 304 MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) +
303 MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) + 305 MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) +
@@ -384,17 +386,26 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
384 qmfs->v_index = (1280-128); 386 qmfs->v_index = (1280-128);
385 } 387 }
386} 388}
387#else 389#else /* #ifdef SBR_LOW_POWER */
390
391#define FAAD_CMPLX_PRETWIDDLE_SUB(k) \
392 (MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - \
393 MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k]))) \
394
395#define FAAD_CMPLX_PRETWIDDLE_ADD(k) \
396 (MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + \
397 MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k]))) \
398
388void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], 399void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
389 real_t *output) 400 real_t *output)
390{ 401{
391 ALIGN real_t x1[32], x2[32]; 402 ALIGN real_t x1[32];
403 ALIGN real_t x2[32];
392#ifndef FIXED_POINT 404#ifndef FIXED_POINT
393 real_t scale = 1.f/64.f; 405 real_t scale = 1.f/64.f;
394#endif 406#endif
395 int16_t n, k, out = 0; 407 int32_t n, k, idx0, idx1, out = 0;
396 uint8_t l; 408 uint32_t l;
397
398 409
399 /* qmf subsample l */ 410 /* qmf subsample l */
400 for (l = 0; l < sbr->numTimeSlotsRate; l++) 411 for (l = 0; l < sbr->numTimeSlotsRate; l++)
@@ -405,43 +416,43 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
405 416
406 /* calculate 64 samples */ 417 /* calculate 64 samples */
407 /* complex pre-twiddle */ 418 /* complex pre-twiddle */
408 for (k = 0; k < 32; k++) 419 for (k = 0; k < 32;)
409 { 420 {
410 x1[k] = MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k])); 421 x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
411 x2[k] = MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k])); 422 x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
412 423 x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
413#ifndef FIXED_POINT 424 x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++;
414 x1[k] *= scale;
415 x2[k] *= scale;
416#else
417 x1[k] >>= 1;
418 x2[k] >>= 1;
419#endif
420 } 425 }
421 426
422 /* transform */ 427 /* transform */
423 DCT4_32(x1, x1); 428 DCT4_32(x1, x1);
424 DST4_32(x2, x2); 429 DST4_32(x2, x2);
425 430
426 for (n = 0; n < 32; n++) 431 idx0 = qmfs->v_index;
432 idx1 = qmfs->v_index + 63;
433 for (n = 0; n < 32; n+=2)
427 { 434 {
428 qmfs->v[qmfs->v_index + n] = qmfs->v[qmfs->v_index + 640 + n] = -x1[n] + x2[n]; 435 qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n ] + x2[n ]; idx0++;
429 qmfs->v[qmfs->v_index + 63 - n] = qmfs->v[qmfs->v_index + 640 + 63 - n] = x1[n] + x2[n]; 436 qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n ] + x2[n ]; idx1--;
437 qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n+1] + x2[n+1]; idx0++;
438 qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n+1] + x2[n+1]; idx1--;
430 } 439 }
431 440
432 /* calculate 32 output samples and window */ 441 /* calculate 32 output samples and window */
433 for (k = 0; k < 32; k++) 442 for (k = 0; k < 32; k++)
434 { 443 {
435 output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) + 444 idx0 = qmfs->v_index + k; idx1 = 2*k;
436 MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[64 + 2*k]) + 445 output[out++] = FAAD_SYNTHESIS_SCALE(
437 MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) + 446 MUL_F(qmfs->v[idx0 ], qmf_c[idx1 ]) +
438 MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) + 447 MUL_F(qmfs->v[idx0 + 96], qmf_c[idx1 + 64]) +
439 MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) + 448 MUL_F(qmfs->v[idx0 + 128], qmf_c[idx1 + 128]) +
440 MUL_F(qmfs->v[qmfs->v_index + 352 + k], qmf_c[320 + 2*k]) + 449 MUL_F(qmfs->v[idx0 + 224], qmf_c[idx1 + 192]) +
441 MUL_F(qmfs->v[qmfs->v_index + 384 + k], qmf_c[384 + 2*k]) + 450 MUL_F(qmfs->v[idx0 + 256], qmf_c[idx1 + 256]) +
442 MUL_F(qmfs->v[qmfs->v_index + 480 + k], qmf_c[448 + 2*k]) + 451 MUL_F(qmfs->v[idx0 + 352], qmf_c[idx1 + 320]) +
443 MUL_F(qmfs->v[qmfs->v_index + 512 + k], qmf_c[512 + 2*k]) + 452 MUL_F(qmfs->v[idx0 + 384], qmf_c[idx1 + 384]) +
444 MUL_F(qmfs->v[qmfs->v_index + 608 + k], qmf_c[576 + 2*k]); 453 MUL_F(qmfs->v[idx0 + 480], qmf_c[idx1 + 448]) +
454 MUL_F(qmfs->v[idx0 + 512], qmf_c[idx1 + 512]) +
455 MUL_F(qmfs->v[idx0 + 608], qmf_c[idx1 + 576]));
445 } 456 }
446 457
447 /* update ringbuffer index */ 458 /* update ringbuffer index */
@@ -454,31 +465,18 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
454void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], 465void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
455 real_t *output) 466 real_t *output)
456{ 467{
457// ALIGN real_t x1[64], x2[64]; 468 ALIGN real_t real1[32];
458#ifndef SBR_LOW_POWER 469 ALIGN real_t imag1[32];
459 static ALIGN real_t in_real1[32], in_imag1[32], out_real1[32], out_imag1[32]; 470 ALIGN real_t real2[32];
460 static ALIGN real_t in_real2[32], in_imag2[32], out_real2[32], out_imag2[32]; 471 ALIGN real_t imag2[32];
461#endif
462 qmf_t * pX; 472 qmf_t * pX;
463 real_t * pring_buffer_1, * pring_buffer_3; 473 real_t * p_buf_1, * p_buf_3;
464// real_t * ptemp_1, * ptemp_2;
465#ifdef PREFER_POINTERS
466 // These pointers are used if target platform has autoinc address generators
467 real_t * pring_buffer_2, * pring_buffer_4;
468 real_t * pring_buffer_5, * pring_buffer_6;
469 real_t * pring_buffer_7, * pring_buffer_8;
470 real_t * pring_buffer_9, * pring_buffer_10;
471 const real_t * pqmf_c_1, * pqmf_c_2, * pqmf_c_3, * pqmf_c_4;
472 const real_t * pqmf_c_5, * pqmf_c_6, * pqmf_c_7, * pqmf_c_8;
473 const real_t * pqmf_c_9, * pqmf_c_10;
474#endif // #ifdef PREFER_POINTERS
475#ifndef FIXED_POINT 474#ifndef FIXED_POINT
476 real_t scale = 1.f/64.f; 475 real_t scale = 1.f/64.f;
477#endif 476#endif
478 int16_t n, k, out = 0; 477 int32_t n, k, idx0, idx1, out = 0;
479 uint8_t l; 478 uint32_t l;
480 479
481
482 /* qmf subsample l */ 480 /* qmf subsample l */
483 for (l = 0; l < sbr->numTimeSlotsRate; l++) 481 for (l = 0; l < sbr->numTimeSlotsRate; l++)
484 { 482 {
@@ -487,139 +485,46 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
487 //memmove(qmfs->v + 128, qmfs->v, (1280-128)*sizeof(real_t)); 485 //memmove(qmfs->v + 128, qmfs->v, (1280-128)*sizeof(real_t));
488 486
489 /* calculate 128 samples */ 487 /* calculate 128 samples */
490#ifndef FIXED_POINT
491
492 pX = X[l]; 488 pX = X[l];
493 489 for (k = 0; k < 32; k++)
494 in_imag1[31] = scale*QMF_RE(pX[1]);
495 in_real1[0] = scale*QMF_RE(pX[0]);
496 in_imag2[31] = scale*QMF_IM(pX[63-1]);
497 in_real2[0] = scale*QMF_IM(pX[63-0]);
498 for (k = 1; k < 31; k++)
499 {
500 in_imag1[31 - k] = scale*QMF_RE(pX[2*k + 1]);
501 in_real1[ k] = scale*QMF_RE(pX[2*k ]);
502 in_imag2[31 - k] = scale*QMF_IM(pX[63 - (2*k + 1)]);
503 in_real2[ k] = scale*QMF_IM(pX[63 - (2*k )]);
504 }
505 in_imag1[0] = scale*QMF_RE(pX[63]);
506 in_real1[31] = scale*QMF_RE(pX[62]);
507 in_imag2[0] = scale*QMF_IM(pX[63-63]);
508 in_real2[31] = scale*QMF_IM(pX[63-62]);
509
510#else
511
512 pX = X[l];
513
514 in_imag1[31] = QMF_RE(pX[1]) >> 1;
515 in_real1[0] = QMF_RE(pX[0]) >> 1;
516 in_imag2[31] = QMF_IM(pX[62]) >> 1;
517 in_real2[0] = QMF_IM(pX[63]) >> 1;
518 for (k = 1; k < 31; k++)
519 { 490 {
520 in_imag1[31 - k] = QMF_RE(pX[2*k + 1]) >> 1; 491 idx0 = 2*k; idx1 = idx0+1;
521 in_real1[ k] = QMF_RE(pX[2*k ]) >> 1; 492 real1[ k] = QMF_RE(pX[idx0]); imag2[ k] = QMF_IM(pX[idx0]);
522 in_imag2[31 - k] = QMF_IM(pX[63 - (2*k + 1)]) >> 1; 493 imag1[31-k] = QMF_RE(pX[idx1]); real2[31-k] = QMF_IM(pX[idx1]);
523 in_real2[ k] = QMF_IM(pX[63 - (2*k )]) >> 1;
524 } 494 }
525 in_imag1[0] = QMF_RE(pX[63]) >> 1; 495
526 in_real1[31] = QMF_RE(pX[62]) >> 1;
527 in_imag2[0] = QMF_IM(pX[0]) >> 1;
528 in_real2[31] = QMF_IM(pX[1]) >> 1;
529
530#endif
531
532
533 // dct4_kernel is DCT_IV without reordering which is done before and after FFT 496 // dct4_kernel is DCT_IV without reordering which is done before and after FFT
534 dct4_kernel(in_real1, in_imag1, out_real1, out_imag1); 497 dct4_kernel(real1, imag1);
535 dct4_kernel(in_real2, in_imag2, out_real2, out_imag2); 498 dct4_kernel(real2, imag2);
536 499
537 500 p_buf_1 = qmfs->v + qmfs->v_index;
538 pring_buffer_1 = qmfs->v + qmfs->v_index; 501 p_buf_3 = p_buf_1 + 1280;
539 pring_buffer_3 = pring_buffer_1 + 1280;
540#ifdef PREFER_POINTERS
541 pring_buffer_2 = pring_buffer_1 + 127;
542 pring_buffer_4 = pring_buffer_1 + (1280 + 127);
543#endif // #ifdef PREFER_POINTERS
544// ptemp_1 = x1;
545// ptemp_2 = x2;
546#ifdef PREFER_POINTERS
547 for (n = 0; n < 32; n ++)
548 {
549 //real_t x1 = *ptemp_1++;
550 //real_t x2 = *ptemp_2++;
551 // pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer
552 *pring_buffer_1++ = *pring_buffer_3++ = out_real2[n] - out_real1[n];
553 *pring_buffer_2-- = *pring_buffer_4-- = out_real2[n] + out_real1[n];
554 //x1 = *ptemp_1++;
555 //x2 = *ptemp_2++;
556 *pring_buffer_1++ = *pring_buffer_3++ = out_imag2[31-n] + out_imag1[31-n];
557 *pring_buffer_2-- = *pring_buffer_4-- = out_imag2[31-n] - out_imag1[31-n];
558 }
559#else // #ifdef PREFER_POINTERS
560 502
503 idx0 = 0; idx1 = 127;
561 for (n = 0; n < 32; n++) 504 for (n = 0; n < 32; n++)
562 { 505 {
563 // pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer 506 p_buf_1[idx0] = p_buf_3[idx0] = real2[ n] - real1[ n]; idx0++;
564 pring_buffer_1[2*n] = pring_buffer_3[2*n] = out_real2[n] - out_real1[n]; 507 p_buf_1[idx1] = p_buf_3[idx1] = real2[ n] + real1[ n]; idx1--;
565 pring_buffer_1[127-2*n] = pring_buffer_3[127-2*n] = out_real2[n] + out_real1[n]; 508 p_buf_1[idx0] = p_buf_3[idx0] = imag2[31-n] + imag1[31-n]; idx0++;
566 pring_buffer_1[2*n+1] = pring_buffer_3[2*n+1] = out_imag2[31-n] + out_imag1[31-n]; 509 p_buf_1[idx1] = p_buf_3[idx1] = imag2[31-n] - imag1[31-n]; idx1--;
567 pring_buffer_1[127-(2*n+1)] = pring_buffer_3[127-(2*n+1)] = out_imag2[31-n] - out_imag1[31-n];
568 } 510 }
569 511
570#endif // #ifdef PREFER_POINTERS 512 p_buf_1 = qmfs->v + qmfs->v_index;
571
572 pring_buffer_1 = qmfs->v + qmfs->v_index;
573#ifdef PREFER_POINTERS
574 pring_buffer_2 = pring_buffer_1 + 192;
575 pring_buffer_3 = pring_buffer_1 + 256;
576 pring_buffer_4 = pring_buffer_1 + (256 + 192);
577 pring_buffer_5 = pring_buffer_1 + 512;
578 pring_buffer_6 = pring_buffer_1 + (512 + 192);
579 pring_buffer_7 = pring_buffer_1 + 768;
580 pring_buffer_8 = pring_buffer_1 + (768 + 192);
581 pring_buffer_9 = pring_buffer_1 + 1024;
582 pring_buffer_10 = pring_buffer_1 + (1024 + 192);
583 pqmf_c_1 = qmf_c;
584 pqmf_c_2 = qmf_c + 64;
585 pqmf_c_3 = qmf_c + 128;
586 pqmf_c_4 = qmf_c + 192;
587 pqmf_c_5 = qmf_c + 256;
588 pqmf_c_6 = qmf_c + 320;
589 pqmf_c_7 = qmf_c + 384;
590 pqmf_c_8 = qmf_c + 448;
591 pqmf_c_9 = qmf_c + 512;
592 pqmf_c_10 = qmf_c + 576;
593#endif // #ifdef PREFER_POINTERS
594 513
595 /* calculate 64 output samples and window */ 514 /* calculate 64 output samples and window */
596 for (k = 0; k < 64; k++) 515 for (k = 0; k < 64; k++)
597 { 516 {
598#ifdef PREFER_POINTERS 517 output[out++] = FAAD_SYNTHESIS_SCALE(
599 output[out++] = 518 MUL_F(p_buf_1[k ], qmf_c[k ]) +
600 MUL_F(*pring_buffer_1++, *pqmf_c_1++) + 519 MUL_F(p_buf_1[k+ 192 ], qmf_c[k+ 64]) +
601 MUL_F(*pring_buffer_2++, *pqmf_c_2++) + 520 MUL_F(p_buf_1[k+ 256 ], qmf_c[k+128]) +
602 MUL_F(*pring_buffer_3++, *pqmf_c_3++) + 521 MUL_F(p_buf_1[k+ 256+192], qmf_c[k+192]) +
603 MUL_F(*pring_buffer_4++, *pqmf_c_4++) + 522 MUL_F(p_buf_1[k+ 512 ], qmf_c[k+256]) +
604 MUL_F(*pring_buffer_5++, *pqmf_c_5++) + 523 MUL_F(p_buf_1[k+ 512+192], qmf_c[k+320]) +
605 MUL_F(*pring_buffer_6++, *pqmf_c_6++) + 524 MUL_F(p_buf_1[k+ 768 ], qmf_c[k+384]) +
606 MUL_F(*pring_buffer_7++, *pqmf_c_7++) + 525 MUL_F(p_buf_1[k+ 768+192], qmf_c[k+448]) +
607 MUL_F(*pring_buffer_8++, *pqmf_c_8++) + 526 MUL_F(p_buf_1[k+1024 ], qmf_c[k+512]) +
608 MUL_F(*pring_buffer_9++, *pqmf_c_9++) + 527 MUL_F(p_buf_1[k+1024+192], qmf_c[k+576]));
609 MUL_F(*pring_buffer_10++, *pqmf_c_10++);
610#else // #ifdef PREFER_POINTERS
611 output[out++] =
612 MUL_F(pring_buffer_1[k+0], qmf_c[k+0]) +
613 MUL_F(pring_buffer_1[k+192], qmf_c[k+64]) +
614 MUL_F(pring_buffer_1[k+256], qmf_c[k+128]) +
615 MUL_F(pring_buffer_1[k+(256+192)], qmf_c[k+192]) +
616 MUL_F(pring_buffer_1[k+512], qmf_c[k+256]) +
617 MUL_F(pring_buffer_1[k+(512+192)], qmf_c[k+320]) +
618 MUL_F(pring_buffer_1[k+768], qmf_c[k+384]) +
619 MUL_F(pring_buffer_1[k+(768+192)], qmf_c[k+448]) +
620 MUL_F(pring_buffer_1[k+1024], qmf_c[k+512]) +
621 MUL_F(pring_buffer_1[k+(1024+192)], qmf_c[k+576]);
622#endif // #ifdef PREFER_POINTERS
623 } 528 }
624 529
625 /* update ringbuffer index */ 530 /* update ringbuffer index */
@@ -628,6 +533,6 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][6
628 qmfs->v_index = (1280 - 128); 533 qmfs->v_index = (1280 - 128);
629 } 534 }
630} 535}
631#endif 536#endif /* #ifdef SBR_LOW_POWER */
632 537
633#endif 538#endif /* #ifdef SBR_DEC */
diff --git a/apps/codecs/libfaad/sbr_qmf_c.h b/apps/codecs/libfaad/sbr_qmf_c.h
index 19592a7ff6..150d72e1a6 100644
--- a/apps/codecs/libfaad/sbr_qmf_c.h
+++ b/apps/codecs/libfaad/sbr_qmf_c.h
@@ -38,7 +38,7 @@ extern "C" {
38#pragma warning(disable:4244) 38#pragma warning(disable:4244)
39#endif 39#endif
40 40
41ALIGN static const real_t qmf_c[640] = { 41ALIGN static const real_t qmf_c[640] ICONST_ATTR_FAAD_LARGE_IRAM = {
42 FRAC_CONST(0), FRAC_CONST(-0.00055252865047), 42 FRAC_CONST(0), FRAC_CONST(-0.00055252865047),
43 FRAC_CONST(-0.00056176925738), FRAC_CONST(-0.00049475180896), 43 FRAC_CONST(-0.00056176925738), FRAC_CONST(-0.00049475180896),
44 FRAC_CONST(-0.00048752279712), FRAC_CONST(-0.00048937912498), 44 FRAC_CONST(-0.00048752279712), FRAC_CONST(-0.00048937912498),
diff --git a/apps/codecs/libfaad/specrec.c b/apps/codecs/libfaad/specrec.c
index 74bf1f36f6..d21a923384 100644
--- a/apps/codecs/libfaad/specrec.c
+++ b/apps/codecs/libfaad/specrec.c
@@ -458,14 +458,14 @@ static INLINE real_t iquant(int16_t q, const real_t *tab, uint8_t *error)
458 if (q < 0) 458 if (q < 0)
459 { 459 {
460 /* tab contains a value for all possible q [0,8192] */ 460 /* tab contains a value for all possible q [0,8192] */
461 if (-q < IQ_TABLE_SIZE) 461 if (LIKELY(-q < IQ_TABLE_SIZE))
462 return -tab[-q]; 462 return -tab[-q];
463 463
464 *error = 17; 464 *error = 17;
465 return 0; 465 return 0;
466 } else { 466 } else {
467 /* tab contains a value for all possible q [0,8192] */ 467 /* tab contains a value for all possible q [0,8192] */
468 if (q < IQ_TABLE_SIZE) 468 if (LIKELY(q < IQ_TABLE_SIZE))
469 return tab[q]; 469 return tab[q];
470 470
471 *error = 17; 471 *error = 17;
@@ -523,17 +523,17 @@ ALIGN static const real_t pow2sf_tab[] = {
523 - Within a scalefactor window band, the coefficients are in ascending 523 - Within a scalefactor window band, the coefficients are in ascending
524 spectral order. 524 spectral order.
525*/ 525*/
526ALIGN static const real_t pow2_table[] ICONST_ATTR =
527{
528 COEF_CONST(1.0),
529 COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */
530 COEF_CONST(1.4142135623730950488016887242097), /* 2^0.50 */
531 COEF_CONST(1.6817928305074290860622509524664) /* 2^0.75 */
532};
526static uint8_t quant_to_spec(NeAACDecHandle hDecoder, 533static uint8_t quant_to_spec(NeAACDecHandle hDecoder,
527 ic_stream *ics, int16_t *quant_data, 534 ic_stream *ics, int16_t *quant_data,
528 real_t *spec_data, uint16_t frame_len) 535 real_t *spec_data, uint16_t frame_len)
529{ 536{
530 ALIGN static const real_t pow2_table[] ICONST_ATTR =
531 {
532 COEF_CONST(1.0),
533 COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */
534 COEF_CONST(1.4142135623730950488016887242097), /* 2^0.5 */
535 COEF_CONST(1.6817928305074290860622509524664) /* 2^0.75 */
536 };
537 const real_t *tab = iq_table; 537 const real_t *tab = iq_table;
538 538
539 (void)frame_len; 539 (void)frame_len;