Sync to upstream libopus

Sync to commit bb4b6885a139644cf3ac14e7deda9f633ec2d93c.

This brings in a bunch of optimizations to decode speed and memory usage. Allocations are switched from using the pseudostack to using the real stack. Hacks to reduce stack usage are enabled. This should fix crashes on the Sansa Clip, although some files will not play due to failing allocations in the codec buffer.

Speeds up decoding of the following test files:

                   H300 (cf)    C200 (arm7tdmi)    ipod classic (arm9e)
16 kbps (silk)     14.28 MHz    4.00 MHz           2.61 MHz
64 kbps (celt)      4.09 MHz    8.08 MHz           6.24 MHz
128 kbps (celt)     1.93 MHz    8.83 MHz           6.53 MHz

Change-Id: I851733a8a5824b61feb363a173091bc7e6629b58
author: Nils Wallménius <nils@rockbox.org> 2014-01-19 16:31:59 +0100
committer: Nils Wallménius <nils@rockbox.org> 2014-07-13 11:12:40 +0200
commit: 9b7ec42403073ee887efc531c153e6b1b6c15bab (patch)
tree: 07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/celt.c
parent: e557951c94c1efa769900257e466900f0ffeb53b (diff)
download: rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz
rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip
1 files changed, 73 insertions, 6 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/celt.c b/lib/rbcodec/codecs/libopus/celt/celt.c
index 3e0ce6e6a5..c0a1e0dab9 100644
--- a/lib/rbcodec/codecs/libopus/celt/celt.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt.c
@@ -54,6 +54,10 @@
 #define PACKAGE_VERSION "unknown"
 #endif
+#if defined(MIPSr1_ASM)
+#include "mips/celt_mipsr1.h"
+#endif
 int resampling_factor(opus_int32 rate)
 {
@@ -86,6 +90,63 @@ int resampling_factor(opus_int32 rate)
 }
 #ifndef OVERRIDE_COMB_FILTER_CONST
+/* This version should be faster on ARM */
+#ifdef OPUS_ARM_ASM
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, /* 5-tap comb filter centred on lag T: reads x (including x[-T-2] onwards), writes y */
+      opus_val16 g10, opus_val16 g11, opus_val16 g12) /* g10: tap at lag T; g11: taps at lags T-1 and T+1; g12: taps at lags T-2 and T+2 */
+{
+   opus_val32 x0, x1, x2, x3, x4; /* five-sample window of delayed input, each stored as SHL32(sample, 1) */
+   int i;
+   x4 = SHL32(x[-T-2], 1); /* prime the window with x[-T-2] .. x[-T+1]; x0 is filled inside the loop */
+   x3 = SHL32(x[-T-1], 1); /* NOTE(review): the shift by 1 presumably compensates for the Q16 MAC below - confirm against the macro definitions */
+   x2 = SHL32(x[-T], 1);
+   x1 = SHL32(x[-T+1], 1);
+   for (i=0;i<N-4;i+=5) /* unrolled by five: only complete groups of five outputs are produced here */
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1); /* step 1: window oldest-to-newest is x4,x3,x2,x1,x0 */
+      t = MAC16_32_Q16(x[i], g10, x2); /* centre tap x[i-T]; presumably t = x[i] + g10*x2 in fixed point - macro defined elsewhere */
+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3)); /* inner symmetric pair x[i-T+1] + x[i-T-1] */
+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4)); /* outer symmetric pair x[i-T+2] + x[i-T-2] */
+      y[i] = t;
+      x4=SHL32(x[i-T+3],1); /* step 2: overwrite the oldest slot instead of shifting the window; variable roles rotate */
+      t = MAC16_32_Q16(x[i+1], g10, x1); /* centre is now x1 */
+      t = MAC16_32_Q16(t, g11, ADD32(x0,x2));
+      t = MAC16_32_Q16(t, g12, ADD32(x4,x3));
+      y[i+1] = t;
+      x3=SHL32(x[i-T+4],1); /* step 3: centre is x0 */
+      t = MAC16_32_Q16(x[i+2], g10, x0);
+      t = MAC16_32_Q16(t, g11, ADD32(x4,x1));
+      t = MAC16_32_Q16(t, g12, ADD32(x3,x2));
+      y[i+2] = t;
+      x2=SHL32(x[i-T+5],1); /* step 4: centre is x4 */
+      t = MAC16_32_Q16(x[i+3], g10, x4);
+      t = MAC16_32_Q16(t, g11, ADD32(x3,x0));
+      t = MAC16_32_Q16(t, g12, ADD32(x2,x1));
+      y[i+3] = t;
+      x1=SHL32(x[i-T+6],1); /* step 5: centre is x3; afterwards x4..x1 hold the window for the next group, as at loop entry */
+      t = MAC16_32_Q16(x[i+4], g10, x3);
+      t = MAC16_32_Q16(t, g11, ADD32(x2,x4));
+      t = MAC16_32_Q16(t, g12, ADD32(x1,x0));
+      y[i+4] = t;
+   }
+#ifdef CUSTOM_MODES /* NOTE(review): without CUSTOM_MODES the last N % 5 outputs are never written - presumably N is always a multiple of 5 then; confirm with callers */
+   for (;i<N;i++) /* scalar tail: one output per iteration for the remaining samples */
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1);
+      t = MAC16_32_Q16(x[i], g10, x2);
+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
+      y[i] = t;
+      x4=x3; /* slide the window by one sample explicitly (no role rotation in the tail) */
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+#endif
+}
+#else
 static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
@@ -110,7 +171,9 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
 }
 #endif
+#endif
+#ifndef OVERRIDE_comb_filter
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
      const opus_val16 *window, int overlap)
@@ -131,16 +194,19 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
         OPUS_MOVE(y, x, N);
      return;
   }
-   g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
+   g00 = MULT16_16_P15(g0, gains[tapset0][0]);
-   g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
+   g01 = MULT16_16_P15(g0, gains[tapset0][1]);
-   g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
+   g02 = MULT16_16_P15(g0, gains[tapset0][2]);
-   g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
+   g10 = MULT16_16_P15(g1, gains[tapset1][0]);
-   g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
+   g11 = MULT16_16_P15(g1, gains[tapset1][1]);
-   g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+   g12 = MULT16_16_P15(g1, gains[tapset1][2]);
   x1 = x[-T1+1];
   x2 = x[-T1  ];
   x3 = x[-T1-1];
   x4 = x[-T1-2];
+   /* If the filter didn't change, we don't need the overlap */
+   if (g0==g1 && T0==T1 && tapset0==tapset1)
+      overlap=0;
   for (i=0;i<overlap;i++)
   {
      opus_val16 f;
@@ -170,6 +236,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
   /* Compute the part with the constant filter. */
   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
 }
+#endif /* OVERRIDE_comb_filter */
 const signed char tf_select_table[4][8] = {
      {0, -1, 0, -1,    0,-1, 0,-1},
author	Nils Wallménius <nils@rockbox.org>	2014-01-19 16:31:59 +0100
committer	Nils Wallménius <nils@rockbox.org>	2014-07-13 11:12:40 +0200
commit	9b7ec42403073ee887efc531c153e6b1b6c15bab (patch)
tree	07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/celt.c
parent	e557951c94c1efa769900257e466900f0ffeb53b (diff)
download	rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip

diff --git a/lib/rbcodec/codecs/libopus/celt/celt.c b/lib/rbcodec/codecs/libopus/celt/celt.c
index 3e0ce6e6a5..c0a1e0dab9 100644
--- a/lib/rbcodec/codecs/libopus/celt/celt.c
+++ b/lib/rbcodec/codecs/libopus/celt/celt.c
@@ -54,6 +54,10 @@
54	#define PACKAGE_VERSION "unknown"	54	#define PACKAGE_VERSION "unknown"
55	#endif	55	#endif
56		56
		57	#if defined(MIPSr1_ASM)
		58	#include "mips/celt_mipsr1.h"
		59	#endif
		60
57		61
58	int resampling_factor(opus_int32 rate)	62	int resampling_factor(opus_int32 rate)
59	{	63	{
@@ -86,6 +90,63 @@ int resampling_factor(opus_int32 rate)
86	}	90	}
87		91
88	#ifndef OVERRIDE_COMB_FILTER_CONST	92	#ifndef OVERRIDE_COMB_FILTER_CONST
		93	/* This version should be faster on ARM */
		94	#ifdef OPUS_ARM_ASM
		95	static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
		96	opus_val16 g10, opus_val16 g11, opus_val16 g12)
		97	{
		98	opus_val32 x0, x1, x2, x3, x4;
		99	int i;
		100	x4 = SHL32(x[-T-2], 1);
		101	x3 = SHL32(x[-T-1], 1);
		102	x2 = SHL32(x[-T], 1);
		103	x1 = SHL32(x[-T+1], 1);
		104	for (i=0;i<N-4;i+=5)
		105	{
		106	opus_val32 t;
		107	x0=SHL32(x[i-T+2],1);
		108	t = MAC16_32_Q16(x[i], g10, x2);
		109	t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
		110	t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
		111	y[i] = t;
		112	x4=SHL32(x[i-T+3],1);
		113	t = MAC16_32_Q16(x[i+1], g10, x1);
		114	t = MAC16_32_Q16(t, g11, ADD32(x0,x2));
		115	t = MAC16_32_Q16(t, g12, ADD32(x4,x3));
		116	y[i+1] = t;
		117	x3=SHL32(x[i-T+4],1);
		118	t = MAC16_32_Q16(x[i+2], g10, x0);
		119	t = MAC16_32_Q16(t, g11, ADD32(x4,x1));
		120	t = MAC16_32_Q16(t, g12, ADD32(x3,x2));
		121	y[i+2] = t;
		122	x2=SHL32(x[i-T+5],1);
		123	t = MAC16_32_Q16(x[i+3], g10, x4);
		124	t = MAC16_32_Q16(t, g11, ADD32(x3,x0));
		125	t = MAC16_32_Q16(t, g12, ADD32(x2,x1));
		126	y[i+3] = t;
		127	x1=SHL32(x[i-T+6],1);
		128	t = MAC16_32_Q16(x[i+4], g10, x3);
		129	t = MAC16_32_Q16(t, g11, ADD32(x2,x4));
		130	t = MAC16_32_Q16(t, g12, ADD32(x1,x0));
		131	y[i+4] = t;
		132	}
		133	#ifdef CUSTOM_MODES
		134	for (;i<N;i++)
		135	{
		136	opus_val32 t;
		137	x0=SHL32(x[i-T+2],1);
		138	t = MAC16_32_Q16(x[i], g10, x2);
		139	t = MAC16_32_Q16(t, g11, ADD32(x1,x3));
		140	t = MAC16_32_Q16(t, g12, ADD32(x0,x4));
		141	y[i] = t;
		142	x4=x3;
		143	x3=x2;
		144	x2=x1;
		145	x1=x0;
		146	}
		147	#endif
		148	}
		149	#else
89	static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,	150	static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
90	opus_val16 g10, opus_val16 g11, opus_val16 g12)	151	opus_val16 g10, opus_val16 g11, opus_val16 g12)
91	{	152	{
@@ -110,7 +171,9 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
110		171
111	}	172	}
112	#endif	173	#endif
		174	#endif
113		175
		176	#ifndef OVERRIDE_comb_filter
114	void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,	177	void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
115	opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,	178	opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
116	const opus_val16 *window, int overlap)	179	const opus_val16 *window, int overlap)
@@ -131,16 +194,19 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
131	OPUS_MOVE(y, x, N);	194	OPUS_MOVE(y, x, N);
132	return;	195	return;
133	}	196	}
134	g00 = MULT16_16_Q15(g0, gains[tapset0][0]);	197	g00 = MULT16_16_P15(g0, gains[tapset0][0]);
135	g01 = MULT16_16_Q15(g0, gains[tapset0][1]);	198	g01 = MULT16_16_P15(g0, gains[tapset0][1]);
136	g02 = MULT16_16_Q15(g0, gains[tapset0][2]);	199	g02 = MULT16_16_P15(g0, gains[tapset0][2]);
137	g10 = MULT16_16_Q15(g1, gains[tapset1][0]);	200	g10 = MULT16_16_P15(g1, gains[tapset1][0]);
138	g11 = MULT16_16_Q15(g1, gains[tapset1][1]);	201	g11 = MULT16_16_P15(g1, gains[tapset1][1]);
139	g12 = MULT16_16_Q15(g1, gains[tapset1][2]);	202	g12 = MULT16_16_P15(g1, gains[tapset1][2]);
140	x1 = x[-T1+1];	203	x1 = x[-T1+1];
141	x2 = x[-T1 ];	204	x2 = x[-T1 ];
142	x3 = x[-T1-1];	205	x3 = x[-T1-1];
143	x4 = x[-T1-2];	206	x4 = x[-T1-2];
		207	/* If the filter didn't change, we don't need the overlap */
		208	if (g0==g1 && T0==T1 && tapset0==tapset1)
		209	overlap=0;
144	for (i=0;i<overlap;i++)	210	for (i=0;i<overlap;i++)
145	{	211	{
146	opus_val16 f;	212	opus_val16 f;
@@ -170,6 +236,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
170	/* Compute the part with the constant filter. */	236	/* Compute the part with the constant filter. */
171	comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);	237	comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
172	}	238	}
		239	#endif /* OVERRIDE_comb_filter */
173		240
174	const signed char tf_select_table[4][8] = {	241	const signed char tf_select_table[4][8] = {
175	{0, -1, 0, -1, 0,-1, 0,-1},	242	{0, -1, 0, -1, 0,-1, 0,-1},