summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libopus/celt/pitch.c
diff options
context:
space:
mode:
authorNils Wallménius <nils@rockbox.org>2013-05-20 22:25:57 +0200
committerNils Wallménius <nils@rockbox.org>2013-08-31 08:30:51 +0200
commit580b307fd791c0997a8831bc800bba87797bfb7e (patch)
tree807846056f06fd944a750ce41217a877910ebd59 /lib/rbcodec/codecs/libopus/celt/pitch.c
parent74761b70acd96cecc0d35450dd56a98ad9ee7d3d (diff)
downloadrockbox-580b307fd791c0997a8831bc800bba87797bfb7e.tar.gz
rockbox-580b307fd791c0997a8831bc800bba87797bfb7e.zip
Sync opus codec to upstream git
Sync opus codec to upstream commit 02fed471a4568852d6618e041c4f2af0d7730ee2 (August 30 2013).

This brings in a lot of optimizations but also makes the diff between our codec and the upstream much smaller, as most of our optimizations have been upstreamed or superseded.

Speedups across the board for CELT mode files:

            64kbps     128kbps
H300        9.82MHz    15.48MHz
c200        4.86MHz    9.63MHz
fuze v1     10.32MHz   15.92MHz

For the silk mode test file (16kbps), ARM targets get a speedup of about 2MHz while the H300 is 7.8MHz slower, likely because it's now using the pseudostack more rather than the real stack, which is in IRAM. Patches to get around that are upcoming.

Change-Id: Ifecf963e461c51ac42e09dac1e91bc4bc3b12fa3
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/pitch.c')
-rw-r--r--lib/rbcodec/codecs/libopus/celt/pitch.c190
1 files changed, 156 insertions, 34 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/pitch.c b/lib/rbcodec/codecs/libopus/celt/pitch.c
index 1b7efd945d..0d8be13025 100644
--- a/lib/rbcodec/codecs/libopus/celt/pitch.c
+++ b/lib/rbcodec/codecs/libopus/celt/pitch.c
@@ -32,7 +32,7 @@
32*/ 32*/
33 33
34#ifdef HAVE_CONFIG_H 34#ifdef HAVE_CONFIG_H
35#include "opus_config.h" 35#include "config.h"
36#endif 36#endif
37 37
38#include "pitch.h" 38#include "pitch.h"
@@ -77,7 +77,7 @@ static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len,
77#ifndef FIXED_POINT 77#ifndef FIXED_POINT
78 /* Considering the range of xcorr16, this should avoid both underflows 78 /* Considering the range of xcorr16, this should avoid both underflows
79 and overflows (inf) when squaring xcorr16 */ 79 and overflows (inf) when squaring xcorr16 */
80 xcorr16 *= 1e-12; 80 xcorr16 *= 1e-12f;
81#endif 81#endif
82 num = MULT16_16_Q15(xcorr16,xcorr16); 82 num = MULT16_16_Q15(xcorr16,xcorr16);
83 if (MULT16_32_Q15(num,best_den[1]) > MULT16_32_Q15(best_num[1],Syy)) 83 if (MULT16_32_Q15(num,best_den[1]) > MULT16_32_Q15(best_num[1],Syy))
@@ -102,13 +102,57 @@ static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len,
102 } 102 }
103} 103}
104 104
105static void celt_fir5(const opus_val16 *x,
106 const opus_val16 *num,
107 opus_val16 *y,
108 int N,
109 opus_val16 *mem)
110{
111 int i;
112 opus_val16 num0, num1, num2, num3, num4;
113 opus_val32 mem0, mem1, mem2, mem3, mem4;
114 num0=num[0];
115 num1=num[1];
116 num2=num[2];
117 num3=num[3];
118 num4=num[4];
119 mem0=mem[0];
120 mem1=mem[1];
121 mem2=mem[2];
122 mem3=mem[3];
123 mem4=mem[4];
124 for (i=0;i<N;i++)
125 {
126 opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
127 sum = MAC16_16(sum,num0,mem0);
128 sum = MAC16_16(sum,num1,mem1);
129 sum = MAC16_16(sum,num2,mem2);
130 sum = MAC16_16(sum,num3,mem3);
131 sum = MAC16_16(sum,num4,mem4);
132 mem4 = mem3;
133 mem3 = mem2;
134 mem2 = mem1;
135 mem1 = mem0;
136 mem0 = x[i];
137 y[i] = ROUND16(sum, SIG_SHIFT);
138 }
139 mem[0]=mem0;
140 mem[1]=mem1;
141 mem[2]=mem2;
142 mem[3]=mem3;
143 mem[4]=mem4;
144}
145
146
105void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, 147void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
106 int len, int C) 148 int len, int C)
107{ 149{
108 int i; 150 int i;
109 opus_val32 ac[5]; 151 opus_val32 ac[5];
110 opus_val16 tmp=Q15ONE; 152 opus_val16 tmp=Q15ONE;
111 opus_val16 lpc[4], mem[4]={0,0,0,0}; 153 opus_val16 lpc[4], mem[5]={0,0,0,0,0};
154 opus_val16 lpc2[5];
155 opus_val16 c1 = QCONST16(.8f,15);
112#ifdef FIXED_POINT 156#ifdef FIXED_POINT
113 int shift; 157 int shift;
114 opus_val32 maxabs = celt_maxabs32(x[0], len); 158 opus_val32 maxabs = celt_maxabs32(x[0], len);
@@ -161,14 +205,89 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
161 tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp); 205 tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp);
162 lpc[i] = MULT16_16_Q15(lpc[i], tmp); 206 lpc[i] = MULT16_16_Q15(lpc[i], tmp);
163 } 207 }
164 celt_fir(x_lp, lpc, x_lp, len>>1, 4, mem); 208 /* Add a zero */
209 lpc2[0] = lpc[0] + QCONST16(.8f,SIG_SHIFT);
210 lpc2[1] = lpc[1] + MULT16_16_Q15(c1,lpc[0]);
211 lpc2[2] = lpc[2] + MULT16_16_Q15(c1,lpc[1]);
212 lpc2[3] = lpc[3] + MULT16_16_Q15(c1,lpc[2]);
213 lpc2[4] = MULT16_16_Q15(c1,lpc[3]);
214 celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
215}
165 216
166 mem[0]=0; 217#if 0 /* This is a simple version of the pitch correlation that should work
167 lpc[0]=QCONST16(.8f,12); 218 well on DSPs like Blackfin and TI C5x/C6x */
168 celt_fir(x_lp, lpc, x_lp, len>>1, 1, mem);
169 219
220#ifdef FIXED_POINT
221opus_val32
222#else
223void
224#endif
225celt_pitch_xcorr(opus_val16 *x, opus_val16 *y, opus_val32 *xcorr, int len, int max_pitch)
226{
227 int i, j;
228#ifdef FIXED_POINT
229 opus_val32 maxcorr=1;
230#endif
231 for (i=0;i<max_pitch;i++)
232 {
233 opus_val32 sum = 0;
234 for (j=0;j<len;j++)
235 sum = MAC16_16(sum, x[j],y[i+j]);
236 xcorr[i] = sum;
237#ifdef FIXED_POINT
238 maxcorr = MAX32(maxcorr, sum);
239#endif
240 }
241#ifdef FIXED_POINT
242 return maxcorr;
243#endif
170} 244}
171 245
246#else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
247
248#ifdef FIXED_POINT
249opus_val32
250#else
251void
252#endif
253celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
254{
255 int i,j;
256#ifdef FIXED_POINT
257 opus_val32 maxcorr=1;
258#endif
259 for (i=0;i<max_pitch-3;i+=4)
260 {
261 opus_val32 sum[4]={0,0,0,0};
262 xcorr_kernel(_x, _y+i, sum, len);
263 xcorr[i]=sum[0];
264 xcorr[i+1]=sum[1];
265 xcorr[i+2]=sum[2];
266 xcorr[i+3]=sum[3];
267#ifdef FIXED_POINT
268 sum[0] = MAX32(sum[0], sum[1]);
269 sum[2] = MAX32(sum[2], sum[3]);
270 sum[0] = MAX32(sum[0], sum[2]);
271 maxcorr = MAX32(maxcorr, sum[0]);
272#endif
273 }
274 /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
275 for (;i<max_pitch;i++)
276 {
277 opus_val32 sum = 0;
278 for (j=0;j<len;j++)
279 sum = MAC16_16(sum, _x[j],_y[i+j]);
280 xcorr[i] = sum;
281#ifdef FIXED_POINT
282 maxcorr = MAX32(maxcorr, sum);
283#endif
284 }
285#ifdef FIXED_POINT
286 return maxcorr;
287#endif
288}
289
290#endif
172void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y, 291void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
173 int len, int max_pitch, int *pitch) 292 int len, int max_pitch, int *pitch)
174{ 293{
@@ -179,8 +298,8 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
179 VARDECL(opus_val16, y_lp4); 298 VARDECL(opus_val16, y_lp4);
180 VARDECL(opus_val32, xcorr); 299 VARDECL(opus_val32, xcorr);
181#ifdef FIXED_POINT 300#ifdef FIXED_POINT
182 opus_val32 maxcorr=1; 301 opus_val32 maxcorr;
183 opus_val16 xmax, ymax; 302 opus_val32 xmax, ymax;
184 int shift=0; 303 int shift=0;
185#endif 304#endif
186 int offset; 305 int offset;
@@ -204,7 +323,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
204#ifdef FIXED_POINT 323#ifdef FIXED_POINT
205 xmax = celt_maxabs16(x_lp4, len>>2); 324 xmax = celt_maxabs16(x_lp4, len>>2);
206 ymax = celt_maxabs16(y_lp4, lag>>2); 325 ymax = celt_maxabs16(y_lp4, lag>>2);
207 shift = celt_ilog2(MAX16(1, MAX16(xmax, ymax)))-11; 326 shift = celt_ilog2(MAX32(1, MAX32(xmax, ymax)))-11;
208 if (shift>0) 327 if (shift>0)
209 { 328 {
210 for (j=0;j<len>>2;j++) 329 for (j=0;j<len>>2;j++)
@@ -220,16 +339,11 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
220 339
221 /* Coarse search with 4x decimation */ 340 /* Coarse search with 4x decimation */
222 341
223 for (i=0;i<max_pitch>>2;i++)
224 {
225 opus_val32 sum = 0;
226 for (j=0;j<len>>2;j++)
227 sum = MAC16_16(sum, x_lp4[j],y_lp4[i+j]);
228 xcorr[i] = MAX32(-1, sum);
229#ifdef FIXED_POINT 342#ifdef FIXED_POINT
230 maxcorr = MAX32(maxcorr, sum); 343 maxcorr =
231#endif 344#endif
232 } 345 celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
346
233 find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch 347 find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch
234#ifdef FIXED_POINT 348#ifdef FIXED_POINT
235 , 0, maxcorr 349 , 0, maxcorr
@@ -288,11 +402,13 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
288 int k, i, T, T0; 402 int k, i, T, T0;
289 opus_val16 g, g0; 403 opus_val16 g, g0;
290 opus_val16 pg; 404 opus_val16 pg;
291 opus_val32 xy,xx,yy; 405 opus_val32 xy,xx,yy,xy2;
292 opus_val32 xcorr[3]; 406 opus_val32 xcorr[3];
293 opus_val32 best_xy, best_yy; 407 opus_val32 best_xy, best_yy;
294 int offset; 408 int offset;
295 int minperiod0; 409 int minperiod0;
410 VARDECL(opus_val32, yy_lookup);
411 SAVE_STACK;
296 412
297 minperiod0 = minperiod; 413 minperiod0 = minperiod;
298 maxperiod /= 2; 414 maxperiod /= 2;
@@ -305,13 +421,16 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
305 *T0_=maxperiod-1; 421 *T0_=maxperiod-1;
306 422
307 T = T0 = *T0_; 423 T = T0 = *T0_;
308 xx=xy=yy=0; 424 ALLOC(yy_lookup, maxperiod+1, opus_val32);
309 for (i=0;i<N;i++) 425 dual_inner_prod(x, x, x-T0, N, &xx, &xy);
426 yy_lookup[0] = xx;
427 yy=xx;
428 for (i=1;i<=maxperiod;i++)
310 { 429 {
311 xy = MAC16_16(xy, x[i], x[i-T0]); 430 yy = yy+MULT16_16(x[-i],x[-i])-MULT16_16(x[N-i],x[N-i]);
312 xx = MAC16_16(xx, x[i], x[i]); 431 yy_lookup[i] = MAX32(0, yy);
313 yy = MAC16_16(yy, x[i-T0],x[i-T0]);
314 } 432 }
433 yy = yy_lookup[T0];
315 best_xy = xy; 434 best_xy = xy;
316 best_yy = yy; 435 best_yy = yy;
317#ifdef FIXED_POINT 436#ifdef FIXED_POINT
@@ -332,6 +451,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
332 int T1, T1b; 451 int T1, T1b;
333 opus_val16 g1; 452 opus_val16 g1;
334 opus_val16 cont=0; 453 opus_val16 cont=0;
454 opus_val16 thresh;
335 T1 = (2*T0+k)/(2*k); 455 T1 = (2*T0+k)/(2*k);
336 if (T1 < minperiod) 456 if (T1 < minperiod)
337 break; 457 break;
@@ -346,15 +466,9 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
346 { 466 {
347 T1b = (2*second_check[k]*T0+k)/(2*k); 467 T1b = (2*second_check[k]*T0+k)/(2*k);
348 } 468 }
349 xy=yy=0; 469 dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
350 for (i=0;i<N;i++) 470 xy += xy2;
351 { 471 yy = yy_lookup[T1] + yy_lookup[T1b];
352 xy = MAC16_16(xy, x[i], x[i-T1]);
353 yy = MAC16_16(yy, x[i-T1], x[i-T1]);
354
355 xy = MAC16_16(xy, x[i], x[i-T1b]);
356 yy = MAC16_16(yy, x[i-T1b], x[i-T1b]);
357 }
358#ifdef FIXED_POINT 472#ifdef FIXED_POINT
359 { 473 {
360 opus_val32 x2y2; 474 opus_val32 x2y2;
@@ -373,7 +487,14 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
373 cont = HALF32(prev_gain); 487 cont = HALF32(prev_gain);
374 else 488 else
375 cont = 0; 489 cont = 0;
376 if (g1 > QCONST16(.3f,15) + MULT16_16_Q15(QCONST16(.4f,15),g0)-cont) 490 thresh = MAX16(QCONST16(.3f,15), MULT16_16_Q15(QCONST16(.7f,15),g0)-cont);
491 /* Bias against very high pitch (very short period) to avoid false-positives
492 due to short-term correlation */
493 if (T1<3*minperiod)
494 thresh = MAX16(QCONST16(.4f,15), MULT16_16_Q15(QCONST16(.85f,15),g0)-cont);
495 else if (T1<2*minperiod)
496 thresh = MAX16(QCONST16(.5f,15), MULT16_16_Q15(QCONST16(.9f,15),g0)-cont);
497 if (g1 > thresh)
377 { 498 {
378 best_xy = xy; 499 best_xy = xy;
379 best_yy = yy; 500 best_yy = yy;
@@ -407,6 +528,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
407 528
408 if (*T0_<minperiod0) 529 if (*T0_<minperiod0)
409 *T0_=minperiod0; 530 *T0_=minperiod0;
531 RESTORE_STACK;
410 return pg; 532 return pg;
411} 533}
412#endif 534#endif