summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- apps/codecs/lib/SOURCES 4
-rw-r--r-- apps/codecs/lib/asm_arm.h 13
-rw-r--r-- apps/codecs/lib/codeclib.h 2
-rw-r--r-- apps/codecs/lib/mdct.c 131
4 files changed, 116 insertions, 34 deletions
diff --git a/apps/codecs/lib/SOURCES b/apps/codecs/lib/SOURCES
index da77f97d30..438cd1fc62 100644
--- a/apps/codecs/lib/SOURCES
+++ b/apps/codecs/lib/SOURCES
@@ -4,14 +4,14 @@ fixedpoint.c
4 4
5/* OLD MDCT */ 5/* OLD MDCT */
6/* (when all other codecs are remediated this can be removed) */ 6/* (when all other codecs are remediated this can be removed) */
7mdct2.c 7/* mdct2.c */
8mdct_lookup.c 8mdct_lookup.c
9 9
10fft-ffmpeg.c 10fft-ffmpeg.c
11mdct.c 11mdct.c
12 12
13#ifdef CPU_ARM 13#ifdef CPU_ARM
14mdct_arm.S 14/*mdct_arm.S*/
15setjmp_arm.S 15setjmp_arm.S
16../../../firmware/target/arm/support-arm.S 16../../../firmware/target/arm/support-arm.S
17#endif 17#endif
diff --git a/apps/codecs/lib/asm_arm.h b/apps/codecs/lib/asm_arm.h
index 4f31f80c3e..9dcbcef755 100644
--- a/apps/codecs/lib/asm_arm.h
+++ b/apps/codecs/lib/asm_arm.h
@@ -226,14 +226,11 @@ void vect_mult_bw(int32_t *data, int32_t *window, int n)
226#define _V_CLIP_MATH 226#define _V_CLIP_MATH
227 227
228static inline int32_t CLIP_TO_15(int32_t x) { 228static inline int32_t CLIP_TO_15(int32_t x) {
229 int tmp; 229 const int32_t mask = 0xffff7fff;
230 asm volatile("subs %1, %0, #32768\n\t" 230 asm volatile("teq %0,%0,asr #31\n\t"
231 "movpl %0, #0x7f00\n\t" 231 "eorne %0,%1,%0,asr #31\n\t"
232 "orrpl %0, %0, #0xff\n" 232 : "+r"(x)
233 "adds %1, %0, #32768\n\t" 233 : "r" (mask)
234 "movmi %0, #0x8000"
235 : "+r"(x),"=r"(tmp)
236 :
237 : "cc"); 234 : "cc");
238 return(x); 235 return(x);
239} 236}
diff --git a/apps/codecs/lib/codeclib.h b/apps/codecs/lib/codeclib.h
index 817d86a6a3..32a4696b9d 100644
--- a/apps/codecs/lib/codeclib.h
+++ b/apps/codecs/lib/codeclib.h
@@ -65,7 +65,7 @@ void qsort(void *base, size_t nmemb, size_t size, int(*compar)(const void *, con
65 65
66/*MDCT library functions*/ 66/*MDCT library functions*/
67/* -1- Tremor mdct */ 67/* -1- Tremor mdct */
68extern void mdct_backward(int n, int32_t *in, int32_t *out); 68/* extern void mdct_backward(int n, int32_t *in, int32_t *out); */
69/* -2- ffmpeg fft-based mdct */ 69/* -2- ffmpeg fft-based mdct */
70extern void ff_imdct_half(unsigned int nbits, int32_t *output, const int32_t *input); 70extern void ff_imdct_half(unsigned int nbits, int32_t *output, const int32_t *input);
71extern void ff_imdct_calc(unsigned int nbits, int32_t *output, const int32_t *input); 71extern void ff_imdct_calc(unsigned int nbits, int32_t *output, const int32_t *input);
diff --git a/apps/codecs/lib/mdct.c b/apps/codecs/lib/mdct.c
index aefd553f25..9747bd14d9 100644
--- a/apps/codecs/lib/mdct.c
+++ b/apps/codecs/lib/mdct.c
@@ -72,8 +72,9 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
72 For postrotation, the factors are sin,cos(2PI*(i+1/4)/N) 72 For postrotation, the factors are sin,cos(2PI*(i+1/4)/N)
73 73
74 Therefore, prerotation can immediately reuse the same twiddles as fft 74 Therefore, prerotation can immediately reuse the same twiddles as fft
75 (for postrotation it's still a bit complex, so this is still using 75 (for postrotation it's still a bit complex, we reuse the fft trig tables
76 an mdct-local set of twiddles to do that part) 76 where we can, or a special table for N=2048, or interpolate between
77 trig tables for N>2048)
77 */ 78 */
78 const int32_t *T = sincos_lookup0; 79 const int32_t *T = sincos_lookup0;
79 const int step = 2<<(12-nbits); 80 const int step = 2<<(12-nbits);
@@ -248,25 +249,49 @@ void ff_imdct_half(unsigned int nbits, fixed32 *output, const fixed32 *input)
248 * <----input----> 249 * <----input---->
249 * <-----------output-----------> 250 * <-----------output----------->
250 * 251 *
252 * The result of ff_imdct_half is to put the 'half' imdct here
253 *
254 * N/2 N-1
255 * <--half imdct-->
256 *
257 * We want it here for the full imdct:
258 * N/4 3N/4-1
259 * <-------------->
260 *
261 * In addition we need to apply two symmetries to get the full imdct:
262 *
263 * <AAAAAA> <DDDDDD>
264 * <BBBBBB><CCCCCC>
265 *
266 * D is a reflection of C
267 * A is a reflection of B (but with sign flipped)
268 *
269 * We process the symmetries at the same time as we 'move' the half imdct
270 * from [N/2,N-1] to [N/4,3N/4-1]
271 *
272 * TODO: find a way to make ff_imdct_half put the result in [N/4..3N/4-1]
273 * This would require being able to use revtab 'inplace' (since the input
274 * and output of imdct_half would then overlap somewhat)
251 */ 275 */
252void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT; 276void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) ICODE_ATTR_TREMOR_MDCT;
277#ifndef CPU_ARM
253void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input) 278void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
254{ 279{
255 const int n = (1<<nbits); 280 const int n = (1<<nbits);
256 const int n2 = (n>>1); 281 const int n2 = (n>>1);
257 const int n4 = (n>>2); 282 const int n4 = (n>>2);
258 283
284 /* tell imdct_half to put the output in [N/2..N-1] i.e. output+n2 */
259 ff_imdct_half(nbits,output+n2,input); 285 ff_imdct_half(nbits,output+n2,input);
260 286
261 /* reflect the half imdct into the full N samples */
262 /* TODO: this could easily be optimised more! */
263 fixed32 * in_r, * in_r2, * out_r, * out_r2; 287 fixed32 * in_r, * in_r2, * out_r, * out_r2;
264 288
289 /* Copy BBBB to AAAA, reflected and sign-flipped.
290 Also copy BBBB to its correct destination (from [N/2..3N/4-1] to [N/4..N/2-1]) */
265 out_r = output; 291 out_r = output;
266 out_r2 = output+n2-8; 292 out_r2 = output+n2-8;
267 in_r = output+n2+n4-8; 293 in_r = output+n2+n4-8;
268 while(out_r<out_r2) 294 while(out_r<out_r2)
269 {
270 out_r[0] = -(out_r2[7] = in_r[7]); 295 out_r[0] = -(out_r2[7] = in_r[7]);
271 out_r[1] = -(out_r2[6] = in_r[6]); 296 out_r[1] = -(out_r2[6] = in_r[6]);
272 out_r[2] = -(out_r2[5] = in_r[5]); 297 out_r[2] = -(out_r2[5] = in_r[5]);
@@ -279,7 +304,6 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
279 out_r += 8; 304 out_r += 8;
280 out_r2 -= 8; 305 out_r2 -= 8;
281 } 306 }
282
283 in_r = output + n2+n4; 307 in_r = output + n2+n4;
284 in_r2 = output + n-4; 308 in_r2 = output + n-4;
285 out_r = output + n2; 309 out_r = output + n2;
@@ -289,28 +313,30 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
289 register fixed32 t0,t1,t2,t3; 313 register fixed32 t0,t1,t2,t3;
290 register fixed32 s0,s1,s2,s3; 314 register fixed32 s0,s1,s2,s3;
291 315
292 //simultaneously do the following things: 316 /* Copy and reflect CCCC to DDDD. Because CCCC is already where
293 // 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1] 317 we actually want to put DDDD this is a bit complicated.
294 // 2. reflect range from [n2+n4 .. n-1] inplace 318 * So simultaneously do the following things:
295 // 319 * 1. copy range from [n2+n4 .. n-1] to range[n2 .. n2+n4-1]
296 // [ | ] 320 * 2. reflect range from [n2+n4 .. n-1] inplace
297 // ^a -> <- ^b ^c -> <- ^d 321 *
298 // 322 * [ | ]
299 // #1: copy from ^c to ^a 323 * ^a -> <- ^b ^c -> <- ^d
300 // #2: copy from ^d to ^b 324 *
301 // #3: swap ^c and ^d in place 325 * #1: copy from ^c to ^a
302 // 326 * #2: copy from ^d to ^b
303 // #1 pt1 : load 4 words from ^c. 327 * #3: swap ^c and ^d in place
328 */
329 /* #1 pt1 : load 4 words from ^c. */
304 t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3]; 330 t0=in_r[0]; t1=in_r[1]; t2=in_r[2]; t3=in_r[3];
305 // #1 pt2 : write to ^a 331 /* #1 pt2 : write to ^a */
306 out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3; 332 out_r[0]=t0;out_r[1]=t1;out_r[2]=t2;out_r[3]=t3;
307 // #2 pt1 : load 4 words from ^d 333 /* #2 pt1 : load 4 words from ^d */
308 s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3]; 334 s0=in_r2[0];s1=in_r2[1];s2=in_r2[2];s3=in_r2[3];
309 // #2 pt2 : write to ^b 335 /* #2 pt2 : write to ^b */
310 out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3; 336 out_r2[0]=s0;out_r2[1]=s1;out_r2[2]=s2;out_r2[3]=s3;
311 // #3 pt1 : write words from #2 to ^c 337 /* #3 pt1 : write words from #2 to ^c */
312 in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0; 338 in_r[0]=s3;in_r[1]=s2;in_r[2]=s1;in_r[3]=s0;
313 // #3 pt2 : write words from #1 to ^d 339 /* #3 pt2 : write words from #1 to ^d */
314 in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0; 340 in_r2[0]=t3;in_r2[1]=t2;in_r2[2]=t1;in_r2[3]=t0;
315 341
316 in_r += 4; 342 in_r += 4;
@@ -319,6 +345,65 @@ void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
319 out_r2 -= 4; 345 out_r2 -= 4;
320 } 346 }
321} 347}
348#else
349/* Follows the same structure as the canonical version above */
350void ff_imdct_calc(unsigned int nbits, fixed32 *output, const fixed32 *input)
351{
352 const int n = (1<<nbits);
353 const int n2 = (n>>1);
354 const int n4 = (n>>2);
355
356 ff_imdct_half(nbits,output+n2,input);
357
358 fixed32 * in_r, * in_r2, * out_r, * out_r2;
359
360 out_r = output;
361 out_r2 = output+n2;
362 in_r = output+n2+n4;
363 while(out_r<out_r2)
364 {
365 asm volatile(
366 "ldmdb %[in_r]!, {r0-r7}\n\t"
367 "stmdb %[out_r2]!, {r0-r7}\n\t"
368 "rsb r8,r0,#0\n\t"
369 "rsb r0,r7,#0\n\t"
370 "rsb r7,r1,#0\n\t"
371 "rsb r1,r6,#0\n\t"
372 "rsb r6,r2,#0\n\t"
373 "rsb r2,r5,#0\n\t"
374 "rsb r5,r3,#0\n\t"
375 "rsb r3,r4,#0\n\t"
376 "stmia %[out_r]!, {r0-r3,r5-r8}\n\t"
377 : [in_r] "+r" (in_r), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
378 :
379 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" );
380 }
381 in_r = output + n2+n4;
382 in_r2 = output + n;
383 out_r = output + n2;
384 out_r2 = output + n2 + n4;
385 while(in_r<in_r2)
386 {
387 asm volatile(
388 "ldmia %[in_r], {r0-r3}\n\t"
389 "stmia %[out_r]!, {r0-r3}\n\t"
390 "ldmdb %[in_r2], {r5-r8}\n\t"
391 "stmdb %[out_r2]!, {r5-r8}\n\t"
392 "mov r4,r0\n\t"
393 "mov r0,r3\n\t"
394 "mov r3,r1\n\t"
395 "stmdb %[in_r2]!, {r0,r2,r3,r4}\n\t"
396 "mov r4,r8\n\t"
397 "mov r8,r5\n\t"
398 "mov r5,r7\n\t"
399 "stmia %[in_r]!, {r4,r5,r6,r8}\n\t"
400 :
401 [in_r] "+r" (in_r), [in_r2] "+r" (in_r2), [out_r] "+r" (out_r), [out_r2] "+r" (out_r2)
402 :
403 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" );
404 }
405}
406#endif
322 407
323static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */ 408static const long cordic_circular_gain = 0xb2458939; /* 0.607252929 */
324 409