summaryrefslogtreecommitdiff
path: root/lib/rbcodec/codecs/libopus/celt/mdct.c
diff options
context:
space:
mode:
authorNils Wallménius <nils@rockbox.org>2014-01-19 16:31:59 +0100
committerNils Wallménius <nils@rockbox.org>2014-07-13 11:12:40 +0200
commit9b7ec42403073ee887efc531c153e6b1b6c15bab (patch)
tree07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/mdct.c
parente557951c94c1efa769900257e466900f0ffeb53b (diff)
downloadrockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz
rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip
Sync to upstream libopus
Sync to commit bb4b6885a139644cf3ac14e7deda9f633ec2d93c This brings in a bunch of optimizations to decode speed and memory usage. Allocations are switched from using the pseudostack to using the real stack. Enabled hacks to reduce stack usage. This should fix crashes on sansa clip, although some files will not play due to failing allocations in the codec buffer. Speeds up decoding of the following test files: H300 (cf) C200 (arm7tdmi) ipod classic (arm9e) 16 kbps (silk) 14.28 MHz 4.00 MHz 2.61 MHz 64 kbps (celt) 4.09 MHz 8.08 MHz 6.24 MHz 128 kbps (celt) 1.93 MHz 8.83 MHz 6.53 MHz Change-Id: I851733a8a5824b61feb363a173091bc7e6629b58
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/mdct.c')
-rw-r--r--lib/rbcodec/codecs/libopus/celt/mdct.c191
1 files changed, 110 insertions, 81 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c
index 72ea180568..7fa8eaf6bf 100644
--- a/lib/rbcodec/codecs/libopus/celt/mdct.c
+++ b/lib/rbcodec/codecs/libopus/celt/mdct.c
@@ -53,18 +53,20 @@
53#include "mathops.h" 53#include "mathops.h"
54#include "stack_alloc.h" 54#include "stack_alloc.h"
55 55
56#if defined(MIPSr1_ASM)
57#include "mips/mdct_mipsr1.h"
58#endif
59
60
56#ifdef CUSTOM_MODES 61#ifdef CUSTOM_MODES
57 62
58int clt_mdct_init(mdct_lookup *l,int N, int maxshift) 63int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
59{ 64{
60 int i; 65 int i;
61 int N4;
62 kiss_twiddle_scalar *trig; 66 kiss_twiddle_scalar *trig;
63#if defined(FIXED_POINT) 67 int shift;
64 int N2=N>>1; 68 int N2=N>>1;
65#endif
66 l->n = N; 69 l->n = N;
67 N4 = N>>2;
68 l->maxshift = maxshift; 70 l->maxshift = maxshift;
69 for (i=0;i<=maxshift;i++) 71 for (i=0;i<=maxshift;i++)
70 { 72 {
@@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
77 return 0; 79 return 0;
78#endif 80#endif
79 } 81 }
80 l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar)); 82 l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
81 if (l->trig==NULL) 83 if (l->trig==NULL)
82 return 0; 84 return 0;
83 /* We have enough points that sine isn't necessary */ 85 for (shift=0;shift<=maxshift;shift++)
86 {
87 /* We have enough points that sine isn't necessary */
84#if defined(FIXED_POINT) 88#if defined(FIXED_POINT)
85 for (i=0;i<=N4;i++) 89#if 1
86 trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N)); 90 for (i=0;i<N2;i++)
91 trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
87#else 92#else
88 for (i=0;i<=N4;i++) 93 for (i=0;i<N2;i++)
89 trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N); 94 trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N))));
90#endif 95#endif
96#else
97 for (i=0;i<N2;i++)
98 trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
99#endif
100 trig += N2;
101 N2 >>= 1;
102 N >>= 1;
103 }
91 return 1; 104 return 1;
92} 105}
93 106
@@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l)
103 116
104#if 0 117#if 0
105/* Forward MDCT trashes the input array */ 118/* Forward MDCT trashes the input array */
119#ifndef OVERRIDE_clt_mdct_forward
106void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 120void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
107 const opus_val16 *window, int overlap, int shift, int stride) 121 const opus_val16 *window, int overlap, int shift, int stride)
108{ 122{
109 int i; 123 int i;
110 int N, N2, N4; 124 int N, N2, N4;
111 kiss_twiddle_scalar sine;
112 VARDECL(kiss_fft_scalar, f); 125 VARDECL(kiss_fft_scalar, f);
113 VARDECL(kiss_fft_scalar, f2); 126 VARDECL(kiss_fft_cpx, f2);
127 const kiss_fft_state *st = l->kfft[shift];
128 const kiss_twiddle_scalar *trig;
129 opus_val16 scale;
130#ifdef FIXED_POINT
131 /* Allows us to scale with MULT16_32_Q16(), which is faster than
132 MULT16_32_Q15() on ARM. */
133 int scale_shift = st->scale_shift-1;
134#endif
114 SAVE_STACK; 135 SAVE_STACK;
136 scale = st->scale;
137
115 N = l->n; 138 N = l->n;
116 N >>= shift; 139 trig = l->trig;
140 for (i=0;i<shift;i++)
141 {
142 N >>= 1;
143 trig += N;
144 }
117 N2 = N>>1; 145 N2 = N>>1;
118 N4 = N>>2; 146 N4 = N>>2;
147
119 ALLOC(f, N2, kiss_fft_scalar); 148 ALLOC(f, N2, kiss_fft_scalar);
120 ALLOC(f2, N2, kiss_fft_scalar); 149 ALLOC(f2, N4, kiss_fft_cpx);
121 /* sin(x) ~= x here */
122#ifdef FIXED_POINT
123 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
124#else
125 sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
126#endif
127 150
128 /* Consider the input to be composed of four blocks: [a, b, c, d] */ 151 /* Consider the input to be composed of four blocks: [a, b, c, d] */
129 /* Window, shuffle, fold */ 152 /* Window, shuffle, fold */
@@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
168 /* Pre-rotation */ 191 /* Pre-rotation */
169 { 192 {
170 kiss_fft_scalar * OPUS_RESTRICT yp = f; 193 kiss_fft_scalar * OPUS_RESTRICT yp = f;
171 const kiss_twiddle_scalar *t = &l->trig[0]; 194 const kiss_twiddle_scalar *t = &trig[0];
172 for(i=0;i<N4;i++) 195 for(i=0;i<N4;i++)
173 { 196 {
197 kiss_fft_cpx yc;
198 kiss_twiddle_scalar t0, t1;
174 kiss_fft_scalar re, im, yr, yi; 199 kiss_fft_scalar re, im, yr, yi;
175 re = yp[0]; 200 t0 = t[i];
176 im = yp[1]; 201 t1 = t[N4+i];
177 yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]); 202 re = *yp++;
178 yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]); 203 im = *yp++;
179 /* works because the cos is nearly one */ 204 yr = S_MUL(re,t0) - S_MUL(im,t1);
180 *yp++ = yr + S_MUL(yi,sine); 205 yi = S_MUL(im,t0) + S_MUL(re,t1);
181 *yp++ = yi - S_MUL(yr,sine); 206 yc.r = yr;
207 yc.i = yi;
208 yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
209 yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
210 f2[st->bitrev[i]] = yc;
182 } 211 }
183 } 212 }
184 213
185 /* N/4 complex FFT, down-scales by 4/N */ 214 /* N/4 complex FFT, does not downscale anymore */
186 opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2); 215 opus_fft_impl(st, f2);
187 216
188 /* Post-rotate */ 217 /* Post-rotate */
189 { 218 {
190 /* Temp pointers to make it really clear to the compiler what we're doing */ 219 /* Temp pointers to make it really clear to the compiler what we're doing */
191 const kiss_fft_scalar * OPUS_RESTRICT fp = f2; 220 const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
192 kiss_fft_scalar * OPUS_RESTRICT yp1 = out; 221 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
193 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); 222 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
194 const kiss_twiddle_scalar *t = &l->trig[0]; 223 const kiss_twiddle_scalar *t = &trig[0];
195 /* Temp pointers to make it really clear to the compiler what we're doing */ 224 /* Temp pointers to make it really clear to the compiler what we're doing */
196 for(i=0;i<N4;i++) 225 for(i=0;i<N4;i++)
197 { 226 {
198 kiss_fft_scalar yr, yi; 227 kiss_fft_scalar yr, yi;
199 yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]); 228 yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
200 yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]); 229 yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
201 /* works because the cos is nearly one */ 230 *yp1 = yr;
202 *yp1 = yr - S_MUL(yi,sine); 231 *yp2 = yi;
203 *yp2 = yi + S_MUL(yr,sine);; 232 fp++;
204 fp += 2;
205 yp1 += 2*stride; 233 yp1 += 2*stride;
206 yp2 -= 2*stride; 234 yp2 -= 2*stride;
207 } 235 }
208 } 236 }
209 RESTORE_STACK; 237 RESTORE_STACK;
210} 238}
239#endif /* OVERRIDE_clt_mdct_forward */
211#endif 240#endif
212 241
242#ifndef OVERRIDE_clt_mdct_backward
213void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 243void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
214 const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) 244 const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
215{ 245{
216 int i; 246 int i;
217 int N, N2, N4; 247 int N, N2, N4;
218 kiss_twiddle_scalar sine; 248 const kiss_twiddle_scalar *trig;
219/* VARDECL(kiss_fft_scalar, f2); 249
220 SAVE_STACK; */
221 N = l->n; 250 N = l->n;
222 N >>= shift; 251 trig = l->trig;
252 for (i=0;i<shift;i++)
253 {
254 N >>= 1;
255 trig += N;
256 }
223 N2 = N>>1; 257 N2 = N>>1;
224 N4 = N>>2; 258 N4 = N>>2;
225/* ALLOC(f2, N2, kiss_fft_scalar); */
226 kiss_fft_scalar f2[N2]; /* worst case 3840b */
227 /* sin(x) ~= x here */
228#ifdef FIXED_POINT
229 sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
230#else
231 sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
232#endif
233 259
234 /* Pre-rotate */ 260 /* Pre-rotate */
235 { 261 {
236 /* Temp pointers to make it really clear to the compiler what we're doing */ 262 /* Temp pointers to make it really clear to the compiler what we're doing */
237 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; 263 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
238 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); 264 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
239 kiss_fft_scalar * OPUS_RESTRICT yp = f2; 265 kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
240 const kiss_twiddle_scalar *t = &l->trig[0]; 266 const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
267 const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
241 for(i=0;i<N4;i++) 268 for(i=0;i<N4;i++)
242 { 269 {
270 int rev;
243 kiss_fft_scalar yr, yi; 271 kiss_fft_scalar yr, yi;
244 yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]); 272 rev = *bitrev++;
245 yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]); 273 yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
246 /* works because the cos is nearly one */ 274 yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
247 *yp++ = yr - S_MUL(yi,sine); 275 /* We swap real and imag because we use an FFT instead of an IFFT. */
248 *yp++ = yi + S_MUL(yr,sine); 276 yp[2*rev+1] = yr;
277 yp[2*rev] = yi;
278 /* Storing the pre-rotation directly in the bitrev order. */
249 xp1+=2*stride; 279 xp1+=2*stride;
250 xp2-=2*stride; 280 xp2-=2*stride;
251 } 281 }
252 } 282 }
253 283
254 /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ 284 opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
255 opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
256 285
257 /* Post-rotate and de-shuffle from both ends of the buffer at once to make 286 /* Post-rotate and de-shuffle from both ends of the buffer at once to make
258 it in-place. */ 287 it in-place. */
259 { 288 {
260 kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); 289 kiss_fft_scalar * yp0 = out+(overlap>>1);
261 kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; 290 kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
262 const kiss_twiddle_scalar *t = &l->trig[0]; 291 const kiss_twiddle_scalar *t = &trig[0];
263 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the 292 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
264 middle pair will be computed twice. */ 293 middle pair will be computed twice. */
265 for(i=0;i<(N4+1)>>1;i++) 294 for(i=0;i<(N4+1)>>1;i++)
266 { 295 {
267 kiss_fft_scalar re, im, yr, yi; 296 kiss_fft_scalar re, im, yr, yi;
268 kiss_twiddle_scalar t0, t1; 297 kiss_twiddle_scalar t0, t1;
269 re = yp0[0]; 298 /* We swap real and imag because we're using an FFT instead of an IFFT. */
270 im = yp0[1]; 299 re = yp0[1];
271 t0 = t[i<<shift]; 300 im = yp0[0];
272 t1 = t[(N4-i)<<shift]; 301 t0 = t[i];
302 t1 = t[N4+i];
273 /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 303 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
274 yr = S_MUL(re,t0) - S_MUL(im,t1); 304 yr = S_MUL(re,t0) + S_MUL(im,t1);
275 yi = S_MUL(im,t0) + S_MUL(re,t1); 305 yi = S_MUL(re,t1) - S_MUL(im,t0);
276 re = yp1[0]; 306 /* We swap real and imag because we're using an FFT instead of an IFFT. */
277 im = yp1[1]; 307 re = yp1[1];
278 /* works because the cos is nearly one */ 308 im = yp1[0];
279 yp0[0] = -(yr - S_MUL(yi,sine)); 309 yp0[0] = yr;
280 yp1[1] = yi + S_MUL(yr,sine); 310 yp1[1] = yi;
281 311
282 t0 = t[(N4-i-1)<<shift]; 312 t0 = t[(N4-i-1)];
283 t1 = t[(i+1)<<shift]; 313 t1 = t[(N2-i-1)];
284 /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 314 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
285 yr = S_MUL(re,t0) - S_MUL(im,t1); 315 yr = S_MUL(re,t0) + S_MUL(im,t1);
286 yi = S_MUL(im,t0) + S_MUL(re,t1); 316 yi = S_MUL(re,t1) - S_MUL(im,t0);
287 /* works because the cos is nearly one */ 317 yp1[0] = yr;
288 yp1[0] = -(yr - S_MUL(yi,sine)); 318 yp0[1] = yi;
289 yp0[1] = yi + S_MUL(yr,sine);
290 yp0 += 2; 319 yp0 += 2;
291 yp1 -= 2; 320 yp1 -= 2;
292 } 321 }
@@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
310 wp2--; 339 wp2--;
311 } 340 }
312 } 341 }
313/* RESTORE_STACK; */
314} 342}
343#endif /* OVERRIDE_clt_mdct_backward */