diff options
author | Nils Wallménius <nils@rockbox.org> | 2014-01-19 16:31:59 +0100 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2014-07-13 11:12:40 +0200 |
commit | 9b7ec42403073ee887efc531c153e6b1b6c15bab (patch) | |
tree | 07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/mdct.c | |
parent | e557951c94c1efa769900257e466900f0ffeb53b (diff) | |
download | rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip |
Sync to upstream libopus
Sync to commit bb4b6885a139644cf3ac14e7deda9f633ec2d93c
This brings in a bunch of optimizations to decode speed
and memory usage. Allocations are switched from using
the pseudostack to using the real stack. Enabled hacks
to reduce stack usage.
This should fix crashes on sansa clip, although some
files will not play due to failing allocations in the
codec buffer.
Speeds up decoding of the following test files:
H300 (cf) C200 (arm7tdmi) ipod classic (arm9e)
16 kbps (silk) 14.28 MHz 4.00 MHz 2.61 MHz
64 kbps (celt) 4.09 MHz 8.08 MHz 6.24 MHz
128 kbps (celt) 1.93 MHz 8.83 MHz 6.53 MHz
Change-Id: I851733a8a5824b61feb363a173091bc7e6629b58
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/mdct.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/mdct.c | 191 |
1 files changed, 110 insertions, 81 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c index 72ea180568..7fa8eaf6bf 100644 --- a/lib/rbcodec/codecs/libopus/celt/mdct.c +++ b/lib/rbcodec/codecs/libopus/celt/mdct.c | |||
@@ -53,18 +53,20 @@ | |||
53 | #include "mathops.h" | 53 | #include "mathops.h" |
54 | #include "stack_alloc.h" | 54 | #include "stack_alloc.h" |
55 | 55 | ||
56 | #if defined(MIPSr1_ASM) | ||
57 | #include "mips/mdct_mipsr1.h" | ||
58 | #endif | ||
59 | |||
60 | |||
56 | #ifdef CUSTOM_MODES | 61 | #ifdef CUSTOM_MODES |
57 | 62 | ||
58 | int clt_mdct_init(mdct_lookup *l,int N, int maxshift) | 63 | int clt_mdct_init(mdct_lookup *l,int N, int maxshift) |
59 | { | 64 | { |
60 | int i; | 65 | int i; |
61 | int N4; | ||
62 | kiss_twiddle_scalar *trig; | 66 | kiss_twiddle_scalar *trig; |
63 | #if defined(FIXED_POINT) | 67 | int shift; |
64 | int N2=N>>1; | 68 | int N2=N>>1; |
65 | #endif | ||
66 | l->n = N; | 69 | l->n = N; |
67 | N4 = N>>2; | ||
68 | l->maxshift = maxshift; | 70 | l->maxshift = maxshift; |
69 | for (i=0;i<=maxshift;i++) | 71 | for (i=0;i<=maxshift;i++) |
70 | { | 72 | { |
@@ -77,17 +79,28 @@ int clt_mdct_init(mdct_lookup *l,int N, int maxshift) | |||
77 | return 0; | 79 | return 0; |
78 | #endif | 80 | #endif |
79 | } | 81 | } |
80 | l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar)); | 82 | l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); |
81 | if (l->trig==NULL) | 83 | if (l->trig==NULL) |
82 | return 0; | 84 | return 0; |
83 | /* We have enough points that sine isn't necessary */ | 85 | for (shift=0;shift<=maxshift;shift++) |
86 | { | ||
87 | /* We have enough points that sine isn't necessary */ | ||
84 | #if defined(FIXED_POINT) | 88 | #if defined(FIXED_POINT) |
85 | for (i=0;i<=N4;i++) | 89 | #if 1 |
86 | trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N)); | 90 | for (i=0;i<N2;i++) |
91 | trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N)); | ||
87 | #else | 92 | #else |
88 | for (i=0;i<=N4;i++) | 93 | for (i=0;i<N2;i++) |
89 | trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N); | 94 | trig[i] = (kiss_twiddle_scalar)MAX32(-32767,MIN32(32767,floor(.5+32768*cos(2*M_PI*(i+.125)/N)))); |
90 | #endif | 95 | #endif |
96 | #else | ||
97 | for (i=0;i<N2;i++) | ||
98 | trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N); | ||
99 | #endif | ||
100 | trig += N2; | ||
101 | N2 >>= 1; | ||
102 | N >>= 1; | ||
103 | } | ||
91 | return 1; | 104 | return 1; |
92 | } | 105 | } |
93 | 106 | ||
@@ -103,27 +116,37 @@ void clt_mdct_clear(mdct_lookup *l) | |||
103 | 116 | ||
104 | #if 0 | 117 | #if 0 |
105 | /* Forward MDCT trashes the input array */ | 118 | /* Forward MDCT trashes the input array */ |
119 | #ifndef OVERRIDE_clt_mdct_forward | ||
106 | void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, | 120 | void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
107 | const opus_val16 *window, int overlap, int shift, int stride) | 121 | const opus_val16 *window, int overlap, int shift, int stride) |
108 | { | 122 | { |
109 | int i; | 123 | int i; |
110 | int N, N2, N4; | 124 | int N, N2, N4; |
111 | kiss_twiddle_scalar sine; | ||
112 | VARDECL(kiss_fft_scalar, f); | 125 | VARDECL(kiss_fft_scalar, f); |
113 | VARDECL(kiss_fft_scalar, f2); | 126 | VARDECL(kiss_fft_cpx, f2); |
127 | const kiss_fft_state *st = l->kfft[shift]; | ||
128 | const kiss_twiddle_scalar *trig; | ||
129 | opus_val16 scale; | ||
130 | #ifdef FIXED_POINT | ||
131 | /* Allows us to scale with MULT16_32_Q16(), which is faster than | ||
132 | MULT16_32_Q15() on ARM. */ | ||
133 | int scale_shift = st->scale_shift-1; | ||
134 | #endif | ||
114 | SAVE_STACK; | 135 | SAVE_STACK; |
136 | scale = st->scale; | ||
137 | |||
115 | N = l->n; | 138 | N = l->n; |
116 | N >>= shift; | 139 | trig = l->trig; |
140 | for (i=0;i<shift;i++) | ||
141 | { | ||
142 | N >>= 1; | ||
143 | trig += N; | ||
144 | } | ||
117 | N2 = N>>1; | 145 | N2 = N>>1; |
118 | N4 = N>>2; | 146 | N4 = N>>2; |
147 | |||
119 | ALLOC(f, N2, kiss_fft_scalar); | 148 | ALLOC(f, N2, kiss_fft_scalar); |
120 | ALLOC(f2, N2, kiss_fft_scalar); | 149 | ALLOC(f2, N4, kiss_fft_cpx); |
121 | /* sin(x) ~= x here */ | ||
122 | #ifdef FIXED_POINT | ||
123 | sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; | ||
124 | #else | ||
125 | sine = (kiss_twiddle_scalar)2*PI*(.125f)/N; | ||
126 | #endif | ||
127 | 150 | ||
128 | /* Consider the input to be composed of four blocks: [a, b, c, d] */ | 151 | /* Consider the input to be composed of four blocks: [a, b, c, d] */ |
129 | /* Window, shuffle, fold */ | 152 | /* Window, shuffle, fold */ |
@@ -168,125 +191,131 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar | |||
168 | /* Pre-rotation */ | 191 | /* Pre-rotation */ |
169 | { | 192 | { |
170 | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 193 | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
171 | const kiss_twiddle_scalar *t = &l->trig[0]; | 194 | const kiss_twiddle_scalar *t = &trig[0]; |
172 | for(i=0;i<N4;i++) | 195 | for(i=0;i<N4;i++) |
173 | { | 196 | { |
197 | kiss_fft_cpx yc; | ||
198 | kiss_twiddle_scalar t0, t1; | ||
174 | kiss_fft_scalar re, im, yr, yi; | 199 | kiss_fft_scalar re, im, yr, yi; |
175 | re = yp[0]; | 200 | t0 = t[i]; |
176 | im = yp[1]; | 201 | t1 = t[N4+i]; |
177 | yr = -S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]); | 202 | re = *yp++; |
178 | yi = -S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]); | 203 | im = *yp++; |
179 | /* works because the cos is nearly one */ | 204 | yr = S_MUL(re,t0) - S_MUL(im,t1); |
180 | *yp++ = yr + S_MUL(yi,sine); | 205 | yi = S_MUL(im,t0) + S_MUL(re,t1); |
181 | *yp++ = yi - S_MUL(yr,sine); | 206 | yc.r = yr; |
207 | yc.i = yi; | ||
208 | yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift); | ||
209 | yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift); | ||
210 | f2[st->bitrev[i]] = yc; | ||
182 | } | 211 | } |
183 | } | 212 | } |
184 | 213 | ||
185 | /* N/4 complex FFT, down-scales by 4/N */ | 214 | /* N/4 complex FFT, does not downscale anymore */ |
186 | opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2); | 215 | opus_fft_impl(st, f2); |
187 | 216 | ||
188 | /* Post-rotate */ | 217 | /* Post-rotate */ |
189 | { | 218 | { |
190 | /* Temp pointers to make it really clear to the compiler what we're doing */ | 219 | /* Temp pointers to make it really clear to the compiler what we're doing */ |
191 | const kiss_fft_scalar * OPUS_RESTRICT fp = f2; | 220 | const kiss_fft_cpx * OPUS_RESTRICT fp = f2; |
192 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 221 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
193 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 222 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); |
194 | const kiss_twiddle_scalar *t = &l->trig[0]; | 223 | const kiss_twiddle_scalar *t = &trig[0]; |
195 | /* Temp pointers to make it really clear to the compiler what we're doing */ | 224 | /* Temp pointers to make it really clear to the compiler what we're doing */ |
196 | for(i=0;i<N4;i++) | 225 | for(i=0;i<N4;i++) |
197 | { | 226 | { |
198 | kiss_fft_scalar yr, yi; | 227 | kiss_fft_scalar yr, yi; |
199 | yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]); | 228 | yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]); |
200 | yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]); | 229 | yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]); |
201 | /* works because the cos is nearly one */ | 230 | *yp1 = yr; |
202 | *yp1 = yr - S_MUL(yi,sine); | 231 | *yp2 = yi; |
203 | *yp2 = yi + S_MUL(yr,sine);; | 232 | fp++; |
204 | fp += 2; | ||
205 | yp1 += 2*stride; | 233 | yp1 += 2*stride; |
206 | yp2 -= 2*stride; | 234 | yp2 -= 2*stride; |
207 | } | 235 | } |
208 | } | 236 | } |
209 | RESTORE_STACK; | 237 | RESTORE_STACK; |
210 | } | 238 | } |
239 | #endif /* OVERRIDE_clt_mdct_forward */ | ||
211 | #endif | 240 | #endif |
212 | 241 | ||
242 | #ifndef OVERRIDE_clt_mdct_backward | ||
213 | void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, | 243 | void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
214 | const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) | 244 | const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) |
215 | { | 245 | { |
216 | int i; | 246 | int i; |
217 | int N, N2, N4; | 247 | int N, N2, N4; |
218 | kiss_twiddle_scalar sine; | 248 | const kiss_twiddle_scalar *trig; |
219 | /* VARDECL(kiss_fft_scalar, f2); | 249 | |
220 | SAVE_STACK; */ | ||
221 | N = l->n; | 250 | N = l->n; |
222 | N >>= shift; | 251 | trig = l->trig; |
252 | for (i=0;i<shift;i++) | ||
253 | { | ||
254 | N >>= 1; | ||
255 | trig += N; | ||
256 | } | ||
223 | N2 = N>>1; | 257 | N2 = N>>1; |
224 | N4 = N>>2; | 258 | N4 = N>>2; |
225 | /* ALLOC(f2, N2, kiss_fft_scalar); */ | ||
226 | kiss_fft_scalar f2[N2]; /* worst case 3840b */ | ||
227 | /* sin(x) ~= x here */ | ||
228 | #ifdef FIXED_POINT | ||
229 | sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; | ||
230 | #else | ||
231 | sine = (kiss_twiddle_scalar)2*PI*(.125f)/N; | ||
232 | #endif | ||
233 | 259 | ||
234 | /* Pre-rotate */ | 260 | /* Pre-rotate */ |
235 | { | 261 | { |
236 | /* Temp pointers to make it really clear to the compiler what we're doing */ | 262 | /* Temp pointers to make it really clear to the compiler what we're doing */ |
237 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | 263 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; |
238 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | 264 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); |
239 | kiss_fft_scalar * OPUS_RESTRICT yp = f2; | 265 | kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); |
240 | const kiss_twiddle_scalar *t = &l->trig[0]; | 266 | const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; |
267 | const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; | ||
241 | for(i=0;i<N4;i++) | 268 | for(i=0;i<N4;i++) |
242 | { | 269 | { |
270 | int rev; | ||
243 | kiss_fft_scalar yr, yi; | 271 | kiss_fft_scalar yr, yi; |
244 | yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]); | 272 | rev = *bitrev++; |
245 | yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]); | 273 | yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]); |
246 | /* works because the cos is nearly one */ | 274 | yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]); |
247 | *yp++ = yr - S_MUL(yi,sine); | 275 | /* We swap real and imag because we use an FFT instead of an IFFT. */ |
248 | *yp++ = yi + S_MUL(yr,sine); | 276 | yp[2*rev+1] = yr; |
277 | yp[2*rev] = yi; | ||
278 | /* Storing the pre-rotation directly in the bitrev order. */ | ||
249 | xp1+=2*stride; | 279 | xp1+=2*stride; |
250 | xp2-=2*stride; | 280 | xp2-=2*stride; |
251 | } | 281 | } |
252 | } | 282 | } |
253 | 283 | ||
254 | /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ | 284 | opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1))); |
255 | opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1))); | ||
256 | 285 | ||
257 | /* Post-rotate and de-shuffle from both ends of the buffer at once to make | 286 | /* Post-rotate and de-shuffle from both ends of the buffer at once to make |
258 | it in-place. */ | 287 | it in-place. */ |
259 | { | 288 | { |
260 | kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); | 289 | kiss_fft_scalar * yp0 = out+(overlap>>1); |
261 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; | 290 | kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; |
262 | const kiss_twiddle_scalar *t = &l->trig[0]; | 291 | const kiss_twiddle_scalar *t = &trig[0]; |
263 | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the | 292 | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the |
264 | middle pair will be computed twice. */ | 293 | middle pair will be computed twice. */ |
265 | for(i=0;i<(N4+1)>>1;i++) | 294 | for(i=0;i<(N4+1)>>1;i++) |
266 | { | 295 | { |
267 | kiss_fft_scalar re, im, yr, yi; | 296 | kiss_fft_scalar re, im, yr, yi; |
268 | kiss_twiddle_scalar t0, t1; | 297 | kiss_twiddle_scalar t0, t1; |
269 | re = yp0[0]; | 298 | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
270 | im = yp0[1]; | 299 | re = yp0[1]; |
271 | t0 = t[i<<shift]; | 300 | im = yp0[0]; |
272 | t1 = t[(N4-i)<<shift]; | 301 | t0 = t[i]; |
302 | t1 = t[N4+i]; | ||
273 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 303 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
274 | yr = S_MUL(re,t0) - S_MUL(im,t1); | 304 | yr = S_MUL(re,t0) + S_MUL(im,t1); |
275 | yi = S_MUL(im,t0) + S_MUL(re,t1); | 305 | yi = S_MUL(re,t1) - S_MUL(im,t0); |
276 | re = yp1[0]; | 306 | /* We swap real and imag because we're using an FFT instead of an IFFT. */ |
277 | im = yp1[1]; | 307 | re = yp1[1]; |
278 | /* works because the cos is nearly one */ | 308 | im = yp1[0]; |
279 | yp0[0] = -(yr - S_MUL(yi,sine)); | 309 | yp0[0] = yr; |
280 | yp1[1] = yi + S_MUL(yr,sine); | 310 | yp1[1] = yi; |
281 | 311 | ||
282 | t0 = t[(N4-i-1)<<shift]; | 312 | t0 = t[(N4-i-1)]; |
283 | t1 = t[(i+1)<<shift]; | 313 | t1 = t[(N2-i-1)]; |
284 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 314 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
285 | yr = S_MUL(re,t0) - S_MUL(im,t1); | 315 | yr = S_MUL(re,t0) + S_MUL(im,t1); |
286 | yi = S_MUL(im,t0) + S_MUL(re,t1); | 316 | yi = S_MUL(re,t1) - S_MUL(im,t0); |
287 | /* works because the cos is nearly one */ | 317 | yp1[0] = yr; |
288 | yp1[0] = -(yr - S_MUL(yi,sine)); | 318 | yp0[1] = yi; |
289 | yp0[1] = yi + S_MUL(yr,sine); | ||
290 | yp0 += 2; | 319 | yp0 += 2; |
291 | yp1 -= 2; | 320 | yp1 -= 2; |
292 | } | 321 | } |
@@ -310,5 +339,5 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala | |||
310 | wp2--; | 339 | wp2--; |
311 | } | 340 | } |
312 | } | 341 | } |
313 | /* RESTORE_STACK; */ | ||
314 | } | 342 | } |
343 | #endif /* OVERRIDE_clt_mdct_backward */ | ||