diff options
author | Nils Wallménius <nils@rockbox.org> | 2013-05-20 22:25:57 +0200 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2013-08-31 08:30:51 +0200 |
commit | 580b307fd791c0997a8831bc800bba87797bfb7e (patch) | |
tree | 807846056f06fd944a750ce41217a877910ebd59 /lib/rbcodec/codecs/libopus/celt/mdct.c | |
parent | 74761b70acd96cecc0d35450dd56a98ad9ee7d3d (diff) | |
download | rockbox-580b307fd791c0997a8831bc800bba87797bfb7e.tar.gz rockbox-580b307fd791c0997a8831bc800bba87797bfb7e.zip |
Sync opus codec to upstream git
Sync opus codec to upstream commit
02fed471a4568852d6618e041c4f2af0d7730ee2 (August 30 2013)
This brings in a lot of optimizations but also makes the diff
between our codec and the upstream much smaller as most of our
optimizations have been upstreamed or superseded.
Speedups across the board for CELT mode files:
64kbps 128kbps
H300 9.82MHz 15.48MHz
c200 4.86MHz 9.63MHz
fuze v1 10.32MHz 15.92MHz
For the silk mode test file (16kbps) arm targets get a speedup
of about 2MHz while the H300 is 7.8MHz slower, likely because it's
now using the pseudostack more rather than the real stack which
is in iram. Patches to get around that are upcoming.
Change-Id: Ifecf963e461c51ac42e09dac1e91bc4bc3b12fa3
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/mdct.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/mdct.c | 147 |
1 files changed, 55 insertions, 92 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/mdct.c b/lib/rbcodec/codecs/libopus/celt/mdct.c index 0df77fd5ec..72ea180568 100644 --- a/lib/rbcodec/codecs/libopus/celt/mdct.c +++ b/lib/rbcodec/codecs/libopus/celt/mdct.c | |||
@@ -41,7 +41,7 @@ | |||
41 | 41 | ||
42 | #ifndef SKIP_CONFIG_H | 42 | #ifndef SKIP_CONFIG_H |
43 | #ifdef HAVE_CONFIG_H | 43 | #ifdef HAVE_CONFIG_H |
44 | #include "opus_config.h" | 44 | #include "config.h" |
45 | #endif | 45 | #endif |
46 | #endif | 46 | #endif |
47 | 47 | ||
@@ -110,12 +110,14 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar | |||
110 | int N, N2, N4; | 110 | int N, N2, N4; |
111 | kiss_twiddle_scalar sine; | 111 | kiss_twiddle_scalar sine; |
112 | VARDECL(kiss_fft_scalar, f); | 112 | VARDECL(kiss_fft_scalar, f); |
113 | VARDECL(kiss_fft_scalar, f2); | ||
113 | SAVE_STACK; | 114 | SAVE_STACK; |
114 | N = l->n; | 115 | N = l->n; |
115 | N >>= shift; | 116 | N >>= shift; |
116 | N2 = N>>1; | 117 | N2 = N>>1; |
117 | N4 = N>>2; | 118 | N4 = N>>2; |
118 | ALLOC(f, N2, kiss_fft_scalar); | 119 | ALLOC(f, N2, kiss_fft_scalar); |
120 | ALLOC(f2, N2, kiss_fft_scalar); | ||
119 | /* sin(x) ~= x here */ | 121 | /* sin(x) ~= x here */ |
120 | #ifdef FIXED_POINT | 122 | #ifdef FIXED_POINT |
121 | sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; | 123 | sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; |
@@ -132,7 +134,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar | |||
132 | kiss_fft_scalar * OPUS_RESTRICT yp = f; | 134 | kiss_fft_scalar * OPUS_RESTRICT yp = f; |
133 | const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); | 135 | const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); |
134 | const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; | 136 | const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; |
135 | for(i=0;i<(overlap>>2);i++) | 137 | for(i=0;i<((overlap+3)>>2);i++) |
136 | { | 138 | { |
137 | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ | 139 | /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ |
138 | *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); | 140 | *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); |
@@ -144,7 +146,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar | |||
144 | } | 146 | } |
145 | wp1 = window; | 147 | wp1 = window; |
146 | wp2 = window+overlap-1; | 148 | wp2 = window+overlap-1; |
147 | for(;i<N4-(overlap>>2);i++) | 149 | for(;i<N4-((overlap+3)>>2);i++) |
148 | { | 150 | { |
149 | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ | 151 | /* Real part arranged as a-bR, Imag part arranged as -c-dR */ |
150 | *yp++ = *xp2; | 152 | *yp++ = *xp2; |
@@ -181,12 +183,12 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar | |||
181 | } | 183 | } |
182 | 184 | ||
183 | /* N/4 complex FFT, down-scales by 4/N */ | 185 | /* N/4 complex FFT, down-scales by 4/N */ |
184 | opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)in); | 186 | opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2); |
185 | 187 | ||
186 | /* Post-rotate */ | 188 | /* Post-rotate */ |
187 | { | 189 | { |
188 | /* Temp pointers to make it really clear to the compiler what we're doing */ | 190 | /* Temp pointers to make it really clear to the compiler what we're doing */ |
189 | const kiss_fft_scalar * OPUS_RESTRICT fp = in; | 191 | const kiss_fft_scalar * OPUS_RESTRICT fp = f2; |
190 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; | 192 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
191 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); | 193 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); |
192 | const kiss_twiddle_scalar *t = &l->trig[0]; | 194 | const kiss_twiddle_scalar *t = &l->trig[0]; |
@@ -208,35 +210,20 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar | |||
208 | } | 210 | } |
209 | #endif | 211 | #endif |
210 | 212 | ||
211 | #define S_F_BUF_SIZE (1920>>1) /* N = 1920 for static modes */ | ||
212 | static kiss_fft_scalar s_f2[S_F_BUF_SIZE] IBSS_ATTR MEM_ALIGN_ATTR; | ||
213 | void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, | 213 | void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, |
214 | const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) | 214 | const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride) |
215 | { | 215 | { |
216 | int i; | 216 | int i; |
217 | int N, N2, N4; | 217 | int N, N2, N4; |
218 | int tstride = 1<<shift; | ||
219 | kiss_twiddle_scalar sine; | 218 | kiss_twiddle_scalar sine; |
220 | VARDECL(kiss_fft_scalar, f); | 219 | /* VARDECL(kiss_fft_scalar, f2); |
221 | VARDECL(kiss_fft_scalar, f2); | 220 | SAVE_STACK; */ |
222 | SAVE_STACK; | ||
223 | N = l->n; | 221 | N = l->n; |
224 | N >>= shift; | 222 | N >>= shift; |
225 | N2 = N>>1; | 223 | N2 = N>>1; |
226 | N4 = N>>2; | 224 | N4 = N>>2; |
227 | kiss_fft_scalar s_f[S_F_BUF_SIZE]; | 225 | /* ALLOC(f2, N2, kiss_fft_scalar); */ |
228 | 226 | kiss_fft_scalar f2[N2]; /* worst case 3840b */ | |
229 | if (S_F_BUF_SIZE >= N2) | ||
230 | { | ||
231 | f = s_f; | ||
232 | f2 = s_f2; | ||
233 | } | ||
234 | else | ||
235 | { | ||
236 | ALLOC(f , N2, kiss_fft_scalar); | ||
237 | ALLOC(f2, N2, kiss_fft_scalar); | ||
238 | } | ||
239 | |||
240 | /* sin(x) ~= x here */ | 227 | /* sin(x) ~= x here */ |
241 | #ifdef FIXED_POINT | 228 | #ifdef FIXED_POINT |
242 | sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; | 229 | sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N; |
@@ -250,102 +237,78 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala | |||
250 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; | 237 | const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; |
251 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); | 238 | const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); |
252 | kiss_fft_scalar * OPUS_RESTRICT yp = f2; | 239 | kiss_fft_scalar * OPUS_RESTRICT yp = f2; |
253 | const kiss_twiddle_scalar *t0 = &l->trig[0]; | 240 | const kiss_twiddle_scalar *t = &l->trig[0]; |
254 | const kiss_twiddle_scalar *t1 = &l->trig[N4<<shift]; | ||
255 | for(i=0;i<N4;i++) | 241 | for(i=0;i<N4;i++) |
256 | { | 242 | { |
257 | kiss_fft_scalar yr, yi; | 243 | kiss_fft_scalar yr, yi; |
258 | yr = -S_MUL(*xp2, *t0) + S_MUL(*xp1, *t1); | 244 | yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]); |
259 | yi = -S_MUL(*xp2, *t1) - S_MUL(*xp1, *t0); | 245 | yi = -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]); |
260 | /* works because the cos is nearly one */ | 246 | /* works because the cos is nearly one */ |
261 | *yp++ = yr - S_MUL(yi,sine); | 247 | *yp++ = yr - S_MUL(yi,sine); |
262 | *yp++ = yi + S_MUL(yr,sine); | 248 | *yp++ = yi + S_MUL(yr,sine); |
263 | xp1+=2*stride; | 249 | xp1+=2*stride; |
264 | xp2-=2*stride; | 250 | xp2-=2*stride; |
265 | t0 += tstride; | ||
266 | t1 -= tstride; | ||
267 | } | 251 | } |
268 | } | 252 | } |
269 | 253 | ||
270 | /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ | 254 | /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */ |
271 | opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)f); | 255 | opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1))); |
272 | 256 | ||
273 | /* Post-rotate */ | 257 | /* Post-rotate and de-shuffle from both ends of the buffer at once to make |
258 | it in-place. */ | ||
274 | { | 259 | { |
275 | kiss_fft_scalar * OPUS_RESTRICT fp = f; | 260 | kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); |
276 | const kiss_twiddle_scalar *t0 = &l->trig[0]; | 261 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; |
277 | const kiss_twiddle_scalar *t1 = &l->trig[N4<<shift]; | 262 | const kiss_twiddle_scalar *t = &l->trig[0]; |
278 | for(i=0;i<N4;i++) | 263 | /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the |
264 | middle pair will be computed twice. */ | ||
265 | for(i=0;i<(N4+1)>>1;i++) | ||
279 | { | 266 | { |
280 | kiss_fft_scalar re, im, yr, yi; | 267 | kiss_fft_scalar re, im, yr, yi; |
281 | re = fp[0]; | 268 | kiss_twiddle_scalar t0, t1; |
282 | im = fp[1]; | 269 | re = yp0[0]; |
270 | im = yp0[1]; | ||
271 | t0 = t[i<<shift]; | ||
272 | t1 = t[(N4-i)<<shift]; | ||
283 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ | 273 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
284 | yr = S_MUL(re, *t0) - S_MUL(im, *t1); | 274 | yr = S_MUL(re,t0) - S_MUL(im,t1); |
285 | yi = S_MUL(im, *t0) + S_MUL(re, *t1); | 275 | yi = S_MUL(im,t0) + S_MUL(re,t1); |
276 | re = yp1[0]; | ||
277 | im = yp1[1]; | ||
286 | /* works because the cos is nearly one */ | 278 | /* works because the cos is nearly one */ |
287 | *fp++ = yr - S_MUL(yi,sine); | 279 | yp0[0] = -(yr - S_MUL(yi,sine)); |
288 | *fp++ = yi + S_MUL(yr,sine); | 280 | yp1[1] = yi + S_MUL(yr,sine); |
289 | t0 += tstride; | ||
290 | t1 -= tstride; | ||
291 | } | ||
292 | } | ||
293 | /* De-shuffle the components for the middle of the window only */ | ||
294 | { | ||
295 | const kiss_fft_scalar * OPUS_RESTRICT fp1 = f; | ||
296 | const kiss_fft_scalar * OPUS_RESTRICT fp2 = f+N2-1; | ||
297 | kiss_fft_scalar * OPUS_RESTRICT yp = f2; | ||
298 | for(i = 0; i < N4; i++) | ||
299 | { | ||
300 | *yp++ =-*fp1; | ||
301 | *yp++ = *fp2; | ||
302 | fp1 += 2; | ||
303 | fp2 -= 2; | ||
304 | } | ||
305 | } | ||
306 | out -= (N2-overlap)>>1; | ||
307 | /* Mirror on both sides for TDAC */ | ||
308 | { | ||
309 | kiss_fft_scalar * OPUS_RESTRICT fp1 = f2+N4-1; | ||
310 | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+N2-1; | ||
311 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out+N4-overlap/2; | ||
312 | const opus_val16 * OPUS_RESTRICT wp1 = window; | ||
313 | const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; | ||
314 | 281 | ||
315 | i = N4-overlap/2; | 282 | t0 = t[(N4-i-1)<<shift]; |
316 | xp1 -= N4-overlap/2; | 283 | t1 = t[(i+1)<<shift]; |
317 | fp1 -= N4-overlap/2; | 284 | /* We'd scale up by 2 here, but instead it's done when mixing the windows */ |
318 | OPUS_COPY(xp1+1, fp1+1, N4-overlap/2); | 285 | yr = S_MUL(re,t0) - S_MUL(im,t1); |
319 | for(; i < N4; i++) | 286 | yi = S_MUL(im,t0) + S_MUL(re,t1); |
320 | { | 287 | /* works because the cos is nearly one */ |
321 | kiss_fft_scalar x1; | 288 | yp1[0] = -(yr - S_MUL(yi,sine)); |
322 | x1 = *fp1--; | 289 | yp0[1] = yi + S_MUL(yr,sine); |
323 | *yp1++ +=-MULT16_32_Q15(*wp1, x1); | 290 | yp0 += 2; |
324 | *xp1-- += MULT16_32_Q15(*wp2, x1); | 291 | yp1 -= 2; |
325 | wp1++; | ||
326 | wp2--; | ||
327 | } | 292 | } |
328 | } | 293 | } |
294 | |||
295 | /* Mirror on both sides for TDAC */ | ||
329 | { | 296 | { |
330 | kiss_fft_scalar * OPUS_RESTRICT fp2 = f2+N4; | 297 | kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; |
331 | kiss_fft_scalar * OPUS_RESTRICT xp2 = out+N2; | 298 | kiss_fft_scalar * OPUS_RESTRICT yp1 = out; |
332 | kiss_fft_scalar * OPUS_RESTRICT yp2 = out+N-1-(N4-overlap/2); | ||
333 | const opus_val16 * OPUS_RESTRICT wp1 = window; | 299 | const opus_val16 * OPUS_RESTRICT wp1 = window; |
334 | const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; | 300 | const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; |
335 | 301 | ||
336 | i = N4-overlap/2; | 302 | for(i = 0; i < overlap/2; i++) |
337 | OPUS_COPY(xp2, fp2, N4-overlap/2); | ||
338 | xp2 += N4-overlap/2; | ||
339 | fp2 += N4-overlap/2; | ||
340 | for(; i < N4; i++) | ||
341 | { | 303 | { |
342 | kiss_fft_scalar x2; | 304 | kiss_fft_scalar x1, x2; |
343 | x2 = *fp2++; | 305 | x1 = *xp1; |
344 | *yp2-- = MULT16_32_Q15(*wp1, x2); | 306 | x2 = *yp1; |
345 | *xp2++ = MULT16_32_Q15(*wp2, x2); | 307 | *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); |
308 | *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); | ||
346 | wp1++; | 309 | wp1++; |
347 | wp2--; | 310 | wp2--; |
348 | } | 311 | } |
349 | } | 312 | } |
350 | RESTORE_STACK; | 313 | /* RESTORE_STACK; */ |
351 | } | 314 | } |