diff options
author | Nils Wallménius <nils@rockbox.org> | 2014-01-19 16:31:59 +0100 |
---|---|---|
committer | Nils Wallménius <nils@rockbox.org> | 2014-07-13 11:12:40 +0200 |
commit | 9b7ec42403073ee887efc531c153e6b1b6c15bab (patch) | |
tree | 07e72fe9d817c65a6fede22955344a870842d5e6 /lib/rbcodec/codecs/libopus/celt/kiss_fft.c | |
parent | e557951c94c1efa769900257e466900f0ffeb53b (diff) | |
download | rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.tar.gz rockbox-9b7ec42403073ee887efc531c153e6b1b6c15bab.zip |
Sync to upstream libopus
Sync to commit bb4b6885a139644cf3ac14e7deda9f633ec2d93c
This brings in a bunch of optimizations to decode speed
and memory usage. Allocations are switched from using
the pseudostack to using the real stack. Enabled hacks
to reduce stack usage.
This should fix crashes on sansa clip, although some
files will not play due to failing allocations in the
codec buffer.
Speeds up decoding of the following test files:
H300 (cf) C200 (arm7tdmi) ipod classic (arm9e)
16 kbps (silk) 14.28 MHz 4.00 MHz 2.61 MHz
64 kbps (celt) 4.09 MHz 8.08 MHz 6.24 MHz
128 kbps (celt) 1.93 MHz 8.83 MHz 6.53 MHz
Change-Id: I851733a8a5824b61feb363a173091bc7e6629b58
Diffstat (limited to 'lib/rbcodec/codecs/libopus/celt/kiss_fft.c')
-rw-r--r-- | lib/rbcodec/codecs/libopus/celt/kiss_fft.c | 482 |
1 files changed, 173 insertions, 309 deletions
diff --git a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c index e2b8f3b3da..833ef5a71f 100644 --- a/lib/rbcodec/codecs/libopus/celt/kiss_fft.c +++ b/lib/rbcodec/codecs/libopus/celt/kiss_fft.c | |||
@@ -45,73 +45,62 @@ | |||
45 | complex numbers. It also delares the kf_ internal functions. | 45 | complex numbers. It also delares the kf_ internal functions. |
46 | */ | 46 | */ |
47 | 47 | ||
48 | #if 0 | ||
49 | static void kf_bfly2( | 48 | static void kf_bfly2( |
50 | kiss_fft_cpx * Fout, | 49 | kiss_fft_cpx * Fout, |
51 | const size_t fstride, | ||
52 | const kiss_fft_state *st, | ||
53 | int m, | 50 | int m, |
54 | int N, | 51 | int N |
55 | int mm | ||
56 | ) | 52 | ) |
57 | { | 53 | { |
58 | kiss_fft_cpx * Fout2; | 54 | kiss_fft_cpx * Fout2; |
59 | const kiss_twiddle_cpx * tw1; | 55 | int i; |
60 | int i,j; | 56 | (void)m; |
61 | kiss_fft_cpx * Fout_beg = Fout; | 57 | #ifdef CUSTOM_MODES |
62 | for (i=0;i<N;i++) | 58 | if (m==1) |
63 | { | 59 | { |
64 | Fout = Fout_beg + i*mm; | 60 | celt_assert(m==1); |
65 | Fout2 = Fout + m; | 61 | for (i=0;i<N;i++) |
66 | tw1 = st->twiddles; | ||
67 | for(j=0;j<m;j++) | ||
68 | { | 62 | { |
69 | kiss_fft_cpx t; | 63 | kiss_fft_cpx t; |
70 | Fout->r = SHR32(Fout->r, 1);Fout->i = SHR32(Fout->i, 1); | 64 | Fout2 = Fout + 1; |
71 | Fout2->r = SHR32(Fout2->r, 1);Fout2->i = SHR32(Fout2->i, 1); | 65 | t = *Fout2; |
72 | C_MUL (t, *Fout2 , *tw1); | ||
73 | tw1 += fstride; | ||
74 | C_SUB( *Fout2 , *Fout , t ); | 66 | C_SUB( *Fout2 , *Fout , t ); |
75 | C_ADDTO( *Fout , t ); | 67 | C_ADDTO( *Fout , t ); |
76 | ++Fout2; | 68 | Fout += 2; |
77 | ++Fout; | ||
78 | } | 69 | } |
79 | } | 70 | } else |
80 | } | ||
81 | #endif | 71 | #endif |
82 | |||
83 | static void ki_bfly2( | ||
84 | kiss_fft_cpx * Fout, | ||
85 | const size_t fstride, | ||
86 | const kiss_fft_state *st, | ||
87 | int m, | ||
88 | int N, | ||
89 | int mm | ||
90 | ) | ||
91 | { | ||
92 | kiss_fft_cpx * Fout2; | ||
93 | const kiss_twiddle_cpx * tw1; | ||
94 | kiss_fft_cpx t; | ||
95 | int i,j; | ||
96 | kiss_fft_cpx * Fout_beg = Fout; | ||
97 | for (i=0;i<N;i++) | ||
98 | { | 72 | { |
99 | Fout = Fout_beg + i*mm; | 73 | opus_val16 tw; |
100 | Fout2 = Fout + m; | 74 | tw = QCONST16(0.7071067812f, 15); |
101 | tw1 = st->twiddles; | 75 | /* We know that m==4 here because the radix-2 is just after a radix-4 */ |
102 | for(j=0;j<m;j++) | 76 | celt_assert(m==4); |
77 | for (i=0;i<N;i++) | ||
103 | { | 78 | { |
104 | C_MULC (t, *Fout2 , *tw1); | 79 | kiss_fft_cpx t; |
105 | tw1 += fstride; | 80 | Fout2 = Fout + 4; |
106 | C_SUB( *Fout2 , *Fout , t ); | 81 | t = Fout2[0]; |
107 | C_ADDTO( *Fout , t ); | 82 | C_SUB( Fout2[0] , Fout[0] , t ); |
108 | ++Fout2; | 83 | C_ADDTO( Fout[0] , t ); |
109 | ++Fout; | 84 | |
85 | t.r = S_MUL(Fout2[1].r+Fout2[1].i, tw); | ||
86 | t.i = S_MUL(Fout2[1].i-Fout2[1].r, tw); | ||
87 | C_SUB( Fout2[1] , Fout[1] , t ); | ||
88 | C_ADDTO( Fout[1] , t ); | ||
89 | |||
90 | t.r = Fout2[2].i; | ||
91 | t.i = -Fout2[2].r; | ||
92 | C_SUB( Fout2[2] , Fout[2] , t ); | ||
93 | C_ADDTO( Fout[2] , t ); | ||
94 | |||
95 | t.r = S_MUL(Fout2[3].i-Fout2[3].r, tw); | ||
96 | t.i = S_MUL(-Fout2[3].i-Fout2[3].r, tw); | ||
97 | C_SUB( Fout2[3] , Fout[3] , t ); | ||
98 | C_ADDTO( Fout[3] , t ); | ||
99 | Fout += 8; | ||
110 | } | 100 | } |
111 | } | 101 | } |
112 | } | 102 | } |
113 | 103 | ||
114 | #if 0 | ||
115 | static void kf_bfly4( | 104 | static void kf_bfly4( |
116 | kiss_fft_cpx * Fout, | 105 | kiss_fft_cpx * Fout, |
117 | const size_t fstride, | 106 | const size_t fstride, |
@@ -121,93 +110,69 @@ static void kf_bfly4( | |||
121 | int mm | 110 | int mm |
122 | ) | 111 | ) |
123 | { | 112 | { |
124 | const kiss_twiddle_cpx *tw1,*tw2,*tw3; | 113 | int i; |
125 | kiss_fft_cpx scratch[6]; | ||
126 | const size_t m2=2*m; | ||
127 | const size_t m3=3*m; | ||
128 | int i, j; | ||
129 | 114 | ||
130 | kiss_fft_cpx * Fout_beg = Fout; | 115 | if (m==1) |
131 | for (i=0;i<N;i++) | ||
132 | { | 116 | { |
133 | Fout = Fout_beg + i*mm; | 117 | /* Degenerate case where all the twiddles are 1. */ |
134 | tw3 = tw2 = tw1 = st->twiddles; | 118 | for (i=0;i<N;i++) |
135 | for (j=0;j<m;j++) | ||
136 | { | 119 | { |
137 | C_MUL4(scratch[0],Fout[m] , *tw1 ); | 120 | kiss_fft_cpx scratch0, scratch1; |
138 | C_MUL4(scratch[1],Fout[m2] , *tw2 ); | 121 | |
139 | C_MUL4(scratch[2],Fout[m3] , *tw3 ); | 122 | C_SUB( scratch0 , *Fout, Fout[2] ); |
140 | 123 | C_ADDTO(*Fout, Fout[2]); | |
141 | Fout->r = PSHR32(Fout->r, 2); | 124 | C_ADD( scratch1 , Fout[1] , Fout[3] ); |
142 | Fout->i = PSHR32(Fout->i, 2); | 125 | C_SUB( Fout[2], *Fout, scratch1 ); |
143 | C_SUB( scratch[5] , *Fout, scratch[1] ); | 126 | C_ADDTO( *Fout , scratch1 ); |
144 | C_ADDTO(*Fout, scratch[1]); | 127 | C_SUB( scratch1 , Fout[1] , Fout[3] ); |
145 | C_ADD( scratch[3] , scratch[0] , scratch[2] ); | 128 | |
146 | C_SUB( scratch[4] , scratch[0] , scratch[2] ); | 129 | Fout[1].r = scratch0.r + scratch1.i; |
147 | C_SUB( Fout[m2], *Fout, scratch[3] ); | 130 | Fout[1].i = scratch0.i - scratch1.r; |
148 | tw1 += fstride; | 131 | Fout[3].r = scratch0.r - scratch1.i; |
149 | tw2 += fstride*2; | 132 | Fout[3].i = scratch0.i + scratch1.r; |
150 | tw3 += fstride*3; | 133 | Fout+=4; |
151 | C_ADDTO( *Fout , scratch[3] ); | ||
152 | |||
153 | Fout[m].r = scratch[5].r + scratch[4].i; | ||
154 | Fout[m].i = scratch[5].i - scratch[4].r; | ||
155 | Fout[m3].r = scratch[5].r - scratch[4].i; | ||
156 | Fout[m3].i = scratch[5].i + scratch[4].r; | ||
157 | ++Fout; | ||
158 | } | 134 | } |
159 | } | 135 | } else { |
160 | } | 136 | int j; |
161 | #endif | 137 | kiss_fft_cpx scratch[6]; |
162 | 138 | const kiss_twiddle_cpx *tw1,*tw2,*tw3; | |
163 | static void ki_bfly4( | 139 | const int m2=2*m; |
164 | kiss_fft_cpx * Fout, | 140 | const int m3=3*m; |
165 | const size_t fstride, | 141 | kiss_fft_cpx * Fout_beg = Fout; |
166 | const kiss_fft_state *st, | 142 | for (i=0;i<N;i++) |
167 | int m, | ||
168 | int N, | ||
169 | int mm | ||
170 | ) | ||
171 | { | ||
172 | const kiss_twiddle_cpx *tw1,*tw2,*tw3; | ||
173 | kiss_fft_cpx scratch[6]; | ||
174 | const size_t m2=2*m; | ||
175 | const size_t m3=3*m; | ||
176 | int i, j; | ||
177 | |||
178 | kiss_fft_cpx * Fout_beg = Fout; | ||
179 | for (i=0;i<N;i++) | ||
180 | { | ||
181 | Fout = Fout_beg + i*mm; | ||
182 | tw3 = tw2 = tw1 = st->twiddles; | ||
183 | for (j=0;j<m;j++) | ||
184 | { | 143 | { |
185 | C_MULC(scratch[0],Fout[m] , *tw1 ); | 144 | Fout = Fout_beg + i*mm; |
186 | C_MULC(scratch[1],Fout[m2] , *tw2 ); | 145 | tw3 = tw2 = tw1 = st->twiddles; |
187 | C_MULC(scratch[2],Fout[m3] , *tw3 ); | 146 | /* m is guaranteed to be a multiple of 4. */ |
188 | 147 | for (j=0;j<m;j++) | |
189 | C_SUB( scratch[5] , *Fout, scratch[1] ); | 148 | { |
190 | C_ADDTO(*Fout, scratch[1]); | 149 | C_MUL(scratch[0],Fout[m] , *tw1 ); |
191 | C_ADD( scratch[3] , scratch[0] , scratch[2] ); | 150 | C_MUL(scratch[1],Fout[m2] , *tw2 ); |
192 | C_SUB( scratch[4] , scratch[0] , scratch[2] ); | 151 | C_MUL(scratch[2],Fout[m3] , *tw3 ); |
193 | C_SUB( Fout[m2], *Fout, scratch[3] ); | 152 | |
194 | tw1 += fstride; | 153 | C_SUB( scratch[5] , *Fout, scratch[1] ); |
195 | tw2 += fstride*2; | 154 | C_ADDTO(*Fout, scratch[1]); |
196 | tw3 += fstride*3; | 155 | C_ADD( scratch[3] , scratch[0] , scratch[2] ); |
197 | C_ADDTO( *Fout , scratch[3] ); | 156 | C_SUB( scratch[4] , scratch[0] , scratch[2] ); |
198 | 157 | C_SUB( Fout[m2], *Fout, scratch[3] ); | |
199 | Fout[m].r = scratch[5].r - scratch[4].i; | 158 | tw1 += fstride; |
200 | Fout[m].i = scratch[5].i + scratch[4].r; | 159 | tw2 += fstride*2; |
201 | Fout[m3].r = scratch[5].r + scratch[4].i; | 160 | tw3 += fstride*3; |
202 | Fout[m3].i = scratch[5].i - scratch[4].r; | 161 | C_ADDTO( *Fout , scratch[3] ); |
203 | ++Fout; | 162 | |
163 | Fout[m].r = scratch[5].r + scratch[4].i; | ||
164 | Fout[m].i = scratch[5].i - scratch[4].r; | ||
165 | Fout[m3].r = scratch[5].r - scratch[4].i; | ||
166 | Fout[m3].i = scratch[5].i + scratch[4].r; | ||
167 | ++Fout; | ||
168 | } | ||
204 | } | 169 | } |
205 | } | 170 | } |
206 | } | 171 | } |
207 | 172 | ||
173 | |||
208 | #ifndef RADIX_TWO_ONLY | 174 | #ifndef RADIX_TWO_ONLY |
209 | 175 | ||
210 | #if 0 | ||
211 | static void kf_bfly3( | 176 | static void kf_bfly3( |
212 | kiss_fft_cpx * Fout, | 177 | kiss_fft_cpx * Fout, |
213 | const size_t fstride, | 178 | const size_t fstride, |
@@ -225,14 +190,19 @@ static void kf_bfly3( | |||
225 | kiss_twiddle_cpx epi3; | 190 | kiss_twiddle_cpx epi3; |
226 | 191 | ||
227 | kiss_fft_cpx * Fout_beg = Fout; | 192 | kiss_fft_cpx * Fout_beg = Fout; |
193 | #ifdef FIXED_POINT | ||
194 | epi3.r = -16384; | ||
195 | epi3.i = -28378; | ||
196 | #else | ||
228 | epi3 = st->twiddles[fstride*m]; | 197 | epi3 = st->twiddles[fstride*m]; |
198 | #endif | ||
229 | for (i=0;i<N;i++) | 199 | for (i=0;i<N;i++) |
230 | { | 200 | { |
231 | Fout = Fout_beg + i*mm; | 201 | Fout = Fout_beg + i*mm; |
232 | tw1=tw2=st->twiddles; | 202 | tw1=tw2=st->twiddles; |
203 | /* For non-custom modes, m is guaranteed to be a multiple of 4. */ | ||
233 | k=m; | 204 | k=m; |
234 | do { | 205 | do { |
235 | C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3); | ||
236 | 206 | ||
237 | C_MUL(scratch[1],Fout[m] , *tw1); | 207 | C_MUL(scratch[1],Fout[m] , *tw1); |
238 | C_MUL(scratch[2],Fout[m2] , *tw2); | 208 | C_MUL(scratch[2],Fout[m2] , *tw2); |
@@ -259,59 +229,9 @@ static void kf_bfly3( | |||
259 | } while(--k); | 229 | } while(--k); |
260 | } | 230 | } |
261 | } | 231 | } |
262 | #endif | ||
263 | |||
264 | static void ki_bfly3( | ||
265 | kiss_fft_cpx * Fout, | ||
266 | const size_t fstride, | ||
267 | const kiss_fft_state *st, | ||
268 | int m, | ||
269 | int N, | ||
270 | int mm | ||
271 | ) | ||
272 | { | ||
273 | int i, k; | ||
274 | const size_t m2 = 2*m; | ||
275 | const kiss_twiddle_cpx *tw1,*tw2; | ||
276 | kiss_fft_cpx scratch[5]; | ||
277 | kiss_twiddle_cpx epi3; | ||
278 | |||
279 | kiss_fft_cpx * Fout_beg = Fout; | ||
280 | epi3 = st->twiddles[fstride*m]; | ||
281 | for (i=0;i<N;i++) | ||
282 | { | ||
283 | Fout = Fout_beg + i*mm; | ||
284 | tw1=tw2=st->twiddles; | ||
285 | k=m; | ||
286 | do{ | ||
287 | |||
288 | C_MULC(scratch[1],Fout[m] , *tw1); | ||
289 | C_MULC(scratch[2],Fout[m2] , *tw2); | ||
290 | |||
291 | C_ADD(scratch[3],scratch[1],scratch[2]); | ||
292 | C_SUB(scratch[0],scratch[1],scratch[2]); | ||
293 | tw1 += fstride; | ||
294 | tw2 += fstride*2; | ||
295 | |||
296 | Fout[m].r = Fout->r - HALF_OF(scratch[3].r); | ||
297 | Fout[m].i = Fout->i - HALF_OF(scratch[3].i); | ||
298 | |||
299 | C_MULBYSCALAR( scratch[0] , -epi3.i ); | ||
300 | |||
301 | C_ADDTO(*Fout,scratch[3]); | ||
302 | |||
303 | Fout[m2].r = Fout[m].r + scratch[0].i; | ||
304 | Fout[m2].i = Fout[m].i - scratch[0].r; | ||
305 | |||
306 | Fout[m].r -= scratch[0].i; | ||
307 | Fout[m].i += scratch[0].r; | ||
308 | 232 | ||
309 | ++Fout; | ||
310 | }while(--k); | ||
311 | } | ||
312 | } | ||
313 | 233 | ||
314 | #if 0 | 234 | #ifndef OVERRIDE_kf_bfly5 |
315 | static void kf_bfly5( | 235 | static void kf_bfly5( |
316 | kiss_fft_cpx * Fout, | 236 | kiss_fft_cpx * Fout, |
317 | const size_t fstride, | 237 | const size_t fstride, |
@@ -324,13 +244,19 @@ static void kf_bfly5( | |||
324 | kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; | 244 | kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; |
325 | int i, u; | 245 | int i, u; |
326 | kiss_fft_cpx scratch[13]; | 246 | kiss_fft_cpx scratch[13]; |
327 | const kiss_twiddle_cpx * twiddles = st->twiddles; | ||
328 | const kiss_twiddle_cpx *tw; | 247 | const kiss_twiddle_cpx *tw; |
329 | kiss_twiddle_cpx ya,yb; | 248 | kiss_twiddle_cpx ya,yb; |
330 | kiss_fft_cpx * Fout_beg = Fout; | 249 | kiss_fft_cpx * Fout_beg = Fout; |
331 | 250 | ||
332 | ya = twiddles[fstride*m]; | 251 | #ifdef FIXED_POINT |
333 | yb = twiddles[fstride*2*m]; | 252 | ya.r = 10126; |
253 | ya.i = -31164; | ||
254 | yb.r = -26510; | ||
255 | yb.i = -19261; | ||
256 | #else | ||
257 | ya = st->twiddles[fstride*m]; | ||
258 | yb = st->twiddles[fstride*2*m]; | ||
259 | #endif | ||
334 | tw=st->twiddles; | 260 | tw=st->twiddles; |
335 | 261 | ||
336 | for (i=0;i<N;i++) | 262 | for (i=0;i<N;i++) |
@@ -342,8 +268,8 @@ static void kf_bfly5( | |||
342 | Fout3=Fout0+3*m; | 268 | Fout3=Fout0+3*m; |
343 | Fout4=Fout0+4*m; | 269 | Fout4=Fout0+4*m; |
344 | 270 | ||
271 | /* For non-custom modes, m is guaranteed to be a multiple of 4. */ | ||
345 | for ( u=0; u<m; ++u ) { | 272 | for ( u=0; u<m; ++u ) { |
346 | C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5); | ||
347 | scratch[0] = *Fout0; | 273 | scratch[0] = *Fout0; |
348 | 274 | ||
349 | C_MUL(scratch[1] ,*Fout1, tw[u*fstride]); | 275 | C_MUL(scratch[1] ,*Fout1, tw[u*fstride]); |
@@ -380,75 +306,8 @@ static void kf_bfly5( | |||
380 | } | 306 | } |
381 | } | 307 | } |
382 | } | 308 | } |
383 | #endif | 309 | #endif /* OVERRIDE_kf_bfly5 */ |
384 | |||
385 | static void ki_bfly5( | ||
386 | kiss_fft_cpx * Fout, | ||
387 | const size_t fstride, | ||
388 | const kiss_fft_state *st, | ||
389 | int m, | ||
390 | int N, | ||
391 | int mm | ||
392 | ) | ||
393 | { | ||
394 | kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; | ||
395 | int i, u; | ||
396 | kiss_fft_cpx scratch[13]; | ||
397 | const kiss_twiddle_cpx * twiddles = st->twiddles; | ||
398 | const kiss_twiddle_cpx *tw; | ||
399 | kiss_twiddle_cpx ya,yb; | ||
400 | kiss_fft_cpx * Fout_beg = Fout; | ||
401 | 310 | ||
402 | ya = twiddles[fstride*m]; | ||
403 | yb = twiddles[fstride*2*m]; | ||
404 | tw=st->twiddles; | ||
405 | |||
406 | for (i=0;i<N;i++) | ||
407 | { | ||
408 | Fout = Fout_beg + i*mm; | ||
409 | Fout0=Fout; | ||
410 | Fout1=Fout0+m; | ||
411 | Fout2=Fout0+2*m; | ||
412 | Fout3=Fout0+3*m; | ||
413 | Fout4=Fout0+4*m; | ||
414 | |||
415 | for ( u=0; u<m; ++u ) { | ||
416 | scratch[0] = *Fout0; | ||
417 | |||
418 | C_MULC(scratch[1] ,*Fout1, tw[u*fstride]); | ||
419 | C_MULC(scratch[2] ,*Fout2, tw[2*u*fstride]); | ||
420 | C_MULC(scratch[3] ,*Fout3, tw[3*u*fstride]); | ||
421 | C_MULC(scratch[4] ,*Fout4, tw[4*u*fstride]); | ||
422 | |||
423 | C_ADD( scratch[7],scratch[1],scratch[4]); | ||
424 | C_SUB( scratch[10],scratch[1],scratch[4]); | ||
425 | C_ADD( scratch[8],scratch[2],scratch[3]); | ||
426 | C_SUB( scratch[9],scratch[2],scratch[3]); | ||
427 | |||
428 | Fout0->r += scratch[7].r + scratch[8].r; | ||
429 | Fout0->i += scratch[7].i + scratch[8].i; | ||
430 | |||
431 | scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r); | ||
432 | scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r); | ||
433 | |||
434 | scratch[6].r = -S_MUL(scratch[10].i,ya.i) - S_MUL(scratch[9].i,yb.i); | ||
435 | scratch[6].i = S_MUL(scratch[10].r,ya.i) + S_MUL(scratch[9].r,yb.i); | ||
436 | |||
437 | C_SUB(*Fout1,scratch[5],scratch[6]); | ||
438 | C_ADD(*Fout4,scratch[5],scratch[6]); | ||
439 | |||
440 | scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r); | ||
441 | scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r); | ||
442 | scratch[12].r = S_MUL(scratch[10].i,yb.i) - S_MUL(scratch[9].i,ya.i); | ||
443 | scratch[12].i = -S_MUL(scratch[10].r,yb.i) + S_MUL(scratch[9].r,ya.i); | ||
444 | |||
445 | C_ADD(*Fout2,scratch[11],scratch[12]); | ||
446 | C_SUB(*Fout3,scratch[11],scratch[12]); | ||
447 | |||
448 | ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4; | ||
449 | } | ||
450 | } | ||
451 | } | ||
452 | 311 | ||
453 | #endif | 312 | #endif |
454 | 313 | ||
@@ -496,6 +355,9 @@ static | |||
496 | int kf_factor(int n,opus_int16 * facbuf) | 355 | int kf_factor(int n,opus_int16 * facbuf) |
497 | { | 356 | { |
498 | int p=4; | 357 | int p=4; |
358 | int i; | ||
359 | int stages=0; | ||
360 | int nbak = n; | ||
499 | 361 | ||
500 | /*factor out powers of 4, powers of 2, then any remaining primes */ | 362 | /*factor out powers of 4, powers of 2, then any remaining primes */ |
501 | do { | 363 | do { |
@@ -517,9 +379,30 @@ int kf_factor(int n,opus_int16 * facbuf) | |||
517 | { | 379 | { |
518 | return 0; | 380 | return 0; |
519 | } | 381 | } |
520 | *facbuf++ = p; | 382 | facbuf[2*stages] = p; |
521 | *facbuf++ = n; | 383 | if (p==2 && stages > 1) |
384 | { | ||
385 | facbuf[2*stages] = 4; | ||
386 | facbuf[2] = 2; | ||
387 | } | ||
388 | stages++; | ||
522 | } while (n > 1); | 389 | } while (n > 1); |
390 | n = nbak; | ||
391 | /* Reverse the order to get the radix 4 at the end, so we can use the | ||
392 | fast degenerate case. It turns out that reversing the order also | ||
393 | improves the noise behaviour. */ | ||
394 | for (i=0;i<stages/2;i++) | ||
395 | { | ||
396 | int tmp; | ||
397 | tmp = facbuf[2*i]; | ||
398 | facbuf[2*i] = facbuf[2*(stages-i-1)]; | ||
399 | facbuf[2*(stages-i-1)] = tmp; | ||
400 | } | ||
401 | for (i=0;i<stages;i++) | ||
402 | { | ||
403 | n /= facbuf[2*i]; | ||
404 | facbuf[2*i+1] = n; | ||
405 | } | ||
523 | return 1; | 406 | return 1; |
524 | } | 407 | } |
525 | 408 | ||
@@ -563,14 +446,20 @@ kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, co | |||
563 | kiss_twiddle_cpx *twiddles; | 446 | kiss_twiddle_cpx *twiddles; |
564 | 447 | ||
565 | st->nfft=nfft; | 448 | st->nfft=nfft; |
566 | #ifndef FIXED_POINT | 449 | #ifdef FIXED_POINT |
450 | st->scale_shift = celt_ilog2(st->nfft); | ||
451 | if (st->nfft == 1<<st->scale_shift) | ||
452 | st->scale = Q15ONE; | ||
453 | else | ||
454 | st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift); | ||
455 | #else | ||
567 | st->scale = 1.f/nfft; | 456 | st->scale = 1.f/nfft; |
568 | #endif | 457 | #endif |
569 | if (base != NULL) | 458 | if (base != NULL) |
570 | { | 459 | { |
571 | st->twiddles = base->twiddles; | 460 | st->twiddles = base->twiddles; |
572 | st->shift = 0; | 461 | st->shift = 0; |
573 | while (nfft<<st->shift != base->nfft && st->shift < 32) | 462 | while (st->shift < 32 && nfft<<st->shift != base->nfft) |
574 | st->shift++; | 463 | st->shift++; |
575 | if (st->shift>=32) | 464 | if (st->shift>=32) |
576 | goto fail; | 465 | goto fail; |
@@ -614,8 +503,7 @@ void opus_fft_free(const kiss_fft_state *cfg) | |||
614 | 503 | ||
615 | #endif /* CUSTOM_MODES */ | 504 | #endif /* CUSTOM_MODES */ |
616 | 505 | ||
617 | #if 0 | 506 | void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout) |
618 | void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) | ||
619 | { | 507 | { |
620 | int m2, m; | 508 | int m2, m; |
621 | int p; | 509 | int p; |
@@ -627,17 +515,6 @@ void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fou | |||
627 | /* st->shift can be -1 */ | 515 | /* st->shift can be -1 */ |
628 | shift = st->shift>0 ? st->shift : 0; | 516 | shift = st->shift>0 ? st->shift : 0; |
629 | 517 | ||
630 | celt_assert2 (fin != fout, "In-place FFT not supported"); | ||
631 | /* Bit-reverse the input */ | ||
632 | for (i=0;i<st->nfft;i++) | ||
633 | { | ||
634 | fout[st->bitrev[i]] = fin[i]; | ||
635 | #ifndef FIXED_POINT | ||
636 | fout[st->bitrev[i]].r *= st->scale; | ||
637 | fout[st->bitrev[i]].i *= st->scale; | ||
638 | #endif | ||
639 | } | ||
640 | |||
641 | fstride[0] = 1; | 518 | fstride[0] = 1; |
642 | L=0; | 519 | L=0; |
643 | do { | 520 | do { |
@@ -656,7 +533,7 @@ void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fou | |||
656 | switch (st->factors[2*i]) | 533 | switch (st->factors[2*i]) |
657 | { | 534 | { |
658 | case 2: | 535 | case 2: |
659 | kf_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2); | 536 | kf_bfly2(fout, m, fstride[i]); |
660 | break; | 537 | break; |
661 | case 4: | 538 | case 4: |
662 | kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2); | 539 | kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2); |
@@ -673,57 +550,44 @@ void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fou | |||
673 | m = m2; | 550 | m = m2; |
674 | } | 551 | } |
675 | } | 552 | } |
676 | #endif | ||
677 | 553 | ||
678 | void opus_ifft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) | 554 | #if 0 |
555 | void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) | ||
679 | { | 556 | { |
680 | int m2, m; | ||
681 | int p; | ||
682 | int L; | ||
683 | int fstride[MAXFACTORS]; | ||
684 | int i; | 557 | int i; |
685 | int shift; | 558 | opus_val16 scale; |
559 | #ifdef FIXED_POINT | ||
560 | /* Allows us to scale with MULT16_32_Q16(), which is faster than | ||
561 | MULT16_32_Q15() on ARM. */ | ||
562 | int scale_shift = st->scale_shift-1; | ||
563 | #endif | ||
564 | scale = st->scale; | ||
686 | 565 | ||
687 | /* st->shift can be -1 */ | ||
688 | shift = st->shift>0 ? st->shift : 0; | ||
689 | celt_assert2 (fin != fout, "In-place FFT not supported"); | 566 | celt_assert2 (fin != fout, "In-place FFT not supported"); |
690 | /* Bit-reverse the input */ | 567 | /* Bit-reverse the input */ |
691 | for (i=0;i<st->nfft;i++) | 568 | for (i=0;i<st->nfft;i++) |
692 | fout[st->bitrev[i]] = fin[i]; | ||
693 | |||
694 | fstride[0] = 1; | ||
695 | L=0; | ||
696 | do { | ||
697 | p = st->factors[2*L]; | ||
698 | m = st->factors[2*L+1]; | ||
699 | fstride[L+1] = fstride[L]*p; | ||
700 | L++; | ||
701 | } while(m!=1); | ||
702 | m = st->factors[2*L-1]; | ||
703 | for (i=L-1;i>=0;i--) | ||
704 | { | 569 | { |
705 | if (i!=0) | 570 | kiss_fft_cpx x = fin[i]; |
706 | m2 = st->factors[2*i-1]; | 571 | fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift); |
707 | else | 572 | fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift); |
708 | m2 = 1; | ||
709 | switch (st->factors[2*i]) | ||
710 | { | ||
711 | case 2: | ||
712 | ki_bfly2(fout,fstride[i]<<shift,st,m, fstride[i], m2); | ||
713 | break; | ||
714 | case 4: | ||
715 | ki_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2); | ||
716 | break; | ||
717 | #ifndef RADIX_TWO_ONLY | ||
718 | case 3: | ||
719 | ki_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2); | ||
720 | break; | ||
721 | case 5: | ||
722 | ki_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2); | ||
723 | break; | ||
724 | #endif | ||
725 | } | ||
726 | m = m2; | ||
727 | } | 573 | } |
574 | opus_fft_impl(st, fout); | ||
728 | } | 575 | } |
576 | #endif | ||
577 | |||
729 | 578 | ||
579 | #ifdef TEST_UNIT_DFT_C | ||
580 | void opus_ifft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) | ||
581 | { | ||
582 | int i; | ||
583 | celt_assert2 (fin != fout, "In-place FFT not supported"); | ||
584 | /* Bit-reverse the input */ | ||
585 | for (i=0;i<st->nfft;i++) | ||
586 | fout[st->bitrev[i]] = fin[i]; | ||
587 | for (i=0;i<st->nfft;i++) | ||
588 | fout[i].i = -fout[i].i; | ||
589 | opus_fft_impl(st, fout); | ||
590 | for (i=0;i<st->nfft;i++) | ||
591 | fout[i].i = -fout[i].i; | ||
592 | } | ||
593 | #endif | ||