diff options
Diffstat (limited to 'lib/rbcodec/codecs/libspeex/preprocess.c')
-rw-r--r-- | lib/rbcodec/codecs/libspeex/preprocess.c | 1185 |
1 files changed, 1185 insertions, 0 deletions
diff --git a/lib/rbcodec/codecs/libspeex/preprocess.c b/lib/rbcodec/codecs/libspeex/preprocess.c new file mode 100644 index 0000000000..07a2ad3479 --- /dev/null +++ b/lib/rbcodec/codecs/libspeex/preprocess.c | |||
@@ -0,0 +1,1185 @@ | |||
1 | /* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin) | ||
2 | Copyright (C) 2004-2006 Epic Games | ||
3 | |||
4 | File: preprocess.c | ||
5 | Preprocessor with denoising based on the algorithm by Ephraim and Malah | ||
6 | |||
7 | Redistribution and use in source and binary forms, with or without | ||
8 | modification, are permitted provided that the following conditions are | ||
9 | met: | ||
10 | |||
11 | 1. Redistributions of source code must retain the above copyright notice, | ||
12 | this list of conditions and the following disclaimer. | ||
13 | |||
14 | 2. Redistributions in binary form must reproduce the above copyright | ||
15 | notice, this list of conditions and the following disclaimer in the | ||
16 | documentation and/or other materials provided with the distribution. | ||
17 | |||
18 | 3. The name of the author may not be used to endorse or promote products | ||
19 | derived from this software without specific prior written permission. | ||
20 | |||
21 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | ||
22 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | ||
23 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
24 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, | ||
25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
28 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
29 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN | ||
30 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
31 | POSSIBILITY OF SUCH DAMAGE. | ||
32 | */ | ||
33 | |||
34 | |||
35 | /* | ||
36 | Recommended papers: | ||
37 | |||
38 | Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error | ||
39 | short-time spectral amplitude estimator". IEEE Transactions on Acoustics, | ||
40 | Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984. | ||
41 | |||
42 | Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error | ||
43 | log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and | ||
44 | Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985. | ||
45 | |||
46 | I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments". | ||
47 | Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001. | ||
48 | |||
49 | Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic | ||
50 | approach to combined acoustic echo cancellation and noise reduction". IEEE | ||
51 | Transactions on Speech and Audio Processing, 2002. | ||
52 | |||
53 | J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation | ||
54 | of simultaneous non-stationary sources". In Proceedings IEEE International | ||
55 | Conference on Acoustics, Speech, and Signal Processing, 2004. | ||
56 | */ | ||
57 | |||
58 | #ifdef HAVE_CONFIG_H | ||
59 | #include "config-speex.h" | ||
60 | #endif | ||
61 | |||
62 | #include <math.h> | ||
63 | #include "speex/speex_preprocess.h" | ||
64 | #include "speex/speex_echo.h" | ||
65 | #include "arch.h" | ||
66 | #include "fftwrap.h" | ||
67 | #include "filterbank.h" | ||
68 | #include "math_approx.h" | ||
69 | #include "os_support.h" | ||
70 | |||
71 | #ifndef M_PI | ||
72 | #define M_PI 3.14159263 | ||
73 | #endif | ||
74 | |||
75 | #define LOUDNESS_EXP 5.f | ||
76 | #define AMP_SCALE .001f | ||
77 | #define AMP_SCALE_1 1000.f | ||
78 | |||
79 | #define NB_BANDS 24 | ||
80 | |||
81 | #define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15) | ||
82 | #define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15) | ||
83 | #define NOISE_SUPPRESS_DEFAULT -15 | ||
84 | #define ECHO_SUPPRESS_DEFAULT -40 | ||
85 | #define ECHO_SUPPRESS_ACTIVE_DEFAULT -15 | ||
86 | |||
87 | #ifndef NULL | ||
88 | #define NULL 0 | ||
89 | #endif | ||
90 | |||
91 | #define SQR(x) ((x)*(x)) | ||
92 | #define SQR16(x) (MULT16_16((x),(x))) | ||
93 | #define SQR16_Q15(x) (MULT16_16_Q15((x),(x))) | ||
94 | |||
95 | #ifdef FIXED_POINT | ||
96 | static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b) | ||
97 | { | ||
98 | if (SHR32(a,7) >= b) | ||
99 | { | ||
100 | return 32767; | ||
101 | } else { | ||
102 | if (b>=QCONST32(1,23)) | ||
103 | { | ||
104 | a = SHR32(a,8); | ||
105 | b = SHR32(b,8); | ||
106 | } | ||
107 | if (b>=QCONST32(1,19)) | ||
108 | { | ||
109 | a = SHR32(a,4); | ||
110 | b = SHR32(b,4); | ||
111 | } | ||
112 | if (b>=QCONST32(1,15)) | ||
113 | { | ||
114 | a = SHR32(a,4); | ||
115 | b = SHR32(b,4); | ||
116 | } | ||
117 | a = SHL32(a,8); | ||
118 | return PDIV32_16(a,b); | ||
119 | } | ||
120 | |||
121 | } | ||
122 | static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b) | ||
123 | { | ||
124 | if (SHR32(a,15) >= b) | ||
125 | { | ||
126 | return 32767; | ||
127 | } else { | ||
128 | if (b>=QCONST32(1,23)) | ||
129 | { | ||
130 | a = SHR32(a,8); | ||
131 | b = SHR32(b,8); | ||
132 | } | ||
133 | if (b>=QCONST32(1,19)) | ||
134 | { | ||
135 | a = SHR32(a,4); | ||
136 | b = SHR32(b,4); | ||
137 | } | ||
138 | if (b>=QCONST32(1,15)) | ||
139 | { | ||
140 | a = SHR32(a,4); | ||
141 | b = SHR32(b,4); | ||
142 | } | ||
143 | a = SHL32(a,15)-a; | ||
144 | return DIV32_16(a,b); | ||
145 | } | ||
146 | } | ||
147 | #define SNR_SCALING 256.f | ||
148 | #define SNR_SCALING_1 0.0039062f | ||
149 | #define SNR_SHIFT 8 | ||
150 | |||
151 | #define FRAC_SCALING 32767.f | ||
152 | #define FRAC_SCALING_1 3.0518e-05 | ||
153 | #define FRAC_SHIFT 1 | ||
154 | |||
155 | #define EXPIN_SCALING 2048.f | ||
156 | #define EXPIN_SCALING_1 0.00048828f | ||
157 | #define EXPIN_SHIFT 11 | ||
158 | #define EXPOUT_SCALING_1 1.5259e-05 | ||
159 | |||
160 | #define NOISE_SHIFT 7 | ||
161 | |||
162 | #else | ||
163 | |||
164 | #define DIV32_16_Q8(a,b) ((a)/(b)) | ||
165 | #define DIV32_16_Q15(a,b) ((a)/(b)) | ||
166 | #define SNR_SCALING 1.f | ||
167 | #define SNR_SCALING_1 1.f | ||
168 | #define SNR_SHIFT 0 | ||
169 | #define FRAC_SCALING 1.f | ||
170 | #define FRAC_SCALING_1 1.f | ||
171 | #define FRAC_SHIFT 0 | ||
172 | #define NOISE_SHIFT 0 | ||
173 | |||
174 | #define EXPIN_SCALING 1.f | ||
175 | #define EXPIN_SCALING_1 1.f | ||
176 | #define EXPOUT_SCALING_1 1.f | ||
177 | |||
178 | #endif | ||
179 | |||
180 | /** Speex pre-processor state. */ | ||
181 | struct SpeexPreprocessState_ { | ||
182 | /* Basic info */ | ||
183 | int frame_size; /**< Number of samples processed each time */ | ||
184 | int ps_size; /**< Number of points in the power spectrum */ | ||
185 | int sampling_rate; /**< Sampling rate of the input/output */ | ||
186 | int nbands; | ||
187 | FilterBank *bank; | ||
188 | |||
189 | /* Parameters */ | ||
190 | int denoise_enabled; | ||
191 | int vad_enabled; | ||
192 | int dereverb_enabled; | ||
193 | spx_word16_t reverb_decay; | ||
194 | spx_word16_t reverb_level; | ||
195 | spx_word16_t speech_prob_start; | ||
196 | spx_word16_t speech_prob_continue; | ||
197 | int noise_suppress; | ||
198 | int echo_suppress; | ||
199 | int echo_suppress_active; | ||
200 | SpeexEchoState *echo_state; | ||
201 | |||
202 | /* DSP-related arrays */ | ||
203 | spx_word16_t *frame; /**< Processing frame (2*ps_size) */ | ||
204 | spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */ | ||
205 | spx_word32_t *ps; /**< Current power spectrum */ | ||
206 | spx_word16_t *gain2; /**< Adjusted gains */ | ||
207 | spx_word16_t *gain_floor; /**< Minimum gain allowed */ | ||
208 | spx_word16_t *window; /**< Analysis/Synthesis window */ | ||
209 | spx_word32_t *noise; /**< Noise estimate */ | ||
210 | spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */ | ||
211 | spx_word32_t *old_ps; /**< Power spectrum for last frame */ | ||
212 | spx_word16_t *gain; /**< Ephraim Malah gain */ | ||
213 | spx_word16_t *prior; /**< A-priori SNR */ | ||
214 | spx_word16_t *post; /**< A-posteriori SNR */ | ||
215 | |||
216 | spx_word32_t *S; /**< Smoothed power spectrum */ | ||
217 | spx_word32_t *Smin; /**< See Cohen paper */ | ||
218 | spx_word32_t *Stmp; /**< See Cohen paper */ | ||
219 | int *update_prob; /**< Probability of speech presence for noise update */ | ||
220 | |||
221 | spx_word16_t *zeta; /**< Smoothed a priori SNR */ | ||
222 | spx_word32_t *echo_noise; | ||
223 | spx_word32_t *residual_echo; | ||
224 | |||
225 | /* Misc */ | ||
226 | spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */ | ||
227 | spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */ | ||
228 | |||
229 | /* AGC stuff, only for floating point for now */ | ||
230 | #ifndef FIXED_POINT | ||
231 | int agc_enabled; | ||
232 | float agc_level; | ||
233 | float loudness_accum; | ||
234 | float *loudness_weight; /**< Perceptual loudness curve */ | ||
235 | float loudness; /**< Loudness estimate */ | ||
236 | float agc_gain; /**< Current AGC gain */ | ||
237 | int nb_loudness_adapt; /**< Number of frames used for loudness adaptation so far */ | ||
238 | float max_gain; /**< Maximum gain allowed */ | ||
239 | float max_increase_step; /**< Maximum increase in gain from one frame to another */ | ||
240 | float max_decrease_step; /**< Maximum decrease in gain from one frame to another */ | ||
241 | float prev_loudness; /**< Loudness of previous frame */ | ||
242 | float init_max; /**< Current gain limit during initialisation */ | ||
243 | #endif | ||
244 | int nb_adapt; /**< Number of frames used for adaptation so far */ | ||
245 | int was_speech; | ||
246 | int min_count; /**< Number of frames processed so far */ | ||
247 | void *fft_lookup; /**< Lookup table for the FFT */ | ||
248 | #ifdef FIXED_POINT | ||
249 | int frame_shift; | ||
250 | #endif | ||
251 | }; | ||
252 | |||
253 | |||
254 | static void conj_window(spx_word16_t *w, int len) | ||
255 | { | ||
256 | int i; | ||
257 | for (i=0;i<len;i++) | ||
258 | { | ||
259 | spx_word16_t tmp; | ||
260 | #ifdef FIXED_POINT | ||
261 | spx_word16_t x = DIV32_16(MULT16_16(32767,i),len); | ||
262 | #else | ||
263 | spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len); | ||
264 | #endif | ||
265 | int inv=0; | ||
266 | if (x<QCONST16(1.f,13)) | ||
267 | { | ||
268 | } else if (x<QCONST16(2.f,13)) | ||
269 | { | ||
270 | x=QCONST16(2.f,13)-x; | ||
271 | inv=1; | ||
272 | } else if (x<QCONST16(3.f,13)) | ||
273 | { | ||
274 | x=x-QCONST16(2.f,13); | ||
275 | inv=1; | ||
276 | } else { | ||
277 | x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */ | ||
278 | } | ||
279 | x = MULT16_16_Q14(QCONST16(1.271903f,14), x); | ||
280 | tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(SHL32(EXTEND32(x),2)))); | ||
281 | if (inv) | ||
282 | tmp=SUB16(Q15_ONE,tmp); | ||
283 | w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15)); | ||
284 | } | ||
285 | } | ||
286 | |||
287 | |||
288 | #ifdef FIXED_POINT | ||
289 | /* This function approximates the gain function | ||
290 | y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x) | ||
291 | which multiplied by xi/(1+xi) is the optimal gain | ||
292 | in the loudness domain ( sqrt[amplitude] ) | ||
293 | Input in Q11 format, output in Q15 | ||
294 | */ | ||
295 | static inline spx_word32_t hypergeom_gain(spx_word32_t xx) | ||
296 | { | ||
297 | int ind; | ||
298 | spx_word16_t frac; | ||
299 | /* Q13 table */ | ||
300 | static const spx_word16_t table[21] = { | ||
301 | 6730, 8357, 9868, 11267, 12563, 13770, 14898, | ||
302 | 15959, 16961, 17911, 18816, 19682, 20512, 21311, | ||
303 | 22082, 22827, 23549, 24250, 24931, 25594, 26241}; | ||
304 | ind = SHR32(xx,10); | ||
305 | if (ind<0) | ||
306 | return Q15_ONE; | ||
307 | if (ind>19) | ||
308 | return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT)))); | ||
309 | frac = SHL32(xx-SHL32(ind,10),5); | ||
310 | return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7); | ||
311 | } | ||
312 | |||
313 | static inline spx_word16_t qcurve(spx_word16_t x) | ||
314 | { | ||
315 | x = MAX16(x, 1); | ||
316 | return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x)))); | ||
317 | } | ||
318 | |||
319 | /* Compute the gain floor based on different floors for the background noise and residual echo */ | ||
320 | static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len) | ||
321 | { | ||
322 | int i; | ||
323 | |||
324 | if (noise_suppress > effective_echo_suppress) | ||
325 | { | ||
326 | spx_word16_t noise_gain, gain_ratio; | ||
327 | noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1))); | ||
328 | gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1))); | ||
329 | |||
330 | /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */ | ||
331 | for (i=0;i<len;i++) | ||
332 | gain_floor[i] = MULT16_16_Q15(noise_gain, | ||
333 | spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]), | ||
334 | (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15))); | ||
335 | } else { | ||
336 | spx_word16_t echo_gain, gain_ratio; | ||
337 | echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1))); | ||
338 | gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1))); | ||
339 | |||
340 | /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */ | ||
341 | for (i=0;i<len;i++) | ||
342 | gain_floor[i] = MULT16_16_Q15(echo_gain, | ||
343 | spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i], | ||
344 | (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15))); | ||
345 | } | ||
346 | } | ||
347 | |||
348 | #else | ||
349 | /* This function approximates the gain function | ||
350 | y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x) | ||
351 | which multiplied by xi/(1+xi) is the optimal gain | ||
352 | in the loudness domain ( sqrt[amplitude] ) | ||
353 | */ | ||
354 | static inline spx_word32_t hypergeom_gain(spx_word32_t xx) | ||
355 | { | ||
356 | int ind; | ||
357 | float integer, frac; | ||
358 | float x; | ||
359 | static const float table[21] = { | ||
360 | 0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f, | ||
361 | 1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f, | ||
362 | 2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f}; | ||
363 | x = EXPIN_SCALING_1*xx; | ||
364 | integer = floor(2*x); | ||
365 | ind = (int)integer; | ||
366 | if (ind<0) | ||
367 | return FRAC_SCALING; | ||
368 | if (ind>19) | ||
369 | return FRAC_SCALING*(1+.1296/x); | ||
370 | frac = 2*x-integer; | ||
371 | return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f); | ||
372 | } | ||
373 | |||
374 | static inline spx_word16_t qcurve(spx_word16_t x) | ||
375 | { | ||
376 | return 1.f/(1.f+.15f/(SNR_SCALING_1*x)); | ||
377 | } | ||
378 | |||
379 | static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len) | ||
380 | { | ||
381 | int i; | ||
382 | float echo_floor; | ||
383 | float noise_floor; | ||
384 | |||
385 | noise_floor = exp(.2302585f*noise_suppress); | ||
386 | echo_floor = exp(.2302585f*effective_echo_suppress); | ||
387 | |||
388 | /* Compute the gain floor based on different floors for the background noise and residual echo */ | ||
389 | for (i=0;i<len;i++) | ||
390 | gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]); | ||
391 | } | ||
392 | |||
393 | #endif | ||
394 | SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate) | ||
395 | { | ||
396 | int i; | ||
397 | int N, N3, N4, M; | ||
398 | |||
399 | SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState)); | ||
400 | st->frame_size = frame_size; | ||
401 | |||
402 | /* Round ps_size down to the nearest power of two */ | ||
403 | #if 0 | ||
404 | i=1; | ||
405 | st->ps_size = st->frame_size; | ||
406 | while(1) | ||
407 | { | ||
408 | if (st->ps_size & ~i) | ||
409 | { | ||
410 | st->ps_size &= ~i; | ||
411 | i<<=1; | ||
412 | } else { | ||
413 | break; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | |||
418 | if (st->ps_size < 3*st->frame_size/4) | ||
419 | st->ps_size = st->ps_size * 3 / 2; | ||
420 | #else | ||
421 | st->ps_size = st->frame_size; | ||
422 | #endif | ||
423 | |||
424 | N = st->ps_size; | ||
425 | N3 = 2*N - st->frame_size; | ||
426 | N4 = st->frame_size - N3; | ||
427 | |||
428 | st->sampling_rate = sampling_rate; | ||
429 | st->denoise_enabled = 1; | ||
430 | st->vad_enabled = 0; | ||
431 | st->dereverb_enabled = 0; | ||
432 | st->reverb_decay = 0; | ||
433 | st->reverb_level = 0; | ||
434 | st->noise_suppress = NOISE_SUPPRESS_DEFAULT; | ||
435 | st->echo_suppress = ECHO_SUPPRESS_DEFAULT; | ||
436 | st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT; | ||
437 | |||
438 | st->speech_prob_start = SPEECH_PROB_START_DEFAULT; | ||
439 | st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT; | ||
440 | |||
441 | st->echo_state = NULL; | ||
442 | |||
443 | st->nbands = NB_BANDS; | ||
444 | M = st->nbands; | ||
445 | st->bank = filterbank_new(M, sampling_rate, N, 1); | ||
446 | |||
447 | st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t)); | ||
448 | st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t)); | ||
449 | st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t)); | ||
450 | |||
451 | st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); | ||
452 | st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); | ||
453 | st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); | ||
454 | st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); | ||
455 | st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); | ||
456 | st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t)); | ||
457 | st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); | ||
458 | st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); | ||
459 | st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); | ||
460 | st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); | ||
461 | st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); | ||
462 | st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t)); | ||
463 | |||
464 | st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t)); | ||
465 | st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t)); | ||
466 | st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t)); | ||
467 | st->update_prob = (int*)speex_alloc(N*sizeof(int)); | ||
468 | |||
469 | st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t)); | ||
470 | st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t)); | ||
471 | |||
472 | conj_window(st->window, 2*N3); | ||
473 | for (i=2*N3;i<2*st->ps_size;i++) | ||
474 | st->window[i]=Q15_ONE; | ||
475 | |||
476 | if (N4>0) | ||
477 | { | ||
478 | for (i=N3-1;i>=0;i--) | ||
479 | { | ||
480 | st->window[i+N3+N4]=st->window[i+N3]; | ||
481 | st->window[i+N3]=1; | ||
482 | } | ||
483 | } | ||
484 | for (i=0;i<N+M;i++) | ||
485 | { | ||
486 | st->noise[i]=QCONST32(1.f,NOISE_SHIFT); | ||
487 | st->reverb_estimate[i]=0; | ||
488 | st->old_ps[i]=1; | ||
489 | st->gain[i]=Q15_ONE; | ||
490 | st->post[i]=SHL16(1, SNR_SHIFT); | ||
491 | st->prior[i]=SHL16(1, SNR_SHIFT); | ||
492 | } | ||
493 | |||
494 | for (i=0;i<N;i++) | ||
495 | st->update_prob[i] = 1; | ||
496 | for (i=0;i<N3;i++) | ||
497 | { | ||
498 | st->inbuf[i]=0; | ||
499 | st->outbuf[i]=0; | ||
500 | } | ||
501 | #ifndef FIXED_POINT | ||
502 | st->agc_enabled = 0; | ||
503 | st->agc_level = 8000; | ||
504 | st->loudness_weight = (float*)speex_alloc(N*sizeof(float)); | ||
505 | for (i=0;i<N;i++) | ||
506 | { | ||
507 | float ff=((float)i)*.5*sampling_rate/((float)N); | ||
508 | /*st->loudness_weight[i] = .5f*(1.f/(1.f+ff/8000.f))+1.f*exp(-.5f*(ff-3800.f)*(ff-3800.f)/9e5f);*/ | ||
509 | st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f); | ||
510 | if (st->loudness_weight[i]<.01f) | ||
511 | st->loudness_weight[i]=.01f; | ||
512 | st->loudness_weight[i] *= st->loudness_weight[i]; | ||
513 | } | ||
514 | /*st->loudness = pow(AMP_SCALE*st->agc_level,LOUDNESS_EXP);*/ | ||
515 | st->loudness = 1e-15; | ||
516 | st->agc_gain = 1; | ||
517 | st->nb_loudness_adapt = 0; | ||
518 | st->max_gain = 30; | ||
519 | st->max_increase_step = exp(0.11513f * 12.*st->frame_size / st->sampling_rate); | ||
520 | st->max_decrease_step = exp(-0.11513f * 40.*st->frame_size / st->sampling_rate); | ||
521 | st->prev_loudness = 1; | ||
522 | st->init_max = 1; | ||
523 | #endif | ||
524 | st->was_speech = 0; | ||
525 | |||
526 | st->fft_lookup = spx_fft_init(2*N); | ||
527 | |||
528 | st->nb_adapt=0; | ||
529 | st->min_count=0; | ||
530 | return st; | ||
531 | } | ||
532 | |||
533 | void speex_preprocess_state_destroy(SpeexPreprocessState *st) | ||
534 | { | ||
535 | speex_free(st->frame); | ||
536 | speex_free(st->ft); | ||
537 | speex_free(st->ps); | ||
538 | speex_free(st->gain2); | ||
539 | speex_free(st->gain_floor); | ||
540 | speex_free(st->window); | ||
541 | speex_free(st->noise); | ||
542 | speex_free(st->reverb_estimate); | ||
543 | speex_free(st->old_ps); | ||
544 | speex_free(st->gain); | ||
545 | speex_free(st->prior); | ||
546 | speex_free(st->post); | ||
547 | #ifndef FIXED_POINT | ||
548 | speex_free(st->loudness_weight); | ||
549 | #endif | ||
550 | speex_free(st->echo_noise); | ||
551 | speex_free(st->residual_echo); | ||
552 | |||
553 | speex_free(st->S); | ||
554 | speex_free(st->Smin); | ||
555 | speex_free(st->Stmp); | ||
556 | speex_free(st->update_prob); | ||
557 | speex_free(st->zeta); | ||
558 | |||
559 | speex_free(st->inbuf); | ||
560 | speex_free(st->outbuf); | ||
561 | |||
562 | spx_fft_destroy(st->fft_lookup); | ||
563 | filterbank_destroy(st->bank); | ||
564 | speex_free(st); | ||
565 | } | ||
566 | |||
567 | /* FIXME: The AGC doesn't work yet with fixed-point*/ | ||
568 | #ifndef FIXED_POINT | ||
569 | static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft) | ||
570 | { | ||
571 | int i; | ||
572 | int N = st->ps_size; | ||
573 | float target_gain; | ||
574 | float loudness=1.f; | ||
575 | float rate; | ||
576 | |||
577 | for (i=2;i<N;i++) | ||
578 | { | ||
579 | loudness += 2.f*N*st->ps[i]* st->loudness_weight[i]; | ||
580 | } | ||
581 | loudness=sqrt(loudness); | ||
582 | /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) && | ||
583 | loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/ | ||
584 | if (Pframe>.3f) | ||
585 | { | ||
586 | st->nb_loudness_adapt++; | ||
587 | /*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/ | ||
588 | rate = .03*Pframe*Pframe; | ||
589 | st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP); | ||
590 | st->loudness_accum = (1-rate)*st->loudness_accum + rate; | ||
591 | if (st->init_max < st->max_gain && st->nb_adapt > 20) | ||
592 | st->init_max *= 1.f + .1f*Pframe*Pframe; | ||
593 | } | ||
594 | /*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/ | ||
595 | |||
596 | target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP); | ||
597 | |||
598 | if ((Pframe>.5 && st->nb_adapt > 20) || target_gain < st->agc_gain) | ||
599 | { | ||
600 | if (target_gain > st->max_increase_step*st->agc_gain) | ||
601 | target_gain = st->max_increase_step*st->agc_gain; | ||
602 | if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness) | ||
603 | target_gain = st->max_decrease_step*st->agc_gain; | ||
604 | if (target_gain > st->max_gain) | ||
605 | target_gain = st->max_gain; | ||
606 | if (target_gain > st->init_max) | ||
607 | target_gain = st->init_max; | ||
608 | |||
609 | st->agc_gain = target_gain; | ||
610 | } | ||
611 | /*fprintf (stderr, "%f %f %f\n", loudness, (float)AMP_SCALE_1*pow(st->loudness, 1.0f/LOUDNESS_EXP), st->agc_gain);*/ | ||
612 | |||
613 | for (i=0;i<2*N;i++) | ||
614 | ft[i] *= st->agc_gain; | ||
615 | st->prev_loudness = loudness; | ||
616 | } | ||
617 | #endif | ||
618 | |||
619 | static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x) | ||
620 | { | ||
621 | int i; | ||
622 | int N = st->ps_size; | ||
623 | int N3 = 2*N - st->frame_size; | ||
624 | int N4 = st->frame_size - N3; | ||
625 | spx_word32_t *ps=st->ps; | ||
626 | |||
627 | /* 'Build' input frame */ | ||
628 | for (i=0;i<N3;i++) | ||
629 | st->frame[i]=st->inbuf[i]; | ||
630 | for (i=0;i<st->frame_size;i++) | ||
631 | st->frame[N3+i]=x[i]; | ||
632 | |||
633 | /* Update inbuf */ | ||
634 | for (i=0;i<N3;i++) | ||
635 | st->inbuf[i]=x[N4+i]; | ||
636 | |||
637 | /* Windowing */ | ||
638 | for (i=0;i<2*N;i++) | ||
639 | st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]); | ||
640 | |||
641 | #ifdef FIXED_POINT | ||
642 | { | ||
643 | spx_word16_t max_val=0; | ||
644 | for (i=0;i<2*N;i++) | ||
645 | max_val = MAX16(max_val, ABS16(st->frame[i])); | ||
646 | st->frame_shift = 14-spx_ilog2(EXTEND32(max_val)); | ||
647 | for (i=0;i<2*N;i++) | ||
648 | st->frame[i] = SHL16(st->frame[i], st->frame_shift); | ||
649 | } | ||
650 | #endif | ||
651 | |||
652 | /* Perform FFT */ | ||
653 | spx_fft(st->fft_lookup, st->frame, st->ft); | ||
654 | |||
655 | /* Power spectrum */ | ||
656 | ps[0]=MULT16_16(st->ft[0],st->ft[0]); | ||
657 | for (i=1;i<N;i++) | ||
658 | ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]); | ||
659 | for (i=0;i<N;i++) | ||
660 | st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift); | ||
661 | |||
662 | filterbank_compute_bank32(st->bank, ps, ps+N); | ||
663 | } | ||
664 | |||
665 | static void update_noise_prob(SpeexPreprocessState *st) | ||
666 | { | ||
667 | int i; | ||
668 | int min_range; | ||
669 | int N = st->ps_size; | ||
670 | |||
671 | for (i=1;i<N-1;i++) | ||
672 | st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1]) | ||
673 | + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]); | ||
674 | st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]); | ||
675 | st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]); | ||
676 | |||
677 | if (st->nb_adapt==1) | ||
678 | { | ||
679 | for (i=0;i<N;i++) | ||
680 | st->Smin[i] = st->Stmp[i] = 0; | ||
681 | } | ||
682 | |||
683 | if (st->nb_adapt < 100) | ||
684 | min_range = 15; | ||
685 | else if (st->nb_adapt < 1000) | ||
686 | min_range = 50; | ||
687 | else if (st->nb_adapt < 10000) | ||
688 | min_range = 150; | ||
689 | else | ||
690 | min_range = 300; | ||
691 | if (st->min_count > min_range) | ||
692 | { | ||
693 | st->min_count = 0; | ||
694 | for (i=0;i<N;i++) | ||
695 | { | ||
696 | st->Smin[i] = MIN32(st->Stmp[i], st->S[i]); | ||
697 | st->Stmp[i] = st->S[i]; | ||
698 | } | ||
699 | } else { | ||
700 | for (i=0;i<N;i++) | ||
701 | { | ||
702 | st->Smin[i] = MIN32(st->Smin[i], st->S[i]); | ||
703 | st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]); | ||
704 | } | ||
705 | } | ||
706 | for (i=0;i<N;i++) | ||
707 | { | ||
708 | if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > ADD32(st->Smin[i],EXTEND32(20))) | ||
709 | st->update_prob[i] = 1; | ||
710 | else | ||
711 | st->update_prob[i] = 0; | ||
712 | /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/ | ||
713 | /*fprintf (stderr, "%f ", st->update_prob[i]);*/ | ||
714 | } | ||
715 | |||
716 | } | ||
717 | |||
718 | #define NOISE_OVERCOMPENS 1. | ||
719 | |||
720 | void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len); | ||
721 | |||
722 | int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo) | ||
723 | { | ||
724 | return speex_preprocess_run(st, x); | ||
725 | } | ||
726 | |||
727 | int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x) | ||
728 | { | ||
729 | int i; | ||
730 | int M; | ||
731 | int N = st->ps_size; | ||
732 | int N3 = 2*N - st->frame_size; | ||
733 | int N4 = st->frame_size - N3; | ||
734 | spx_word32_t *ps=st->ps; | ||
735 | spx_word32_t Zframe; | ||
736 | spx_word16_t Pframe; | ||
737 | spx_word16_t beta, beta_1; | ||
738 | spx_word16_t effective_echo_suppress; | ||
739 | |||
740 | st->nb_adapt++; | ||
741 | if (st->nb_adapt>20000) | ||
742 | st->nb_adapt = 20000; | ||
743 | st->min_count++; | ||
744 | |||
745 | beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt)); | ||
746 | beta_1 = Q15_ONE-beta; | ||
747 | M = st->nbands; | ||
748 | /* Deal with residual echo if provided */ | ||
749 | if (st->echo_state) | ||
750 | { | ||
751 | speex_echo_get_residual(st->echo_state, st->residual_echo, N); | ||
752 | #ifndef FIXED_POINT | ||
753 | /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */ | ||
754 | if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f)) | ||
755 | { | ||
756 | for (i=0;i<N;i++) | ||
757 | st->residual_echo[i] = 0; | ||
758 | } | ||
759 | #endif | ||
760 | for (i=0;i<N;i++) | ||
761 | st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]); | ||
762 | filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N); | ||
763 | } else { | ||
764 | for (i=0;i<N+M;i++) | ||
765 | st->echo_noise[i] = 0; | ||
766 | } | ||
767 | preprocess_analysis(st, x); | ||
768 | |||
769 | update_noise_prob(st); | ||
770 | |||
771 | /* Noise estimation always updated for the 10 first frames */ | ||
772 | /*if (st->nb_adapt<10) | ||
773 | { | ||
774 | for (i=1;i<N-1;i++) | ||
775 | st->update_prob[i] = 0; | ||
776 | } | ||
777 | */ | ||
778 | |||
779 | /* Update the noise estimate for the frequencies where it can be */ | ||
780 | for (i=0;i<N;i++) | ||
781 | { | ||
782 | if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT)) | ||
783 | st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT))); | ||
784 | } | ||
785 | filterbank_compute_bank32(st->bank, st->noise, st->noise+N); | ||
786 | |||
787 | /* Special case for first frame */ | ||
788 | if (st->nb_adapt==1) | ||
789 | for (i=0;i<N+M;i++) | ||
790 | st->old_ps[i] = ps[i]; | ||
791 | |||
792 | /* Compute a posteriori SNR */ | ||
793 | for (i=0;i<N+M;i++) | ||
794 | { | ||
795 | spx_word16_t gamma; | ||
796 | |||
797 | /* Total noise estimate including residual echo and reverberation */ | ||
798 | spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]); | ||
799 | |||
800 | /* A posteriori SNR = ps/noise - 1*/ | ||
801 | st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT)); | ||
802 | st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT)); | ||
803 | |||
804 | /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */ | ||
805 | gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise)))); | ||
806 | |||
807 | /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */ | ||
808 | st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15)); | ||
809 | st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT)); | ||
810 | } | ||
811 | |||
812 | /*print_vec(st->post, N+M, "");*/ | ||
813 | |||
814 | /* Recursive average of the a priori SNR. A bit smoothed for the psd components */ | ||
815 | st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15); | ||
816 | for (i=1;i<N-1;i++) | ||
817 | st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])), | ||
818 | MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15); | ||
819 | for (i=N-1;i<N+M;i++) | ||
820 | st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15); | ||
821 | |||
822 | /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */ | ||
823 | Zframe = 0; | ||
824 | for (i=N;i<N+M;i++) | ||
825 | Zframe = ADD32(Zframe, EXTEND32(st->zeta[i])); | ||
826 | Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands))); | ||
827 | |||
828 | effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15)); | ||
829 | |||
830 | compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M); | ||
831 | |||
832 | /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale) | ||
833 | Technically this is actually wrong because the EM gaim assumes a slightly different probability | ||
834 | distribution */ | ||
835 | for (i=N;i<N+M;i++) | ||
836 | { | ||
837 | /* See EM and Cohen papers*/ | ||
838 | spx_word32_t theta; | ||
839 | /* Gain from hypergeometric function */ | ||
840 | spx_word32_t MM; | ||
841 | /* Weiner filter gain */ | ||
842 | spx_word16_t prior_ratio; | ||
843 | /* a priority probability of speech presence based on Bark sub-band alone */ | ||
844 | spx_word16_t P1; | ||
845 | /* Speech absence a priori probability (considering sub-band and frame) */ | ||
846 | spx_word16_t q; | ||
847 | #ifdef FIXED_POINT | ||
848 | spx_word16_t tmp; | ||
849 | #endif | ||
850 | |||
851 | prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT))); | ||
852 | theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT)); | ||
853 | |||
854 | MM = hypergeom_gain(theta); | ||
855 | /* Gain with bound */ | ||
856 | st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM))); | ||
857 | /* Save old Bark power spectrum */ | ||
858 | st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]); | ||
859 | |||
860 | P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i])); | ||
861 | q = Q15_ONE-MULT16_16_Q15(Pframe,P1); | ||
862 | #ifdef FIXED_POINT | ||
863 | theta = MIN32(theta, EXTEND32(32767)); | ||
864 | /*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1)))); | ||
865 | tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/ | ||
866 | /*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8)); | ||
867 | st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp)); | ||
868 | #else | ||
869 | st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta)); | ||
870 | #endif | ||
871 | } | ||
872 | /* Convert the EM gains and speech prob to linear frequency */ | ||
873 | filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2); | ||
874 | filterbank_compute_psd16(st->bank,st->gain+N, st->gain); | ||
875 | |||
876 | /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */ | ||
877 | if (1) | ||
878 | { | ||
879 | filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor); | ||
880 | |||
881 | /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */ | ||
882 | for (i=0;i<N;i++) | ||
883 | { | ||
884 | spx_word32_t MM; | ||
885 | spx_word32_t theta; | ||
886 | spx_word16_t prior_ratio; | ||
887 | spx_word16_t tmp; | ||
888 | spx_word16_t p; | ||
889 | spx_word16_t g; | ||
890 | |||
891 | /* Wiener filter gain */ | ||
892 | prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT))); | ||
893 | theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT)); | ||
894 | |||
895 | /* Optimal estimator for loudness domain */ | ||
896 | MM = hypergeom_gain(theta); | ||
897 | /* EM gain with bound */ | ||
898 | g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM))); | ||
899 | /* Interpolated speech probability of presence */ | ||
900 | p = st->gain2[i]; | ||
901 | |||
902 | /* Constrain the gain to be close to the Bark scale gain */ | ||
903 | if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i]) | ||
904 | g = MULT16_16(3,st->gain[i]); | ||
905 | st->gain[i] = g; | ||
906 | |||
907 | /* Save old power spectrum */ | ||
908 | st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]); | ||
909 | |||
910 | /* Apply gain floor */ | ||
911 | if (st->gain[i] < st->gain_floor[i]) | ||
912 | st->gain[i] = st->gain_floor[i]; | ||
913 | |||
914 | /* Exponential decay model for reverberation (unused) */ | ||
915 | /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/ | ||
916 | |||
917 | /* Take into account speech probability of presence (loudness domain MMSE estimator) */ | ||
918 | /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */ | ||
919 | tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15))); | ||
920 | st->gain2[i]=SQR16_Q15(tmp); | ||
921 | |||
922 | /* Use this if you want a log-domain MMSE estimator instead */ | ||
923 | /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/ | ||
924 | } | ||
925 | } else { | ||
926 | for (i=N;i<N+M;i++) | ||
927 | { | ||
928 | spx_word16_t tmp; | ||
929 | spx_word16_t p = st->gain2[i]; | ||
930 | st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]); | ||
931 | tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15))); | ||
932 | st->gain2[i]=SQR16_Q15(tmp); | ||
933 | } | ||
934 | filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2); | ||
935 | } | ||
936 | |||
937 | /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */ | ||
938 | if (!st->denoise_enabled) | ||
939 | { | ||
940 | for (i=0;i<N+M;i++) | ||
941 | st->gain2[i]=Q15_ONE; | ||
942 | } | ||
943 | |||
944 | /* Apply computed gain */ | ||
945 | for (i=1;i<N;i++) | ||
946 | { | ||
947 | st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]); | ||
948 | st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]); | ||
949 | } | ||
950 | st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]); | ||
951 | st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]); | ||
952 | |||
953 | /*FIXME: This *will* not work for fixed-point */ | ||
954 | #ifndef FIXED_POINT | ||
955 | if (st->agc_enabled) | ||
956 | speex_compute_agc(st, Pframe, st->ft); | ||
957 | #endif | ||
958 | |||
959 | /* Inverse FFT with 1/N scaling */ | ||
960 | spx_ifft(st->fft_lookup, st->ft, st->frame); | ||
961 | /* Scale back to original (lower) amplitude */ | ||
962 | for (i=0;i<2*N;i++) | ||
963 | st->frame[i] = PSHR16(st->frame[i], st->frame_shift); | ||
964 | |||
965 | /*FIXME: This *will* not work for fixed-point */ | ||
966 | #ifndef FIXED_POINT | ||
967 | if (st->agc_enabled) | ||
968 | { | ||
969 | float max_sample=0; | ||
970 | for (i=0;i<2*N;i++) | ||
971 | if (fabs(st->frame[i])>max_sample) | ||
972 | max_sample = fabs(st->frame[i]); | ||
973 | if (max_sample>28000.f) | ||
974 | { | ||
975 | float damp = 28000.f/max_sample; | ||
976 | for (i=0;i<2*N;i++) | ||
977 | st->frame[i] *= damp; | ||
978 | } | ||
979 | } | ||
980 | #endif | ||
981 | |||
982 | /* Synthesis window (for WOLA) */ | ||
983 | for (i=0;i<2*N;i++) | ||
984 | st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]); | ||
985 | |||
986 | /* Perform overlap and add */ | ||
987 | for (i=0;i<N3;i++) | ||
988 | x[i] = st->outbuf[i] + st->frame[i]; | ||
989 | for (i=0;i<N4;i++) | ||
990 | x[N3+i] = st->frame[N3+i]; | ||
991 | |||
992 | /* Update outbuf */ | ||
993 | for (i=0;i<N3;i++) | ||
994 | st->outbuf[i] = st->frame[st->frame_size+i]; | ||
995 | |||
996 | /* FIXME: This VAD is a kludge */ | ||
997 | if (st->vad_enabled) | ||
998 | { | ||
999 | if (Pframe > st->speech_prob_start || (st->was_speech && Pframe > st->speech_prob_continue)) | ||
1000 | { | ||
1001 | st->was_speech=1; | ||
1002 | return 1; | ||
1003 | } else | ||
1004 | { | ||
1005 | st->was_speech=0; | ||
1006 | return 0; | ||
1007 | } | ||
1008 | } else { | ||
1009 | return 1; | ||
1010 | } | ||
1011 | } | ||
1012 | |||
1013 | void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x) | ||
1014 | { | ||
1015 | int i; | ||
1016 | int N = st->ps_size; | ||
1017 | int N3 = 2*N - st->frame_size; | ||
1018 | int M; | ||
1019 | spx_word32_t *ps=st->ps; | ||
1020 | |||
1021 | M = st->nbands; | ||
1022 | st->min_count++; | ||
1023 | |||
1024 | preprocess_analysis(st, x); | ||
1025 | |||
1026 | update_noise_prob(st); | ||
1027 | |||
1028 | for (i=1;i<N-1;i++) | ||
1029 | { | ||
1030 | if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT)) | ||
1031 | { | ||
1032 | st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT)); | ||
1033 | } | ||
1034 | } | ||
1035 | |||
1036 | for (i=0;i<N3;i++) | ||
1037 | st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]); | ||
1038 | |||
1039 | /* Save old power spectrum */ | ||
1040 | for (i=0;i<N+M;i++) | ||
1041 | st->old_ps[i] = ps[i]; | ||
1042 | |||
1043 | for (i=0;i<N;i++) | ||
1044 | st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]); | ||
1045 | } | ||
1046 | |||
1047 | |||
1048 | int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr) | ||
1049 | { | ||
1050 | int i; | ||
1051 | SpeexPreprocessState *st; | ||
1052 | st=(SpeexPreprocessState*)state; | ||
1053 | switch(request) | ||
1054 | { | ||
1055 | case SPEEX_PREPROCESS_SET_DENOISE: | ||
1056 | st->denoise_enabled = (*(spx_int32_t*)ptr); | ||
1057 | break; | ||
1058 | case SPEEX_PREPROCESS_GET_DENOISE: | ||
1059 | (*(spx_int32_t*)ptr) = st->denoise_enabled; | ||
1060 | break; | ||
1061 | #ifndef FIXED_POINT | ||
1062 | case SPEEX_PREPROCESS_SET_AGC: | ||
1063 | st->agc_enabled = (*(spx_int32_t*)ptr); | ||
1064 | break; | ||
1065 | case SPEEX_PREPROCESS_GET_AGC: | ||
1066 | (*(spx_int32_t*)ptr) = st->agc_enabled; | ||
1067 | break; | ||
1068 | #ifndef DISABLE_FLOAT_API | ||
1069 | case SPEEX_PREPROCESS_SET_AGC_LEVEL: | ||
1070 | st->agc_level = (*(float*)ptr); | ||
1071 | if (st->agc_level<1) | ||
1072 | st->agc_level=1; | ||
1073 | if (st->agc_level>32768) | ||
1074 | st->agc_level=32768; | ||
1075 | break; | ||
1076 | case SPEEX_PREPROCESS_GET_AGC_LEVEL: | ||
1077 | (*(float*)ptr) = st->agc_level; | ||
1078 | break; | ||
1079 | #endif /* #ifndef DISABLE_FLOAT_API */ | ||
1080 | case SPEEX_PREPROCESS_SET_AGC_INCREMENT: | ||
1081 | st->max_increase_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate); | ||
1082 | break; | ||
1083 | case SPEEX_PREPROCESS_GET_AGC_INCREMENT: | ||
1084 | (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_increase_step)*st->sampling_rate/st->frame_size); | ||
1085 | break; | ||
1086 | case SPEEX_PREPROCESS_SET_AGC_DECREMENT: | ||
1087 | st->max_decrease_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate); | ||
1088 | break; | ||
1089 | case SPEEX_PREPROCESS_GET_AGC_DECREMENT: | ||
1090 | (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_decrease_step)*st->sampling_rate/st->frame_size); | ||
1091 | break; | ||
1092 | case SPEEX_PREPROCESS_SET_AGC_MAX_GAIN: | ||
1093 | st->max_gain = exp(0.11513f * (*(spx_int32_t*)ptr)); | ||
1094 | break; | ||
1095 | case SPEEX_PREPROCESS_GET_AGC_MAX_GAIN: | ||
1096 | (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_gain)); | ||
1097 | break; | ||
1098 | #endif | ||
1099 | case SPEEX_PREPROCESS_SET_VAD: | ||
1100 | speex_warning("The VAD has been replaced by a hack pending a complete rewrite"); | ||
1101 | st->vad_enabled = (*(spx_int32_t*)ptr); | ||
1102 | break; | ||
1103 | case SPEEX_PREPROCESS_GET_VAD: | ||
1104 | (*(spx_int32_t*)ptr) = st->vad_enabled; | ||
1105 | break; | ||
1106 | |||
1107 | case SPEEX_PREPROCESS_SET_DEREVERB: | ||
1108 | st->dereverb_enabled = (*(spx_int32_t*)ptr); | ||
1109 | for (i=0;i<st->ps_size;i++) | ||
1110 | st->reverb_estimate[i]=0; | ||
1111 | break; | ||
1112 | case SPEEX_PREPROCESS_GET_DEREVERB: | ||
1113 | (*(spx_int32_t*)ptr) = st->dereverb_enabled; | ||
1114 | break; | ||
1115 | |||
1116 | case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL: | ||
1117 | /* FIXME: Re-enable when de-reverberation is actually enabled again */ | ||
1118 | /*st->reverb_level = (*(float*)ptr);*/ | ||
1119 | break; | ||
1120 | case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL: | ||
1121 | /* FIXME: Re-enable when de-reverberation is actually enabled again */ | ||
1122 | /*(*(float*)ptr) = st->reverb_level;*/ | ||
1123 | break; | ||
1124 | |||
1125 | case SPEEX_PREPROCESS_SET_DEREVERB_DECAY: | ||
1126 | /* FIXME: Re-enable when de-reverberation is actually enabled again */ | ||
1127 | /*st->reverb_decay = (*(float*)ptr);*/ | ||
1128 | break; | ||
1129 | case SPEEX_PREPROCESS_GET_DEREVERB_DECAY: | ||
1130 | /* FIXME: Re-enable when de-reverberation is actually enabled again */ | ||
1131 | /*(*(float*)ptr) = st->reverb_decay;*/ | ||
1132 | break; | ||
1133 | |||
1134 | case SPEEX_PREPROCESS_SET_PROB_START: | ||
1135 | *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr)); | ||
1136 | st->speech_prob_start = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100); | ||
1137 | break; | ||
1138 | case SPEEX_PREPROCESS_GET_PROB_START: | ||
1139 | (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100); | ||
1140 | break; | ||
1141 | |||
1142 | case SPEEX_PREPROCESS_SET_PROB_CONTINUE: | ||
1143 | *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr)); | ||
1144 | st->speech_prob_continue = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100); | ||
1145 | break; | ||
1146 | case SPEEX_PREPROCESS_GET_PROB_CONTINUE: | ||
1147 | (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100); | ||
1148 | break; | ||
1149 | |||
1150 | case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS: | ||
1151 | st->noise_suppress = -ABS(*(spx_int32_t*)ptr); | ||
1152 | break; | ||
1153 | case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS: | ||
1154 | (*(spx_int32_t*)ptr) = st->noise_suppress; | ||
1155 | break; | ||
1156 | case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS: | ||
1157 | st->echo_suppress = -ABS(*(spx_int32_t*)ptr); | ||
1158 | break; | ||
1159 | case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS: | ||
1160 | (*(spx_int32_t*)ptr) = st->echo_suppress; | ||
1161 | break; | ||
1162 | case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE: | ||
1163 | st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr); | ||
1164 | break; | ||
1165 | case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE: | ||
1166 | (*(spx_int32_t*)ptr) = st->echo_suppress_active; | ||
1167 | break; | ||
1168 | case SPEEX_PREPROCESS_SET_ECHO_STATE: | ||
1169 | st->echo_state = (SpeexEchoState*)ptr; | ||
1170 | break; | ||
1171 | case SPEEX_PREPROCESS_GET_ECHO_STATE: | ||
1172 | ptr = (void*)st->echo_state; | ||
1173 | break; | ||
1174 | #ifndef FIXED_POINT | ||
1175 | case SPEEX_PREPROCESS_GET_AGC_LOUDNESS: | ||
1176 | (*(spx_int32_t*)ptr) = pow(st->loudness, 1.0/LOUDNESS_EXP); | ||
1177 | break; | ||
1178 | #endif | ||
1179 | |||
1180 | default: | ||
1181 | speex_warning_int("Unknown speex_preprocess_ctl request: ", request); | ||
1182 | return -1; | ||
1183 | } | ||
1184 | return 0; | ||
1185 | } | ||