mirror of
https://github.com/xiph/opus.git
synced 2025-05-19 18:08:29 +00:00
718 lines
19 KiB
C
718 lines
19 KiB
C
/* Copyright (c) 2017-2018 Mozilla */
|
|
/*
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions
|
|
are met:
|
|
|
|
- Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
- Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
|
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include "kiss_fft.h"
|
|
#include "common.h"
|
|
#include <math.h>
|
|
#include "rnnoise.h"
|
|
#include "pitch.h"
|
|
#include "arch.h"
|
|
#include "celt_lpc.h"
|
|
#include <assert.h>
|
|
|
|
/* Coefficient handed to preemphasis() by the training tool. */
#define PREEMPHASIS (0.85f)

/* Frame geometry: FRAME_SIZE new samples per call, analysed with a window
   that spans two frames.  forward_transform() keeps FREQ_SIZE spectrum bins. */
#define FRAME_SIZE_SHIFT 2
#define FRAME_SIZE (40<<FRAME_SIZE_SHIFT)
#define WINDOW_SIZE (2*FRAME_SIZE)
#define FREQ_SIZE (FRAME_SIZE + 1)

/* Pitch search range and the length of the pitch history buffer. */
#define PITCH_MIN_PERIOD 32
#define PITCH_MAX_PERIOD 256
#define PITCH_FRAME_SIZE 320
#define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)

#define SQUARE(x) ((x)*(x))

/* Non-zero selects the overlapping (interpolated) band implementation, which
   uses one more band than the rectangular one. */
#define SMOOTH_BANDS 1

#if SMOOTH_BANDS
#define NB_BANDS 18
#else
#define NB_BANDS 17
#endif

#define CEPS_MEM 8
#define NB_DELTA_CEPS 6

/* Feature vector layout written by compute_frame_features(): NB_BANDS
   cepstral values, NB_BANDS pitch-correlation values, pitch index, pitch gain,
   log excitation gain, then LPC_ORDER predictor coefficients. */
#define NB_FEATURES (2*NB_BANDS+3+LPC_ORDER)


/* Define TRAINING to a non-zero value to build the feature-dump main(). */
#ifndef TRAINING
#define TRAINING 0
#endif
|
|
|
|
/* Band edge table.  Each entry is shifted left by FRAME_SIZE_SHIFT to obtain
   an FFT bin index; the row of labels gives the edge frequency of each entry
   (presumably in Hz for the intended sampling rate -- not checked here). */
static const opus_int16 eband5ms[] = {
/*0 200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k*/
   0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40
};
|
|
|
|
|
|
typedef struct {
|
|
int init;
|
|
kiss_fft_state *kfft;
|
|
float half_window[FRAME_SIZE];
|
|
float dct_table[NB_BANDS*NB_BANDS];
|
|
} CommonState;
|
|
|
|
struct DenoiseState {
|
|
float analysis_mem[FRAME_SIZE];
|
|
float cepstral_mem[CEPS_MEM][NB_BANDS];
|
|
int memid;
|
|
float synthesis_mem[FRAME_SIZE];
|
|
float pitch_buf[PITCH_BUF_SIZE];
|
|
float pitch_enh_buf[PITCH_BUF_SIZE];
|
|
float last_gain;
|
|
int last_period;
|
|
float mem_hp_x[2];
|
|
float lastg[NB_BANDS];
|
|
};
|
|
|
|
#if SMOOTH_BANDS
|
|
void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
|
|
int i;
|
|
float sum[NB_BANDS] = {0};
|
|
for (i=0;i<NB_BANDS-1;i++)
|
|
{
|
|
int j;
|
|
int band_size;
|
|
band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
|
|
for (j=0;j<band_size;j++) {
|
|
float tmp;
|
|
float frac = (float)j/band_size;
|
|
tmp = SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r);
|
|
tmp += SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i);
|
|
sum[i] += (1-frac)*tmp;
|
|
sum[i+1] += frac*tmp;
|
|
}
|
|
}
|
|
sum[0] *= 2;
|
|
sum[NB_BANDS-1] *= 2;
|
|
for (i=0;i<NB_BANDS;i++)
|
|
{
|
|
bandE[i] = sum[i];
|
|
}
|
|
}
|
|
|
|
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) {
|
|
int i;
|
|
float sum[NB_BANDS] = {0};
|
|
for (i=0;i<NB_BANDS-1;i++)
|
|
{
|
|
int j;
|
|
int band_size;
|
|
band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
|
|
for (j=0;j<band_size;j++) {
|
|
float tmp;
|
|
float frac = (float)j/band_size;
|
|
tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r;
|
|
tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i;
|
|
sum[i] += (1-frac)*tmp;
|
|
sum[i+1] += frac*tmp;
|
|
}
|
|
}
|
|
sum[0] *= 2;
|
|
sum[NB_BANDS-1] *= 2;
|
|
for (i=0;i<NB_BANDS;i++)
|
|
{
|
|
bandE[i] = sum[i];
|
|
}
|
|
}
|
|
|
|
void interp_band_gain(float *g, const float *bandE) {
|
|
int i;
|
|
memset(g, 0, FREQ_SIZE);
|
|
for (i=0;i<NB_BANDS-1;i++)
|
|
{
|
|
int j;
|
|
int band_size;
|
|
band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
|
|
for (j=0;j<band_size;j++) {
|
|
float frac = (float)j/band_size;
|
|
g[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j] = (1-frac)*bandE[i] + frac*bandE[i+1];
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
|
|
int i;
|
|
for (i=0;i<NB_BANDS;i++)
|
|
{
|
|
int j;
|
|
opus_val32 sum = 0;
|
|
for (j=0;j<(eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;j++) {
|
|
sum += SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r);
|
|
sum += SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i);
|
|
}
|
|
bandE[i] = sum;
|
|
}
|
|
}
|
|
|
|
void interp_band_gain(float *g, const float *bandE) {
|
|
int i;
|
|
memset(g, 0, FREQ_SIZE);
|
|
for (i=0;i<NB_BANDS;i++)
|
|
{
|
|
int j;
|
|
for (j=0;j<(eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;j++)
|
|
g[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j] = bandE[i];
|
|
}
|
|
}
|
|
#endif
|
|
|
|
|
|
CommonState common;
|
|
|
|
static void check_init() {
|
|
int i;
|
|
if (common.init) return;
|
|
common.kfft = opus_fft_alloc_twiddles(2*FRAME_SIZE, NULL, NULL, NULL, 0);
|
|
for (i=0;i<FRAME_SIZE;i++)
|
|
common.half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
|
|
for (i=0;i<NB_BANDS;i++) {
|
|
int j;
|
|
for (j=0;j<NB_BANDS;j++) {
|
|
common.dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS);
|
|
if (j==0) common.dct_table[i*NB_BANDS + j] *= sqrt(.5);
|
|
}
|
|
}
|
|
common.init = 1;
|
|
}
|
|
|
|
static void dct(float *out, const float *in) {
|
|
int i;
|
|
check_init();
|
|
for (i=0;i<NB_BANDS;i++) {
|
|
int j;
|
|
float sum = 0;
|
|
for (j=0;j<NB_BANDS;j++) {
|
|
sum += in[j] * common.dct_table[j*NB_BANDS + i];
|
|
}
|
|
out[i] = sum*sqrt(2./NB_BANDS);
|
|
}
|
|
}
|
|
|
|
static void idct(float *out, const float *in) {
|
|
int i;
|
|
check_init();
|
|
for (i=0;i<NB_BANDS;i++) {
|
|
int j;
|
|
float sum = 0;
|
|
for (j=0;j<NB_BANDS;j++) {
|
|
sum += in[j] * common.dct_table[i*NB_BANDS + j];
|
|
}
|
|
out[i] = sum*sqrt(2./NB_BANDS);
|
|
}
|
|
}
|
|
|
|
#if 0
/* Disabled variant of idct(): same table access, but the result is scaled by
   sqrt(2./22) rather than sqrt(2./NB_BANDS).  Never compiled. */
static void idct(float *out, const float *in) {
   int n;
   check_init();
   for (n=0;n<NB_BANDS;n++) {
      const float *basis = &common.dct_table[n*NB_BANDS];
      float acc = 0;
      int k;
      for (k=0;k<NB_BANDS;k++)
         acc += in[k] * basis[k];
      out[n] = acc*sqrt(2./22);
   }
}
#endif
|
|
|
|
/* Transforms WINDOW_SIZE real samples with the shared FFT and stores the
   first FREQ_SIZE complex bins in `out`. */
static void forward_transform(kiss_fft_cpx *out, const float *in) {
   kiss_fft_cpx samples[WINDOW_SIZE];
   kiss_fft_cpx spectrum[WINDOW_SIZE];
   int n;
   check_init();
   for (n=0;n<WINDOW_SIZE;n++) {
      samples[n].r = in[n];
      samples[n].i = 0;
   }
   opus_fft(common.kfft, samples, spectrum, 0);
   memcpy(out, spectrum, FREQ_SIZE*sizeof(*out));
}
|
|
|
|
static void inverse_transform(float *out, const kiss_fft_cpx *in) {
|
|
int i;
|
|
kiss_fft_cpx x[WINDOW_SIZE];
|
|
kiss_fft_cpx y[WINDOW_SIZE];
|
|
check_init();
|
|
for (i=0;i<FREQ_SIZE;i++) {
|
|
x[i] = in[i];
|
|
}
|
|
for (;i<WINDOW_SIZE;i++) {
|
|
x[i].r = x[WINDOW_SIZE - i].r;
|
|
x[i].i = -x[WINDOW_SIZE - i].i;
|
|
}
|
|
opus_fft(common.kfft, x, y, 0);
|
|
/* output in reverse order for IFFT. */
|
|
out[0] = WINDOW_SIZE*y[0].r;
|
|
for (i=1;i<WINDOW_SIZE;i++) {
|
|
out[i] = WINDOW_SIZE*y[WINDOW_SIZE - i].r;
|
|
}
|
|
}
|
|
|
|
static void apply_window(float *x) {
|
|
int i;
|
|
check_init();
|
|
for (i=0;i<FRAME_SIZE;i++) {
|
|
x[i] *= common.half_window[i];
|
|
x[WINDOW_SIZE - 1 - i] *= common.half_window[i];
|
|
}
|
|
}
|
|
|
|
int rnnoise_get_size() {
|
|
return sizeof(DenoiseState);
|
|
}
|
|
|
|
/* Resets a caller-provided state to all zeros.  Always returns 0. */
int rnnoise_init(DenoiseState *st) {
   memset(st, 0, sizeof *st);
   return 0;
}
|
|
|
|
/* Allocates and zero-initialises a new state.
   Returns the state, or NULL when the allocation fails.  The caller releases
   it with rnnoise_destroy(). */
DenoiseState *rnnoise_create() {
   DenoiseState *st;
   st = malloc(rnnoise_get_size());
   /* Do not hand a NULL pointer to rnnoise_init(), which memsets through it. */
   if (st == NULL) return NULL;
   rnnoise_init(st);
   return st;
}
|
|
|
|
/* Releases a state obtained from rnnoise_create().  Passing NULL is a no-op
   because the pointer goes straight to free(). */
void rnnoise_destroy(DenoiseState *st) {
   free(st);
}
|
|
|
|
#if TRAINING
/* First spectrum bin zeroed by frame_analysis(); FREQ_SIZE disables the cut. */
int lowpass = FREQ_SIZE;
/* Not referenced by the code visible in this file. */
int band_lp = NB_BANDS;
#endif
|
|
|
|
short float2short(float x)
|
|
{
|
|
int i;
|
|
i = (int)floor(.5+x);
|
|
return IMAX(-32767, IMIN(32767, i));
|
|
}
|
|
|
|
static float lpc_from_bands(float *lpc, const float *Ex)
|
|
{
|
|
int i;
|
|
float e;
|
|
float ac[LPC_ORDER+1];
|
|
float rc[LPC_ORDER];
|
|
float Xr[FREQ_SIZE];
|
|
kiss_fft_cpx X_auto[FREQ_SIZE];
|
|
float x_auto[FRAME_SIZE];
|
|
interp_band_gain(Xr, Ex);
|
|
RNN_CLEAR(X_auto, FREQ_SIZE);
|
|
for (i=0;i<160;i++) X_auto[i].r = Xr[i];
|
|
inverse_transform(x_auto, X_auto);
|
|
for (i=0;i<LPC_ORDER+1;i++) ac[i] = x_auto[i];
|
|
|
|
/* -40 dB noise floor. */
|
|
ac[0] += ac[0]*1e-4 + 320/12/38.;
|
|
/* Lag windowing. */
|
|
for (i=1;i<LPC_ORDER+1;i++) ac[i] *= (1 - 6e-5*i*i);
|
|
e = _celt_lpc(lpc, rc, ac, LPC_ORDER);
|
|
return e;
|
|
}
|
|
|
|
float lpc_from_cepstrum(float *lpc, const float *cepstrum)
|
|
{
|
|
int i;
|
|
float Ex[NB_BANDS];
|
|
float tmp[NB_BANDS];
|
|
RNN_COPY(tmp, cepstrum, NB_BANDS);
|
|
tmp[0] += 4;
|
|
idct(Ex, tmp);
|
|
for (i=0;i<NB_BANDS;i++) Ex[i] = pow(10.f, Ex[i]);
|
|
return lpc_from_bands(lpc, Ex);
|
|
}
|
|
|
|
/* Analyses one frame of FRAME_SIZE input samples.
   - Builds the two-frame analysis window from the previous frame
     (st->analysis_mem) and `in`, and stores `in` for the next call.
   - Writes the windowed spectrum to X (FREQ_SIZE bins) and the band energies
     to Ex (NB_BANDS values).
   - Derives LPC_ORDER predictor coefficients (written to lpc) from the band
     energies.
   - For each of FRAME_SIZE samples taken from the unwindowed signal, starting
     FRAME_SIZE/2 into the window, writes the sample to pcm, the prediction to
     pred and the quantised, gain-normalised residual to iexc.
   Returns the excitation gain g. */
static float frame_analysis(DenoiseState *st, signed char *iexc, short *pred, short *pcm, float *lpc, kiss_fft_cpx *X, float *Ex, const float *in) {
   int i;
   float x[WINDOW_SIZE];
   float x0[WINDOW_SIZE];
   float ac[LPC_ORDER+1];
   float rc[LPC_ORDER];
   float Xr[FREQ_SIZE];
   kiss_fft_cpx X_auto[FREQ_SIZE];
   /* inverse_transform() writes WINDOW_SIZE samples, so the destination must
      be WINDOW_SIZE long (a FRAME_SIZE buffer overflows the stack). */
   float x_auto[WINDOW_SIZE];
   float e;
   float g;
   float g_1;

   RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
   for (i=0;i<FRAME_SIZE;i++) x[FRAME_SIZE + i] = in[i];
   RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
   /* Keep an unwindowed copy for the time-domain prediction below. */
   RNN_COPY(x0, x, WINDOW_SIZE);
   apply_window(x);
   forward_transform(X, x);
#if TRAINING
   for (i=lowpass;i<FREQ_SIZE;i++)
      X[i].r = X[i].i = 0;
#endif
   compute_band_energy(Ex, X);

   /* Autocorrelation from the smoothed (band-interpolated) power spectrum. */
   interp_band_gain(Xr, Ex);
   RNN_CLEAR(X_auto, FREQ_SIZE);
   for (i=0;i<FRAME_SIZE;i++) X_auto[i].r = Xr[i];
   inverse_transform(x_auto, X_auto);
   for (i=0;i<LPC_ORDER+1;i++) ac[i] = x_auto[i];

   /* -40 dB noise floor. */
   /* NOTE(review): 320/12 is an integer division (26); kept as in the original. */
   ac[0] += ac[0]*1e-4 + 320/12/38.;
   /* Lag windowing. */
   for (i=1;i<LPC_ORDER+1;i++) ac[i] *= (1 - 6e-5*i*i);
   e = _celt_lpc(lpc, rc, ac, LPC_ORDER);
   g = sqrt((1e-10+e)*(1./FRAME_SIZE));
   g_1 = 1./g;

   for (i=0;i<FRAME_SIZE;i++) {
      int j;
      float *z;
      float tmp;
      int nexc;
      z = &x0[i]+FRAME_SIZE/2;
      /* tmp is the prediction residual: current sample plus the weighted
         past samples. */
      tmp = z[0];
      for (j=0;j<LPC_ORDER;j++) tmp += lpc[j]*z[-1-j];
      pcm[i] = float2short(z[0]);
      pred[i] = float2short(z[0] - tmp);
      nexc = (int)floor(.5 + 16*g_1*tmp);
      nexc = IMAX(-128, IMIN(127, nexc));
      iexc[i] = nexc;
   }
   return g;
}
|
|
|
|
/* Computes the NB_FEATURES feature vector for one frame of input `in`.
   Side outputs: iexc/pred/pcm (from frame_analysis()), the frame spectrum X,
   the pitch-delayed spectrum P, and the band quantities Ex (signal energy),
   Ep (pitch signal energy) and Exp (normalised correlation between the two).
   features layout: [0, NB_BANDS) cepstrum of the log band energies,
   [NB_BANDS, 2*NB_BANDS) DCT of the band pitch correlation, then pitch index,
   pitch gain, log10 excitation gain and LPC_ORDER LPC coefficients.
   Returns non-zero only in TRAINING builds when the summed band energy is
   below 0.1. */
static int compute_frame_features(DenoiseState *st, signed char *iexc, short *pred, short *pcm, kiss_fft_cpx *X, kiss_fft_cpx *P,
                                  float *Ex, float *Ep, float *Exp, float *features, const float *in) {
   int b;
   int n;
   float total_energy = 0;
   float log_bands[NB_BANDS];
   float corr_ceps[NB_BANDS];
   float lpc[LPC_ORDER];
   float pitch_win[WINDOW_SIZE];
   float pitch_work[PITCH_BUF_SIZE];
   const float *delayed;
   int pitch_index;
   float gain;
   float follow, logMax;
   float exc_gain;

   exc_gain = frame_analysis(st, iexc, pred, pcm, lpc, X, Ex, in);

   /* Slide the pitch history by one frame and append the new samples. */
   RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
   RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);

   /* The search works on a scratch copy that pitch_downsample() modifies. */
   RNN_COPY(pitch_work, &st->pitch_buf[0], PITCH_BUF_SIZE);
   pitch_downsample(pitch_work, PITCH_BUF_SIZE);
   pitch_search(pitch_work+PITCH_MAX_PERIOD, pitch_work, PITCH_FRAME_SIZE<<1,
                (PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD)<<1, &pitch_index);
   pitch_index = 2*PITCH_MAX_PERIOD-pitch_index;
   gain = remove_doubling(pitch_work, 2*PITCH_MAX_PERIOD, 2*PITCH_MIN_PERIOD,
                          2*PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
   st->last_period = pitch_index;
   st->last_gain = gain;

   /* Window of the history delayed by half the pitch index. */
   delayed = &st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index/2];
   for (n=0;n<WINDOW_SIZE;n++) {
      pitch_win[n] = delayed[n];
   }
   apply_window(pitch_win);
   forward_transform(P, pitch_win);
   compute_band_energy(Ep, P);
   compute_band_corr(Exp, X, P);
   for (b=0;b<NB_BANDS;b++) {
      Exp[b] = Exp[b]/sqrt(.001+Ex[b]*Ep[b]);
   }

   dct(corr_ceps, Exp);
   for (b=0;b<NB_BANDS;b++) {
      features[NB_BANDS+b] = corr_ceps[b];
   }
   features[NB_BANDS] -= 1.3;
   features[NB_BANDS+1] -= 0.9;

   /* Log band energies, floored relative to the running maximum and to a
      value that decays by 2.5 per band. */
   logMax = -2;
   follow = -2;
   for (b=0;b<NB_BANDS;b++) {
      float lg = log10(1e-2+Ex[b]);
      lg = MAX16(logMax-8, MAX16(follow-2.5, lg));
      log_bands[b] = lg;
      logMax = MAX16(logMax, lg);
      follow = MAX16(follow-2.5, lg);
      total_energy += Ex[b];
   }
   dct(features, log_bands);
   features[0] -= 4;

   /* The LPC written to the features is re-derived from the cepstrum,
      replacing the one frame_analysis() produced. */
   lpc_from_cepstrum(lpc, features);

   features[2*NB_BANDS] = .01*(pitch_index-200);
   features[2*NB_BANDS+1] = gain;
   features[2*NB_BANDS+2] = log10(exc_gain);
   for (n=0;n<LPC_ORDER;n++) {
      features[2*NB_BANDS+3+n] = lpc[n];
   }
   return TRAINING && total_energy < 0.1;
}
|
|
|
|
/* Overlap-add synthesis: inverse-transforms spectrum y, windows the result,
   adds the first half to the tail saved from the previous frame to produce
   FRAME_SIZE output samples, and saves the second half for the next call. */
static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) {
   float block[WINDOW_SIZE];
   int n;
   inverse_transform(block, y);
   apply_window(block);
   for (n=0;n<FRAME_SIZE;n++) {
      out[n] = block[n] + st->synthesis_mem[n];
   }
   RNN_COPY(st->synthesis_mem, &block[FRAME_SIZE], FRAME_SIZE);
}
|
|
|
|
/* Second-order IIR filter with a unit leading numerator coefficient.
   For each of the N samples: the output is the input plus mem[0]; the two
   memory cells are then updated from b[0..1], a[0..1], the input and the
   output, with the products evaluated in double precision.
   y may be the same buffer as x (each input is read before its output slot is
   written).  mem carries the filter state between calls. */
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
   int n;
   for (n=0;n<N;n++) {
      const float sample_in = x[n];
      const float sample_out = sample_in + mem[0];
      mem[0] = mem[1] + (b[0]*(double)sample_in - a[0]*(double)sample_out);
      mem[1] = (b[1]*(double)sample_in - a[1]*(double)sample_out);
      y[n] = sample_out;
   }
}
|
|
|
|
/* First-order pre-emphasis: y[n] = x[n] - coef*x[n-1], where the previous
   term is kept (already negated and scaled) in *mem between calls.
   y may be the same buffer as x. */
static void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
   int n;
   for (n=0;n<N;n++) {
      const float cur = x[n];
      const float emphasised = cur + *mem;
      *mem = -coef*cur;
      y[n] = emphasised;
   }
}
|
|
|
|
/* Adds a scaled copy of the pitch spectrum P to X, then rescales X so that
   each band keeps its original energy Ex.
   The per-band mixing amount is 1 when the pitch correlation Exp exceeds the
   band gain g; otherwise it is derived from the squared correlation and
   squared gain, clamped to [0, 1] and square-rooted.  It is then scaled by
   sqrt(Ex/Ep).
   X and P hold FREQ_SIZE bins; Ex, Ep, Exp and g hold NB_BANDS values. */
void pitch_filter(kiss_fft_cpx *X, const kiss_fft_cpx *P, const float *Ex, const float *Ep,
                  const float *Exp, const float *g) {
   float r[NB_BANDS];
   float rf[FREQ_SIZE] = {0};
   float newE[NB_BANDS];
   float norm[NB_BANDS];
   float normf[FREQ_SIZE]={0};
   int b;
   int k;

   for (b=0;b<NB_BANDS;b++) {
      float mix;
      if (Exp[b]>g[b]) {
         mix = 1;
      } else {
         mix = SQUARE(Exp[b])*(1-SQUARE(g[b]))/(.001 + SQUARE(g[b])*(1-SQUARE(Exp[b])));
      }
      mix = sqrt(MIN16(1, MAX16(0, mix)));
      mix *= sqrt(Ex[b]/(1e-8+Ep[b]));
      r[b] = mix;
   }
   interp_band_gain(rf, r);
   for (k=0;k<FREQ_SIZE;k++) {
      X[k].r += rf[k]*P[k].r;
      X[k].i += rf[k]*P[k].i;
   }

   /* Renormalise so the mixed spectrum has the band energies it started with. */
   compute_band_energy(newE, X);
   for (b=0;b<NB_BANDS;b++) {
      norm[b] = sqrt(Ex[b]/(1e-8+newE[b]));
   }
   interp_band_gain(normf, norm);
   for (k=0;k<FREQ_SIZE;k++) {
      X[k].r *= normf[k];
      X[k].i *= normf[k];
   }
}
|
|
|
|
/* Processes one frame of FRAME_SIZE samples from `in` into `out`.
   The input is high-pass filtered, analysed with compute_frame_features(),
   pitch-filtered, scaled per band and resynthesised by overlap-add.
   The original body had the feature-extraction call commented out, so the
   spectra, band energies and band gains were read uninitialised.  The call is
   restored here with scratch buffers for the outputs this function does not
   use.  No gain estimator is wired in within this file, so the band gains
   default to unity before the smoothing against st->lastg.
   Returns vad_prob, which nothing in this function changes from 0. */
float rnnoise_process_frame(DenoiseState *st, float *out, const float *in) {
   int i;
   kiss_fft_cpx X[FREQ_SIZE];
   kiss_fft_cpx P[WINDOW_SIZE];
   float x[FRAME_SIZE];
   float Ex[NB_BANDS], Ep[NB_BANDS];
   float Exp[NB_BANDS];
   float features[NB_FEATURES];
   float g[NB_BANDS];
   float gf[FREQ_SIZE]={1};
   /* Scratch outputs required by compute_frame_features(); unused here. */
   signed char iexc[FRAME_SIZE];
   short pred[FRAME_SIZE];
   short pcm[FRAME_SIZE];
   float vad_prob = 0;
   int silence;
   static const float a_hp[2] = {-1.99599, 0.99600};
   static const float b_hp[2] = {-2, 1};

   biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
   silence = compute_frame_features(st, iexc, pred, pcm, X, P, Ex, Ep, Exp, features, x);

   /* Unity gains: defined behaviour in the absence of an estimator. */
   for (i=0;i<NB_BANDS;i++) g[i] = 1;

   if (!silence) {
      pitch_filter(X, P, Ex, Ep, Exp, g);
      for (i=0;i<NB_BANDS;i++) {
         float alpha = .6f;
         /* Do not let a band gain fall faster than alpha per frame. */
         g[i] = MAX16(g[i], alpha*st->lastg[i]);
         st->lastg[i] = g[i];
      }
      interp_band_gain(gf, g);
      for (i=0;i<FREQ_SIZE;i++) {
         X[i].r *= gf[i];
         X[i].i *= gf[i];
      }
   }

   frame_synthesis(st, out, X);
   return vad_prob;
}
|
|
|
|
#if TRAINING
|
|
|
|
/* Returns a pseudo-random value in [-0.5, 0.5] drawn from rand(). */
static float uni_rand() {
   const double unit = rand()/(double)RAND_MAX;
   return unit-.5;
}
|
|
|
|
/* Draws random biquad coefficients: a[0], a[1], b[0], b[1] (in that order of
   random draws), each .75 times a value from uni_rand(). */
static void rand_resp(float *a, float *b) {
   int k;
   for (k=0;k<2;k++) {
      a[k] = .75*uni_rand();
   }
   for (k=0;k<2;k++) {
      b[k] = .75*uni_rand();
   }
}
|
|
|
|
int main(int argc, char **argv) {
|
|
int i;
|
|
int count=0;
|
|
static const float a_hp[2] = {-1.99599, 0.99600};
|
|
static const float b_hp[2] = {-2, 1};
|
|
float a_sig[2] = {0};
|
|
float b_sig[2] = {0};
|
|
float mem_hp_x[2]={0};
|
|
float mem_resp_x[2]={0};
|
|
float mem_preemph=0;
|
|
float x[FRAME_SIZE];
|
|
int gain_change_count=0;
|
|
FILE *f1;
|
|
FILE *ffeat;
|
|
FILE *fpcm;
|
|
signed char iexc[FRAME_SIZE];
|
|
short pred[FRAME_SIZE];
|
|
short pcm[FRAME_SIZE];
|
|
short tmp[FRAME_SIZE] = {0};
|
|
float savedX[FRAME_SIZE] = {0};
|
|
float speech_gain=1;
|
|
int last_silent = 1;
|
|
float old_speech_gain = 1;
|
|
int one_pass_completed = 0;
|
|
DenoiseState *st;
|
|
st = rnnoise_create();
|
|
if (argc!=4) {
|
|
fprintf(stderr, "usage: %s <speech> <features out>\n", argv[0]);
|
|
return 1;
|
|
}
|
|
f1 = fopen(argv[1], "r");
|
|
ffeat = fopen(argv[2], "w");
|
|
fpcm = fopen(argv[3], "w");
|
|
while (1) {
|
|
kiss_fft_cpx X[FREQ_SIZE], P[WINDOW_SIZE];
|
|
float Ex[NB_BANDS], Ep[NB_BANDS];
|
|
float Exp[NB_BANDS];
|
|
float Ln[NB_BANDS];
|
|
float features[NB_FEATURES];
|
|
float g[NB_BANDS];
|
|
float E=0;
|
|
int silent;
|
|
for (i=0;i<FRAME_SIZE;i++) x[i] = tmp[i];
|
|
fread(tmp, sizeof(short), FRAME_SIZE, f1);
|
|
if (feof(f1)) {
|
|
rewind(f1);
|
|
fread(tmp, sizeof(short), FRAME_SIZE, f1);
|
|
one_pass_completed = 1;
|
|
}
|
|
for (i=0;i<FRAME_SIZE;i++) E += tmp[i]*(float)tmp[i];
|
|
silent = E < 5000 || (last_silent && E < 20000);
|
|
if (!last_silent && silent) {
|
|
for (i=0;i<FRAME_SIZE;i++) savedX[i] = x[i];
|
|
}
|
|
if (last_silent && !silent) {
|
|
for (i=0;i<FRAME_SIZE;i++) {
|
|
float f = (float)i/FRAME_SIZE;
|
|
tmp[i] = (int)floor(.5 + f*tmp[i] + (1-f)*savedX[i]);
|
|
}
|
|
}
|
|
if (last_silent) {
|
|
last_silent = silent;
|
|
continue;
|
|
}
|
|
last_silent = silent;
|
|
if (count==5000000 && one_pass_completed) break;
|
|
if (++gain_change_count > 2821) {
|
|
speech_gain = pow(10., (-20+(rand()%40))/20.);
|
|
if (rand()%20==0) speech_gain *= .01;
|
|
if (rand()%100==0) speech_gain = 0;
|
|
gain_change_count = 0;
|
|
rand_resp(a_sig, b_sig);
|
|
}
|
|
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
|
|
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
|
|
preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
|
|
for (i=0;i<FRAME_SIZE;i++) {
|
|
float g;
|
|
float f = (float)i/FRAME_SIZE;
|
|
g = f*speech_gain + (1-f)*old_speech_gain;
|
|
x[i] *= g;
|
|
}
|
|
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
|
|
compute_frame_features(st, iexc, pred, pcm, X, P, Ex, Ep, Exp, features, x);
|
|
fwrite(features, sizeof(float), NB_FEATURES, ffeat);
|
|
fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm);
|
|
old_speech_gain = speech_gain;
|
|
count++;
|
|
}
|
|
//fprintf(stderr, "matrix size: %d x %d\n", count, NB_FEATURES + 2*NB_BANDS + 1);
|
|
fclose(f1);
|
|
fclose(ffeat);
|
|
fclose(fpcm);
|
|
return 0;
|
|
}
|
|
|
|
#endif
|