Using Burg cepstrum for feature prediction
This commit is contained in:
parent
b93dbfc0bc
commit
2e18f0d160
8 changed files with 50 additions and 15 deletions
|
@ -138,9 +138,18 @@ int main(int argc, char **argv) {
|
||||||
int encode = 0;
|
int encode = 0;
|
||||||
int decode = 0;
|
int decode = 0;
|
||||||
int quantize = 0;
|
int quantize = 0;
|
||||||
|
int burg = 0;
|
||||||
srand(getpid());
|
srand(getpid());
|
||||||
st = lpcnet_encoder_create();
|
st = lpcnet_encoder_create();
|
||||||
argv0=argv[0];
|
argv0=argv[0];
|
||||||
|
if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
|
||||||
|
burg = 1;
|
||||||
|
training = 1;
|
||||||
|
}
|
||||||
|
if (argc == 4 && strcmp(argv[1], "-btest")==0) {
|
||||||
|
burg = 1;
|
||||||
|
training = 0;
|
||||||
|
}
|
||||||
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
|
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
|
||||||
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
|
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
|
||||||
training = 1;
|
training = 1;
|
||||||
|
@ -236,7 +245,8 @@ int main(int argc, char **argv) {
|
||||||
if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;
|
if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;
|
||||||
if (training && ++gain_change_count > 2821) {
|
if (training && ++gain_change_count > 2821) {
|
||||||
float tmp, tmp2;
|
float tmp, tmp2;
|
||||||
speech_gain = pow(10., (-20+(rand()%40))/20.);
|
speech_gain = pow(10., (-30+(rand()%40))/20.);
|
||||||
|
if (rand()&1) speech_gain = -speech_gain;
|
||||||
if (rand()%20==0) speech_gain *= .01;
|
if (rand()%20==0) speech_gain *= .01;
|
||||||
if (rand()%100==0) speech_gain = 0;
|
if (rand()%100==0) speech_gain = 0;
|
||||||
gain_change_count = 0;
|
gain_change_count = 0;
|
||||||
|
@ -247,13 +257,18 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
|
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
|
||||||
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
|
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
|
||||||
preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
|
|
||||||
for (i=0;i<FRAME_SIZE;i++) {
|
for (i=0;i<FRAME_SIZE;i++) {
|
||||||
float g;
|
float g;
|
||||||
float f = (float)i/FRAME_SIZE;
|
float f = (float)i/FRAME_SIZE;
|
||||||
g = f*speech_gain + (1-f)*old_speech_gain;
|
g = f*speech_gain + (1-f)*old_speech_gain;
|
||||||
x[i] *= g;
|
x[i] *= g;
|
||||||
}
|
}
|
||||||
|
if (burg) {
|
||||||
|
float ceps[2*NB_BANDS];
|
||||||
|
burg_cepstral_analysis(ceps, x);
|
||||||
|
fwrite(ceps, sizeof(float), 2*NB_BANDS, ffeat);
|
||||||
|
}
|
||||||
|
preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
|
||||||
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
|
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
|
||||||
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
|
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
|
||||||
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
|
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
|
||||||
|
|
15
dnn/freq.c
15
dnn/freq.c
|
@ -155,7 +155,7 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int order) {
|
void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order) {
|
||||||
int i;
|
int i;
|
||||||
float burg_in[FRAME_SIZE];
|
float burg_in[FRAME_SIZE];
|
||||||
float burg_lpc[LPC_ORDER];
|
float burg_lpc[LPC_ORDER];
|
||||||
|
@ -190,6 +190,19 @@ void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int
|
||||||
burg_cepstrum[0] += - 4;
|
burg_cepstrum[0] += - 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void burg_cepstral_analysis(float *ceps, const float *x) {
|
||||||
|
int i;
|
||||||
|
compute_burg_cepstrum(x, &ceps[0 ], FRAME_SIZE/2, LPC_ORDER);
|
||||||
|
compute_burg_cepstrum(&x[FRAME_SIZE/2], &ceps[NB_BANDS], FRAME_SIZE/2, LPC_ORDER);
|
||||||
|
for (i=0;i<NB_BANDS;i++) {
|
||||||
|
float c0, c1;
|
||||||
|
c0 = ceps[i];
|
||||||
|
c1 = ceps[NB_BANDS+i];
|
||||||
|
ceps[i ] = .5*(c0+c1);
|
||||||
|
ceps[NB_BANDS+i] = (c0-c1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) {
|
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) {
|
||||||
int i;
|
int i;
|
||||||
float sum[NB_BANDS] = {0};
|
float sum[NB_BANDS] = {0};
|
||||||
|
|
|
@ -47,7 +47,8 @@
|
||||||
|
|
||||||
void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
|
void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
|
||||||
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);
|
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);
|
||||||
void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int order);
|
void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order);
|
||||||
|
void burg_cepstral_analysis(float *ceps, const float *x);
|
||||||
|
|
||||||
void apply_window(float *x);
|
void apply_window(float *x);
|
||||||
void dct(float *out, const float *in);
|
void dct(float *out, const float *in);
|
||||||
|
|
|
@ -75,7 +75,9 @@ LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) {
|
||||||
float x[FRAME_SIZE];
|
float x[FRAME_SIZE];
|
||||||
short output[FRAME_SIZE];
|
short output[FRAME_SIZE];
|
||||||
#if PLC_DNN_PRED
|
#if PLC_DNN_PRED
|
||||||
float plc_features[NB_FEATURES+1];
|
float plc_features[2*NB_BANDS+NB_FEATURES+1];
|
||||||
|
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
|
||||||
|
burg_cepstral_analysis(plc_features, x);
|
||||||
#endif
|
#endif
|
||||||
st->enc.pcount = 0;
|
st->enc.pcount = 0;
|
||||||
if (st->skip_analysis) {
|
if (st->skip_analysis) {
|
||||||
|
@ -105,8 +107,8 @@ LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) {
|
||||||
process_single_frame(&st->enc, NULL);
|
process_single_frame(&st->enc, NULL);
|
||||||
#if PLC_DNN_PRED
|
#if PLC_DNN_PRED
|
||||||
if (st->skip_analysis <= 1) {
|
if (st->skip_analysis <= 1) {
|
||||||
RNN_COPY(plc_features, st->enc.features[0], NB_FEATURES);
|
RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
|
||||||
plc_features[NB_FEATURES] = 1;
|
plc_features[2*NB_BANDS+NB_FEATURES] = 1;
|
||||||
compute_plc_pred(&st->plc_net, st->features, plc_features);
|
compute_plc_pred(&st->plc_net, st->features, plc_features);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
@ -142,7 +144,7 @@ LPCNET_EXPORT int lpcnet_plc_conceal(LPCNetPLCState *st, short *pcm) {
|
||||||
int i;
|
int i;
|
||||||
#endif
|
#endif
|
||||||
short output[FRAME_SIZE];
|
short output[FRAME_SIZE];
|
||||||
float zeros[NB_FEATURES+1] = {0};
|
float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
|
||||||
st->enc.pcount = 0;
|
st->enc.pcount = 0;
|
||||||
/* If we concealed the previous frame, finish synthesizing the rest of the samples. */
|
/* If we concealed the previous frame, finish synthesizing the rest of the samples. */
|
||||||
/* FIXME: Copy/predict features. */
|
/* FIXME: Copy/predict features. */
|
||||||
|
|
|
@ -64,6 +64,7 @@ struct LPCNetEncState{
|
||||||
float features[4][NB_TOTAL_FEATURES];
|
float features[4][NB_TOTAL_FEATURES];
|
||||||
float sig_mem[LPC_ORDER];
|
float sig_mem[LPC_ORDER];
|
||||||
int exc_mem;
|
int exc_mem;
|
||||||
|
float burg_cepstrum[2*NB_BANDS];
|
||||||
};
|
};
|
||||||
|
|
||||||
#define PLC_BUF_SIZE (FEATURES_DELAY*FRAME_SIZE + TRAINING_OFFSET)
|
#define PLC_BUF_SIZE (FEATURES_DELAY*FRAME_SIZE + TRAINING_OFFSET)
|
||||||
|
|
|
@ -62,8 +62,8 @@ class WeightClip(Constraint):
|
||||||
|
|
||||||
constraint = WeightClip(0.992)
|
constraint = WeightClip(0.992)
|
||||||
|
|
||||||
def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128):
|
def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128):
|
||||||
feat = Input(shape=(None, nb_used_features), batch_size=batch_size)
|
feat = Input(shape=(None, nb_used_features+nb_burg_features), batch_size=batch_size)
|
||||||
lost = Input(shape=(None, 1), batch_size=batch_size)
|
lost = Input(shape=(None, 1), batch_size=batch_size)
|
||||||
|
|
||||||
fdense1 = Dense(cond_size, activation='tanh', name='plc_dense1')
|
fdense1 = Dense(cond_size, activation='tanh', name='plc_dense1')
|
||||||
|
@ -96,5 +96,6 @@ def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, batch_size=128, tra
|
||||||
model.rnn_units = rnn_units
|
model.rnn_units = rnn_units
|
||||||
model.cond_size = cond_size
|
model.cond_size = cond_size
|
||||||
model.nb_used_features = nb_used_features
|
model.nb_used_features = nb_used_features
|
||||||
|
model.nb_burg_features = nb_burg_features
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
|
|
@ -29,12 +29,13 @@ import numpy as np
|
||||||
from tensorflow.keras.utils import Sequence
|
from tensorflow.keras.utils import Sequence
|
||||||
|
|
||||||
class PLCLoader(Sequence):
|
class PLCLoader(Sequence):
|
||||||
def __init__(self, features, lost, batch_size):
|
def __init__(self, features, lost, nb_burg_features, batch_size):
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.nb_batches = features.shape[0]//self.batch_size
|
self.nb_batches = features.shape[0]//self.batch_size
|
||||||
self.features = features[:self.nb_batches*self.batch_size, :, :]
|
self.features = features[:self.nb_batches*self.batch_size, :, :]
|
||||||
self.lost = lost.astype('float')
|
self.lost = lost.astype('float')
|
||||||
self.lost = self.lost[:(len(self.lost)//features.shape[1]-1)*features.shape[1]]
|
self.lost = self.lost[:(len(self.lost)//features.shape[1]-1)*features.shape[1]]
|
||||||
|
self.nb_burg_features = nb_burg_features
|
||||||
self.on_epoch_end()
|
self.on_epoch_end()
|
||||||
|
|
||||||
def on_epoch_end(self):
|
def on_epoch_end(self):
|
||||||
|
@ -51,7 +52,7 @@ class PLCLoader(Sequence):
|
||||||
lost = np.reshape(lost, (features.shape[0], features.shape[1], 1))
|
lost = np.reshape(lost, (features.shape[0], features.shape[1], 1))
|
||||||
lost_mask = np.tile(lost, (1,1,features.shape[2]))
|
lost_mask = np.tile(lost, (1,1,features.shape[2]))
|
||||||
|
|
||||||
out_features = np.concatenate([features, 1.-lost], axis=-1)
|
out_features = np.concatenate([features[:,:,self.nb_burg_features:], 1.-lost], axis=-1)
|
||||||
inputs = [features*lost_mask, lost]
|
inputs = [features*lost_mask, lost]
|
||||||
outputs = [out_features]
|
outputs = [out_features]
|
||||||
return (inputs, outputs)
|
return (inputs, outputs)
|
||||||
|
|
|
@ -140,8 +140,9 @@ with strategy.scope():
|
||||||
lpc_order = 16
|
lpc_order = 16
|
||||||
|
|
||||||
feature_file = args.features
|
feature_file = args.features
|
||||||
nb_features = model.nb_used_features + lpc_order
|
nb_features = model.nb_used_features + lpc_order + model.nb_burg_features
|
||||||
nb_used_features = model.nb_used_features
|
nb_used_features = model.nb_used_features
|
||||||
|
nb_burg_features = model.nb_burg_features
|
||||||
sequence_size = args.seq_length
|
sequence_size = args.seq_length
|
||||||
|
|
||||||
# u for unquantised, load 16 bit PCM samples and convert to mu-law
|
# u for unquantised, load 16 bit PCM samples and convert to mu-law
|
||||||
|
@ -153,7 +154,7 @@ features = features[:nb_sequences*sequence_size*nb_features]
|
||||||
|
|
||||||
features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
|
features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
|
||||||
|
|
||||||
features = features[:, :, :nb_used_features]
|
features = features[:, :, :nb_used_features+model.nb_burg_features]
|
||||||
|
|
||||||
lost = np.memmap(args.lost_file, dtype='int8', mode='r')
|
lost = np.memmap(args.lost_file, dtype='int8', mode='r')
|
||||||
|
|
||||||
|
@ -169,7 +170,7 @@ if quantize or retrain:
|
||||||
|
|
||||||
model.save_weights('{}_{}_initial.h5'.format(args.output, args.gru_size))
|
model.save_weights('{}_{}_initial.h5'.format(args.output, args.gru_size))
|
||||||
|
|
||||||
loader = PLCLoader(features, lost, batch_size)
|
loader = PLCLoader(features, lost, nb_burg_features, batch_size)
|
||||||
|
|
||||||
callbacks = [checkpoint]
|
callbacks = [checkpoint]
|
||||||
if args.logdir is not None:
|
if args.logdir is not None:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue