Makes analysis run at 24 kHz, with 20-ms frames

The change also makes the analysis run for sampling rates of 16 kHz and 24 kHz
since the features are only computed on the 0-8 kHz band. The longer time
window (20 ms instead of 10 ms) makes the tonality estimator more reliable
for low-pitch harmonics.
This commit is contained in:
Jean-Marc Valin 2016-12-16 17:52:15 -05:00
parent 159bb6df00
commit cf9409fe51
No known key found for this signature in database
GPG key ID: 5E5DD9A36F9189C8
7 changed files with 408 additions and 210 deletions

View file

@ -101,6 +101,7 @@ static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)
typedef opus_int16 opus_val16; typedef opus_int16 opus_val16;
typedef opus_int32 opus_val32; typedef opus_int32 opus_val32;
typedef opus_int64 opus_val64;
typedef opus_val32 celt_sig; typedef opus_val32 celt_sig;
typedef opus_val16 celt_norm; typedef opus_val16 celt_norm;
@ -158,6 +159,7 @@ static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
typedef float opus_val16; typedef float opus_val16;
typedef float opus_val32; typedef float opus_val32;
typedef float opus_val64;
typedef float celt_sig; typedef float celt_sig;
typedef float celt_norm; typedef float celt_norm;

View file

@ -57,6 +57,7 @@ typedef struct {
float noisiness; float noisiness;
float activity; float activity;
float music_prob; float music_prob;
float vad_prob;
int bandwidth; int bandwidth;
float activity_probability; float activity_probability;
} AnalysisInfo; } AnalysisInfo;

View file

@ -42,6 +42,7 @@
#include "analysis.h" #include "analysis.h"
#include "mlp.h" #include "mlp.h"
#include "stack_alloc.h" #include "stack_alloc.h"
#include "float_cast.h"
#ifndef M_PI #ifndef M_PI
#define M_PI 3.141592653 #define M_PI 3.141592653
@ -100,24 +101,118 @@ static const float analysis_window[240] = {
}; };
static const int tbands[NB_TBANDS+1] = { static const int tbands[NB_TBANDS+1] = {
2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
}; };
static const int extra_bands[NB_TOT_BANDS+1] = {
1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
};
/*static const float tweight[NB_TBANDS+1] = {
.3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
};*/
#define NB_TONAL_SKIP_BANDS 9 #define NB_TONAL_SKIP_BANDS 9
/* Decimate-by-two filter that also reports the energy of a second,
   complementary branch (a high-pass branch, judging by the "_hp" names).
   For every pair of input samples:
     - the even sample goes through a first-order all-pass section (state S[0]);
     - the odd sample goes through a second all-pass section (state S[1]) and
       is added to the even branch to form the decimated output;
     - the negated odd sample goes through a third all-pass section (state
       S[2], same coefficient as the second) and is added to the even branch
       to form out32_hp, whose square is accumulated.
   Writes inLen/2 samples to out. Returns the accumulated energy of out32_hp,
   converted from opus_val64 to opus_val32 by the return statement; in
   FIXED_POINT builds it is first shifted right by 2*SIG_SHIFT + 8. */
static opus_val32 silk_resampler_down2_hp(
opus_val32 *S, /* I/O State vector [ 3 ] (S[0], S[1] and S[2] are all accessed) */
opus_val32 *out, /* O Output signal [ floor(len/2) ] */
const opus_val32 *in, /* I Input signal [ len ] */
int inLen /* I Number of input samples */
)
{
int k, len2 = inLen/2;
opus_val32 in32, out32, out32_hp, Y, X;
opus_val64 hp_ener = 0;
/* No format conversion is applied in this loop: the state and all
   intermediates keep the scaling of the input samples. */
for( k = 0; k < len2; k++ ) {
/* Even input sample */
in32 = in[ 2 * k ];
void tonality_analysis_init(TonalityAnalysisState *tonal) /* All-pass section for even input sample */
Y = SUB32( in32, S[ 0 ] );
X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
out32 = ADD32( S[ 0 ], X );
S[ 0 ] = ADD32( in32, X );
/* Both output branches start from the even-sample all-pass output */
out32_hp = out32;
/* Odd input sample */
in32 = in[ 2 * k + 1 ];
/* All-pass section for odd input sample, and add to output of previous section */
Y = SUB32( in32, S[ 1 ] );
X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
out32 = ADD32( out32, S[ 1 ] );
out32 = ADD32( out32, X );
S[ 1 ] = ADD32( in32, X );
/* Same all-pass coefficient applied to the negated odd sample, using its own
   state S[ 2 ], and added to the second branch */
Y = SUB32( -in32, S[ 2 ] );
X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
out32_hp = ADD32( out32_hp, S[ 2 ] );
out32_hp = ADD32( out32_hp, X );
S[ 2 ] = ADD32( -in32, X );
/* Accumulate the squared second-branch sample in the wider opus_val64 type */
hp_ener += out32_hp*(opus_val64)out32_hp;
/* Store HALF32() of the summed all-pass branches; out is opus_val32, so no
   int16 conversion takes place (HALF32 presumably halves its argument -
   defined outside this view) */
out[ k ] = HALF32(out32);
}
#ifdef FIXED_POINT
/* len2 can be up to 480, so we shift by 8 more to make it fit. */
hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
#endif
return hp_ener;
}
/* Downmixes a span of the input through the `downmix` callback, scales it,
   and writes it to `y`, resampling according to Fs.
     downmix      callback invoked as downmix(_x, tmp, subframe, offset, c1, c2, C)
     _x           opaque input passed straight to the callback
     y            destination buffer
     S            three-element filter state handed to silk_resampler_down2_hp()
     subframe     number of samples requested; rescaled below to input-rate
                  units for Fs of 48000 and 16000 (the caller's comment says its
                  lengths are at 24 kHz)
     offset       start position, rescaled the same way as subframe
     c1, c2, C    forwarded to the callback; c2 and C also select the scaling
     Fs           input sampling rate; 48000, 24000 and 16000 are handled, any
                  other value leaves y unwritten
   Returns the value of silk_resampler_down2_hp() when Fs is 48000 and 0 in
   every other case (including subframe == 0). */
static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
{
VARDECL(opus_val32, tmp);
opus_val32 scale;
int j;
opus_val32 ret = 0;
SAVE_STACK;
/* NOTE(review): this early return skips RESTORE_STACK; nothing has been
   ALLOC'd at this point - confirm that is fine for every stack_alloc.h mode. */
if (subframe==0) return 0;
/* Convert the sample count and offset to the input sampling rate */
if (Fs == 48000)
{
subframe *= 2;
offset *= 2;
} else if (Fs == 16000) {
subframe = subframe*2/3;
offset = offset*2/3;
}
ALLOC(tmp, subframe, opus_val32);
downmix(_x, tmp, subframe, offset, c1, c2, C);
#ifdef FIXED_POINT
scale = (1<<SIG_SHIFT);
#else
scale = 1.f/32768;
#endif
/* Divide the gain by C when c2 is -2, or by 2 when c2 is a non-negative value
   (presumably "all channels" and "two channels mixed" respectively - verify
   against the downmix callbacks) */
if (c2==-2)
scale /= C;
else if (c2>-1)
scale /= 2;
for (j=0;j<subframe;j++)
tmp[j] *= scale;
if (Fs == 48000)
{
/* Halve the rate; only this path contributes to the returned energy */
ret = silk_resampler_down2_hp(S, y, tmp, subframe);
} else if (Fs == 24000) {
/* No rate change: copy the scaled samples as they are */
OPUS_COPY(y, tmp, subframe);
} else if (Fs == 16000) {
VARDECL(opus_val32, tmp3x);
ALLOC(tmp3x, 3*subframe, opus_val32);
/* Don't do this at home! This resampler is horrible and it's only (barely)
usable for the purpose of the analysis because we don't care about all
the aliasing between 8 kHz and 12 kHz. */
/* Repeat every sample three times, then halve the rate (3/2 overall) */
for (j=0;j<subframe;j++)
{
tmp3x[3*j] = tmp[j];
tmp3x[3*j+1] = tmp[j];
tmp3x[3*j+2] = tmp[j];
}
/* The energy returned by the filter is discarded here, so ret stays 0 */
silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
}
RESTORE_STACK;
return ret;
}
void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
{ {
/* Initialize reusable fields. */ /* Initialize reusable fields. */
tonal->arch = opus_select_arch(); tonal->arch = opus_select_arch();
tonal->Fs = Fs;
/* Clear remaining fields. */ /* Clear remaining fields. */
tonality_analysis_reset(tonal); tonality_analysis_reset(tonal);
} }
@ -141,7 +236,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
if (curr_lookahead<0) if (curr_lookahead<0)
curr_lookahead += DETECT_SIZE; curr_lookahead += DETECT_SIZE;
if (len > 480 && pos != tonal->write_pos) /* On long frames, look at the second analysis window rather than the first. */
if (len > tonal->Fs/50 && pos != tonal->write_pos)
{ {
pos++; pos++;
if (pos==DETECT_SIZE) if (pos==DETECT_SIZE)
@ -152,18 +248,27 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
if (pos<0) if (pos<0)
pos = DETECT_SIZE-1; pos = DETECT_SIZE-1;
OPUS_COPY(info_out, &tonal->info[pos], 1); OPUS_COPY(info_out, &tonal->info[pos], 1);
tonal->read_subframe += len/120; /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
while (tonal->read_subframe>=4) for (i=0;i<3;i++)
{ {
tonal->read_subframe -= 4; pos++;
if (pos==DETECT_SIZE)
pos = 0;
if (pos == tonal->write_pos)
break;
info_out->tonality = MAX32(0, -.03 + MAX32(info_out->tonality, tonal->info[pos].tonality-.05));
}
tonal->read_subframe += len/(tonal->Fs/400);
while (tonal->read_subframe>=8)
{
tonal->read_subframe -= 8;
tonal->read_pos++; tonal->read_pos++;
} }
if (tonal->read_pos>=DETECT_SIZE) if (tonal->read_pos>=DETECT_SIZE)
tonal->read_pos-=DETECT_SIZE; tonal->read_pos-=DETECT_SIZE;
/* Compensate for the delay in the features themselves. /* The -1 is to compensate for the delay in the features themselves. */
FIXME: Need a better estimate the 10 I just made up */ curr_lookahead = IMAX(curr_lookahead-1, 0);
curr_lookahead = IMAX(curr_lookahead-10, 0);
psum=0; psum=0;
/* Summing the probability of transition patterns that involve music at /* Summing the probability of transition patterns that involve music at
@ -173,7 +278,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
for (;i<DETECT_SIZE;i++) for (;i<DETECT_SIZE;i++)
psum += tonal->pspeech[i]; psum += tonal->pspeech[i];
psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
/*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
info_out->music_prob = psum; info_out->music_prob = psum;
} }
@ -216,19 +321,33 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
float noise_floor; float noise_floor;
int remaining; int remaining;
AnalysisInfo *info; AnalysisInfo *info;
float hp_ener;
float tonality2[240];
float midE[8];
float spec_variability=0;
SAVE_STACK; SAVE_STACK;
tonal->last_transition++; alpha = 1.f/IMIN(10, 1+tonal->count);
alpha = 1.f/IMIN(20, 1+tonal->count); alphaE = 1.f/IMIN(25, 1+tonal->count);
alphaE = 1.f/IMIN(50, 1+tonal->count); alphaE2 = 1.f/IMIN(500, 1+tonal->count);
alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
if (tonal->Fs == 48000)
{
/* len and offset are now at 24 kHz. */
len/= 2;
offset /= 2;
} else if (tonal->Fs == 16000) {
len = 3*len/2;
offset = 3*offset/2;
}
if (tonal->count<4) if (tonal->count<4)
tonal->music_prob = .5; tonal->music_prob = .5;
kfft = celt_mode->mdct.kfft[0]; kfft = celt_mode->mdct.kfft[0];
if (tonal->count==0) if (tonal->count==0)
tonal->mem_fill = 240; tonal->mem_fill = 240;
downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C); tonal->hp_ener_accum += downmix_and_resample(downmix, x, &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
{ {
tonal->mem_fill += len; tonal->mem_fill += len;
@ -236,6 +355,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
RESTORE_STACK; RESTORE_STACK;
return; return;
} }
hp_ener = tonal->hp_ener_accum;
info = &tonal->info[tonal->write_pos++]; info = &tonal->info[tonal->write_pos++];
if (tonal->write_pos>=DETECT_SIZE) if (tonal->write_pos>=DETECT_SIZE)
tonal->write_pos-=DETECT_SIZE; tonal->write_pos-=DETECT_SIZE;
@ -254,7 +374,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
} }
OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C); tonal->hp_ener_accum = downmix_and_resample(downmix, x, &tonal->inmem[240], tonal->downmix_state,
remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
tonal->mem_fill = 240 + remaining; tonal->mem_fill = 240 + remaining;
opus_fft(kfft, in, out, tonal->arch); opus_fft(kfft, in, out, tonal->arch);
#ifndef FIXED_POINT #ifndef FIXED_POINT
@ -286,24 +407,31 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
d_angle2 = angle2 - angle; d_angle2 = angle2 - angle;
d2_angle2 = d_angle2 - d_angle; d2_angle2 = d_angle2 - d_angle;
mod1 = d2_angle - (float)floor(.5+d2_angle); mod1 = d2_angle - (float)float2int(d2_angle);
noisiness[i] = ABS16(mod1); noisiness[i] = ABS16(mod1);
mod1 *= mod1; mod1 *= mod1;
mod1 *= mod1; mod1 *= mod1;
mod2 = d2_angle2 - (float)floor(.5+d2_angle2); mod2 = d2_angle2 - (float)float2int(d2_angle2);
noisiness[i] += ABS16(mod2); noisiness[i] += ABS16(mod2);
mod2 *= mod2; mod2 *= mod2;
mod2 *= mod2; mod2 *= mod2;
avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); avg_mod = .25f*(d2A[i]+mod1+2*mod2);
/* This introduces an extra delay of 2 frames in the detection. */
tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
/* No delay on this detection, but it's less reliable. */
tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
A[i] = angle2; A[i] = angle2;
dA[i] = d_angle2; dA[i] = d_angle2;
d2A[i] = mod2; d2A[i] = mod2;
} }
for (i=2;i<N2-1;i++)
{
float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
tonality[i] = .9*MAX32(tonality[i], tt-.1);
}
frame_tonality = 0; frame_tonality = 0;
max_frame_tonality = 0; max_frame_tonality = 0;
/*tw_sum = 0;*/ /*tw_sum = 0;*/
@ -334,7 +462,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
binE *= 5.55e-17f; binE *= 5.55e-17f;
#endif #endif
E += binE; E += binE;
tE += binE*tonality[i]; tE += binE*MAX32(0, tonality[i]);
nE += binE*2.f*(.5f-noisiness[i]); nE += binE*2.f*(.5f-noisiness[i]);
} }
#ifndef FIXED_POINT #ifndef FIXED_POINT
@ -352,14 +480,26 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
frame_loudness += (float)sqrt(E+1e-10f); frame_loudness += (float)sqrt(E+1e-10f);
logE[b] = (float)log(E+1e-10f); logE[b] = (float)log(E+1e-10f);
tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); tonal->logE[tonal->E_count][b] = logE[b];
tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); if (tonal->count==0)
if (tonal->highE[b] < tonal->lowE[b]+1.f) tonal->highE[b] = tonal->lowE[b] = logE[b];
if (tonal->highE[b] > tonal->lowE[b] + 7.5)
{ {
tonal->highE[b]+=.5f; if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
tonal->lowE[b]-=.5f; tonal->highE[b] -= .01;
else
tonal->lowE[b] += .01;
} }
relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]); if (logE[b] > tonal->highE[b])
{
tonal->highE[b] = logE[b];
tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
} else if (logE[b] < tonal->lowE[b])
{
tonal->lowE[b] = logE[b];
tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
}
relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
L1=L2=0; L1=L2=0;
for (i=0;i<NB_FRAMES;i++) for (i=0;i<NB_FRAMES;i++)
@ -391,6 +531,26 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
tonal->prev_band_tonality[b] = band_tonality[b]; tonal->prev_band_tonality[b] = band_tonality[b];
} }
for (i=0;i<NB_FRAMES;i++)
{
int j;
float mindist = 1e15;
for (j=0;j<NB_FRAMES;j++)
{
int k;
float dist=0;
for (k=0;k<NB_TBANDS;k++)
{
float tmp;
tmp = tonal->logE[i][k] - tonal->logE[j][k];
dist += tmp*tmp;
}
if (j!=i)
mindist = MIN32(mindist, dist);
}
spec_variability += mindist;
}
spec_variability = sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
bandwidth_mask = 0; bandwidth_mask = 0;
bandwidth = 0; bandwidth = 0;
maxE = 0; maxE = 0;
@ -399,13 +559,13 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
noise_floor *= 1<<(15+SIG_SHIFT); noise_floor *= 1<<(15+SIG_SHIFT);
#endif #endif
noise_floor *= noise_floor; noise_floor *= noise_floor;
for (b=0;b<NB_TOT_BANDS;b++) for (b=0;b<NB_TBANDS;b++)
{ {
float E=0; float E=0;
int band_start, band_end; int band_start, band_end;
/* Keep a margin of 300 Hz for aliasing */ /* Keep a margin of 300 Hz for aliasing */
band_start = extra_bands[b]; band_start = tbands[b];
band_end = extra_bands[b+1]; band_end = tbands[b+1];
for (i=band_start;i<band_end;i++) for (i=band_start;i<band_end;i++)
{ {
float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
@ -422,14 +582,31 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
2) less than 90 dB below the peak band (maximal masking possible considering 2) less than 90 dB below the peak band (maximal masking possible considering
both the ATH and the loudness-dependent slope of the spreading function) both the ATH and the loudness-dependent slope of the spreading function)
3) above the PCM quantization noise floor 3) above the PCM quantization noise floor
We use b+1 because the first CELT band isn't included in tbands[]
*/ */
if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start)) if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
bandwidth = b; bandwidth = b+1;
}
/* Special case for the last two bands, for which we don't have spectrum but only
the energy above 12 kHz. */
{
float E = hp_ener*(1./(240*240));
#ifdef FIXED_POINT
/* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
E *= ((opus_int32)1 << 2*SIG_SHIFT)*256.f;
#endif
maxE = MAX32(maxE, E);
tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
E = MAX32(E, tonal->meanE[b]);
/* Use a simple follower with 13 dB/Bark slope for spreading function */
bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
bandwidth = 20;
} }
if (tonal->count<=2) if (tonal->count<=2)
bandwidth = 20; bandwidth = 20;
frame_loudness = 20*(float)log10(frame_loudness); frame_loudness = 20*(float)log10(frame_loudness);
tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
tonal->lowECount *= (1-alphaE); tonal->lowECount *= (1-alphaE);
if (frame_loudness < tonal->Etracker-30) if (frame_loudness < tonal->Etracker-30)
tonal->lowECount += alphaE; tonal->lowECount += alphaE;
@ -441,6 +618,13 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
sum += dct_table[i*16+b]*logE[b]; sum += dct_table[i*16+b]*logE[b];
BFCC[i] = sum; BFCC[i] = sum;
} }
for (i=0;i<8;i++)
{
float sum=0;
for (b=0;b<16;b++)
sum += dct_table[i*16+b]*.5*(tonal->highE[b]+tonal->lowE[b]);
midE[i] = sum;
}
frame_stationarity /= NB_TBANDS; frame_stationarity /= NB_TBANDS;
relativeE /= NB_TBANDS; relativeE /= NB_TBANDS;
@ -460,7 +644,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
info->tonality_slope = slope; info->tonality_slope = slope;
tonal->E_count = (tonal->E_count+1)%NB_FRAMES; tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
tonal->count++; tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
info->tonality = frame_tonality; info->tonality = frame_tonality;
for (i=0;i<4;i++) for (i=0;i<4;i++)
@ -479,6 +663,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
for (i=0;i<9;i++) for (i=0;i<9;i++)
tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
} }
for (i=0;i<4;i++)
features[i] = BFCC[i]-midE[i];
for (i=0;i<8;i++) for (i=0;i<8;i++)
{ {
@ -489,6 +675,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
} }
for (i=0;i<9;i++) for (i=0;i<9;i++)
features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
features[18] = spec_variability-.78;;
features[20] = info->tonality - 0.154723; features[20] = info->tonality - 0.154723;
features[21] = info->activity - 0.724643; features[21] = info->activity - 0.724643;
features[22] = frame_stationarity - 0.743717; features[22] = frame_stationarity - 0.743717;
@ -503,8 +690,6 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
/* Probability of active audio (as opposed to silence) */ /* Probability of active audio (as opposed to silence) */
frame_probs[1] = .5f*frame_probs[1]+.5f; frame_probs[1] = .5f*frame_probs[1]+.5f;
frame_probs[1] *= frame_probs[1]; frame_probs[1] *= frame_probs[1];
/* Consider that silence has a 50-50 probability. */
frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
/* Probability of speech or music vs noise */ /* Probability of speech or music vs noise */
info->activity_probability = frame_probs[1]; info->activity_probability = frame_probs[1];
@ -527,12 +712,32 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
float music0; float music0;
float p, q; float p, q;
/* More silence transitions for speech than for music. */
tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
/* p0 and p1 are the probabilities of speech and music at this frame
using only information from previous frame and applying the
state transition model */
p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
/* We apply the current probability with exponent beta to work around
the fact that the probability estimates aren't independent. */
p0 *= (float)pow(1-frame_probs[1], beta);
p1 *= (float)pow(frame_probs[1], beta);
/* Normalise the probabilities to get the Markov probability of active audio (vad_prob). */
tonal->vad_prob = p1/(p0+p1);
info->vad_prob = tonal->vad_prob;
/* Consider that silence has a 50-50 probability of being speech or music. */
frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
/* One transition every 3 minutes of active audio */ /* One transition every 3 minutes of active audio */
tau = .00005f*frame_probs[1]; tau = .0001f;
/* Adapt beta based on how "unexpected" the new prob is */ /* Adapt beta based on how "unexpected" the new prob is */
p = MAX16(.05f,MIN16(.95f,frame_probs[0])); p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
/* p0 and p1 are the probabilities of speech and music at this frame /* p0 and p1 are the probabilities of speech and music at this frame
using only information from previous frame and applying the using only information from previous frame and applying the
state transition model */ state transition model */
@ -546,6 +751,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
tonal->music_prob = p1/(p0+p1); tonal->music_prob = p1/(p0+p1);
info->music_prob = tonal->music_prob; info->music_prob = tonal->music_prob;
/*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
/* This chunk of code deals with delayed decision. */ /* This chunk of code deals with delayed decision. */
psum=1e-20f; psum=1e-20f;
/* Instantaneous probability of speech and music, with beta pre-applied. */ /* Instantaneous probability of speech and music, with beta pre-applied. */
@ -611,15 +817,15 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
tonal->speech_confidence = .1f; tonal->speech_confidence = .1f;
} }
} }
if (tonal->last_music != (tonal->music_prob>.5f))
tonal->last_transition=0;
tonal->last_music = tonal->music_prob>.5f; tonal->last_music = tonal->music_prob>.5f;
#else #else
info->music_prob = 0; info->music_prob = 0;
#endif #endif
/*for (i=0;i<25;i++) #ifdef MLP_TRAINING
for (i=0;i<25;i++)
printf("%f ", features[i]); printf("%f ", features[i]);
printf("\n");*/ printf("\n");
#endif
info->bandwidth = bandwidth; info->bandwidth = bandwidth;
/*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
@ -635,17 +841,18 @@ void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co
int offset; int offset;
int pcm_len; int pcm_len;
analysis_frame_size -= analysis_frame_size&1;
if (analysis_pcm != NULL) if (analysis_pcm != NULL)
{ {
/* Avoid overflow/wrap-around of the analysis buffer */ /* Avoid overflow/wrap-around of the analysis buffer */
analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
pcm_len = analysis_frame_size - analysis->analysis_offset; pcm_len = analysis_frame_size - analysis->analysis_offset;
offset = analysis->analysis_offset; offset = analysis->analysis_offset;
while (pcm_len>0) { while (pcm_len>0) {
tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
offset += 480; offset += Fs/50;
pcm_len -= 480; pcm_len -= Fs/50;
} }
analysis->analysis_offset = analysis_frame_size; analysis->analysis_offset = analysis_frame_size;

View file

@ -33,13 +33,19 @@
#define NB_FRAMES 8 #define NB_FRAMES 8
#define NB_TBANDS 18 #define NB_TBANDS 18
#define NB_TOT_BANDS 21 #define ANALYSIS_BUF_SIZE 720 /* 30 ms at 24 kHz */
#define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */
#define DETECT_SIZE 200 /* At that point we can stop counting frames because it no longer matters. */
#define ANALYSIS_COUNT_MAX 10000
#define DETECT_SIZE 100
/* Uncomment this to print the MLP features on stdout. */
/*#define MLP_TRAINING*/
typedef struct { typedef struct {
int arch; int arch;
opus_int32 Fs;
#define TONALITY_ANALYSIS_RESET_START angle #define TONALITY_ANALYSIS_RESET_START angle
float angle[240]; float angle[240];
float d_angle[240]; float d_angle[240];
@ -49,18 +55,19 @@ typedef struct {
float prev_band_tonality[NB_TBANDS]; float prev_band_tonality[NB_TBANDS];
float prev_tonality; float prev_tonality;
float E[NB_FRAMES][NB_TBANDS]; float E[NB_FRAMES][NB_TBANDS];
float logE[NB_FRAMES][NB_TBANDS];
float lowE[NB_TBANDS]; float lowE[NB_TBANDS];
float highE[NB_TBANDS]; float highE[NB_TBANDS];
float meanE[NB_TOT_BANDS]; float meanE[NB_TBANDS+1];
float mem[32]; float mem[32];
float cmean[8]; float cmean[8];
float std[9]; float std[9];
float music_prob; float music_prob;
float vad_prob;
float Etracker; float Etracker;
float lowECount; float lowECount;
int E_count; int E_count;
int last_music; int last_music;
int last_transition;
int count; int count;
int analysis_offset; int analysis_offset;
/** Probability of having speech for time i to DETECT_SIZE-1 (and music before). /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
@ -76,6 +83,8 @@ typedef struct {
int write_pos; int write_pos;
int read_pos; int read_pos;
int read_subframe; int read_subframe;
float hp_ener_accum;
opus_val32 downmix_state[3];
AnalysisInfo info[DETECT_SIZE]; AnalysisInfo info[DETECT_SIZE];
} TonalityAnalysisState; } TonalityAnalysisState;
@ -85,7 +94,7 @@ typedef struct {
* not be repeated every analysis step. No allocated memory is retained * not be repeated every analysis step. No allocated memory is retained
* by the state struct, so no cleanup call is required. * by the state struct, so no cleanup call is required.
*/ */
void tonality_analysis_init(TonalityAnalysisState *analysis); void tonality_analysis_init(TonalityAnalysisState *analysis, opus_int32 Fs);
/** Reset a TonalityAnalysisState struct. /** Reset a TonalityAnalysisState struct.
* *

View file

@ -4,104 +4,104 @@
#include "mlp.h" #include "mlp.h"
/* RMS error was 0.230027, seed was 1452289367 */ /* RMS error was 0.280492, seed was 1480478173 */
/* 0.009100 0.069938 (0.230027 0.230027) 1.24058e-07 5543 */ /* 0.005976 0.031821 (0.280494 0.280492) done */
static const float weights[450] = { static const float weights[450] = {
/* hidden layer */ /* hidden layer */
-1.20927f, -0.0275523f, 0.0304442f, -0.071791f, -0.0897356f, -0.514624f, 0.0234227f, -0.14329f, -0.0878216f, -0.00187827f,
0.100996f, -0.0492634f, 0.070213f, 0.0187071f, 0.0042668f, -0.0257443f, 0.108524f, 0.00333881f, 0.00585017f, -0.0246132f,
0.0644589f, -0.10967f, -0.119688f, -0.00888386f, 0.170952f, 0.142723f, -0.00436494f, 0.0101354f, -0.11124f, -0.0809367f,
0.174562f, -0.265435f, -0.0635892f, -0.284755f, -1.06453f, -0.0750772f, 0.0295524f, 0.00823944f, 0.150392f, 0.0320876f,
0.202855f, 2.31084f, -2.763f, -0.420894f, 0.698811f, -0.0710564f, -1.43818f, 0.652076f, 0.0650744f, -1.54821f,
6.46418f, 0.0662341f, 0.0758173f, 0.0511722f, 0.0426484f, 0.168949f, -1.92724f, 0.0517976f, -0.0670737f, -0.0690121f,
0.115711f, -0.263815f, -0.0113386f, -0.189737f, -0.0929912f, 0.00247528f, -0.0522024f, 0.0631368f, 0.0532776f, 0.047751f,
-0.287827f, 0.0925463f, 0.0286792f, -0.0199793f, -0.193071f, -0.011715f, 0.142374f, -0.0290885f, -0.279263f, -0.433499f,
0.258586f, 0.018504f, 0.116125f, 0.099269f, -0.00781962f, -0.0795174f, -0.380458f, -0.051263f, 0.218537f, -0.322478f,
-0.266017f, 0.283733f, 10.5488f, -0.658286f, 0.836758f, 1.06667f, -0.104607f, -4.70108f, 0.312037f, 0.277397f,
13.1168f, -5.02553f, -1.0969f, -0.0738116f, 0.0204736f, -2.71859f, 1.70037f, -0.141845f, 0.0115618f, 0.0629883f,
0.0110775f, -0.00198985f, 0.00426824f, 0.148998f, 0.0755275f, 0.0403871f, 0.0139428f, -0.00430733f, -0.0429038f, -0.0590318f,
0.112213f, -0.0518501f, 0.028398f, 0.0240943f, -0.0503666f, -0.0501526f, -0.0284802f, -0.0415686f, -0.0438999f, 0.0822666f,
-0.149506f, -0.133575f, -0.137328f, 0.116275f, 0.238077f, 0.197194f, 0.0363275f, -0.0584307f, 0.0752364f, -0.0799796f,
0.080265f, 0.0387349f, 0.09185f, 4.04867f, 3.2435f, -0.146275f, 0.161661f, -0.184585f, 0.145568f, 0.442823f,
-0.7155f, 8.14792f, -29.8969f, 1.1575f, -0.124794f, 1.61221f, 1.11162f, 2.62177f, -2.482f, -0.112599f,
0.0226943f, -0.0470538f, -0.0334476f, 0.0360859f, 0.0447789f, -0.110366f, -0.140794f, -0.181694f, 0.0648674f, 0.0842248f,
-0.00258532f, -0.0192054f, -0.113082f, 0.109513f, -0.0437787f, 0.0933993f, 0.150122f, 0.129171f, 0.176848f, 0.141758f,
0.0382349f, -0.00994462f, -0.155653f, 0.171922f, -0.222151f, -0.271822f, 0.235113f, 0.0668579f, -0.433957f, 0.113633f,
-0.523565f, -0.0454432f, -0.556888f, 0.761537f, -2.70075f, -0.169348f, -1.40091f, 0.62861f, -0.134236f, 0.402173f,
-0.883015f, 0.887168f, 0.746329f, -0.363477f, 0.360424f, 1.86373f, 1.53998f, -4.32084f, 0.735343f, 0.800214f,
0.034755f, -0.015404f, 0.00688472f, -0.00949269f, 0.0625642f, -0.00968415f, 0.0425904f, 0.0196811f, -0.018426f, -0.000343953f,
-0.050711f, 0.0370223f, 0.0149561f, 0.060385f, -0.0709806f, -0.00416389f, 0.00111558f, 0.0173069f, -0.00998596f, -0.025898f,
-0.036509f, 0.099007f, -0.0397276f, 0.285237f, 0.127836f, 0.00123764f, -0.00520373f, -0.0565033f, 0.0637394f, 0.0051213f,
-0.15154f, 0.265848f, -0.0832318f, 0.0520659f, 0.897805f, 0.0221361f, 0.00819962f, -0.0467061f, -0.0548258f, -0.00314063f,
0.439215f, -3.00803f, 1.93755f, -0.408725f, 0.300142f, -1.18332f, 1.88091f, -0.41148f, -2.95727f, -0.521449f,
-1.42001f, 0.118794f, -0.04621f, 0.050757f, -0.0239654f, -0.271641f, 0.124946f, -0.0532936f, 0.101515f, 0.000208564f,
-0.0629488f, -0.0083243f, -0.108989f, -0.0326831f, 0.104277f, -0.0488748f, 0.0642388f, -0.0383848f, 0.0135046f, -0.0413592f,
-0.0667274f, 0.0475941f, 0.069182f, -0.0574944f, -0.137823f, -0.0326402f, -0.0137421f, -0.0225219f, -0.0917294f, -0.277759f,
-0.206978f, -0.162035f, -0.208444f, 0.141751f, -0.289377f, -0.185418f, 0.0471128f, -0.125879f, 0.262467f, -0.212794f,
-0.7875f, 0.0911f, 0.174999f, -2.03406f, 3.06743f, -0.112931f, -1.99885f, -0.404787f, 0.224402f, 0.637962f,
1.22255f, 2.10659f, 0.0779022f, -0.220946f, 0.137124f, -0.27808f, -0.0723953f, -0.0537655f, -0.0336359f, -0.0906601f,
-0.0625512f, -0.073468f, 0.174861f, -0.139417f, 0.0967417f, -0.0641309f, -0.0713542f, 0.0524317f, 0.00608819f, 0.0754101f,
0.0830658f, -0.223662f, 0.103016f, -0.102317f, 0.225611f, -0.0488401f, -0.00671865f, 0.0418239f, 0.0536284f, -0.132639f,
0.154375f, 0.187856f, -0.00878193f, 0.128648f, -0.371477f, 0.0267648f, -0.248432f, -0.0104153f, 0.035544f, -0.212753f,
-0.479037f, 0.156541f, 1.10304f, -1.26162f, 0.086939f, -0.302895f, -0.0357854f, 0.376838f, 0.597025f, -0.664647f,
-0.143269f, 2.18318f, -2.88831f, 0.101126f, -0.308315f, 0.268422f, -0.376772f, -1.05472f, 0.0144178f, 0.179122f,
0.222068f, -0.227709f, -0.00855236f, 0.0107035f, 0.00774349f, 0.0360155f, 0.220262f, -0.0056381f, 0.0317197f, 0.0621066f,
-0.0185316f, 0.0306039f, -0.233612f, 0.0807309f, -0.029933f, -0.00779298f, 0.00789378f, 0.00350605f, 0.0104809f, 0.0362871f,
0.151942f, -0.267724f, 0.0484763f, 0.132192f, -0.230059f, -0.157708f, -0.0659779f, -0.0926278f, 0.00770791f, 0.0631621f,
0.357879f, 0.075414f, 0.110637f, -1.27818f, 3.3101f, 0.0817343f, -0.424295f, -0.0437727f, -0.24251f, 0.711217f,
0.831064f, -0.212367f, -20.704f, -1.1492f, 0.0312941f, -0.736455f, -2.194f, -0.107612f, -0.175156f, -0.0366573f,
-0.0208507f, -0.00804196f, 0.0110407f, 0.027599f, 0.00193594f, -0.0123156f, -0.0628516f, -0.0218977f, -0.00693699f, 0.00695185f,
-0.0135057f, -0.00614977f, 0.0505432f, -0.0108098f, 0.000826042f, 0.00507362f, 0.00359334f, 0.0052661f, 0.035561f, 0.0382701f,
-0.0243765f, -0.323055f, 0.0682748f, -0.55873f, -0.103042f, 0.0342179f, -0.00790271f, -0.0170925f, 0.047029f, 0.0197362f,
0.174935f, -0.126558f, -0.104518f, 0.422479f, -0.0683178f, -0.0153435f, 0.0644152f, -0.36862f, -0.0674876f, -2.82672f,
-1.44811f, 0.702109f, 0.712138f, -0.420112f, 2.59746f, 1.34122f, -0.0788029f, -3.47792f, 0.507246f, -0.816378f,
-0.0297689f, -0.0453044f, -0.0330312f, -0.0344518f, -0.0260442f, -0.0142383f, -0.127349f, -0.106926f, -0.0359524f, 0.105045f,
-0.0610515f, 0.0916816f, 0.0256295f, -0.105187f, 0.0771212f, 0.291554f, 0.195413f, 0.0866214f, -0.066577f, -0.102188f,
-0.0898792f, -0.186163f, -0.321019f, -0.225689f, 0.175825f, 0.0979466f, -0.12982f, 0.400181f, -0.409336f, -0.0593326f,
0.252939f, 0.738898f, 2.41919f, 0.114505f, -0.314026f, -0.0656203f, -0.204474f, 0.179802f, 0.000509084f, 0.0995954f,
0.607983f, 1.73201f, -2.09609f, -0.609339f, 1.18997f, -2.377f, -0.686359f, 0.934861f, 1.10261f, 1.3901f,
0.113871f, -0.177673f, -0.0785783f, -0.348033f, -0.0949274f, -4.33616f, -0.00264017f, 0.00713045f, 0.106264f, 0.143726f,
-0.0191062f, 0.335823f, -0.0578655f, 0.131259f, -0.118687f, -0.0685305f, -0.054656f, -0.0176725f, -0.0772669f, -0.0264526f,
-0.132123f, -0.239624f, 0.000738732f, -0.185936f, -0.13077f, -0.0103824f, -0.0269872f, -0.00687f, 0.225804f, 0.407751f,
-0.436439f, -0.141664f, 0.0353391f, -0.0536557f, -0.0964537f, -0.0612611f, -0.0576863f, -0.180131f, -0.222772f, -0.461742f,
0.221853f, 1.94264f, -1.78544f, 3.8254f, 3.74598f, 0.335236f, 1.03399f, 4.24112f, -0.345796f, -0.594549f,
2.37071f, -1.42709f, 0.0463179f, -0.0568602f, 0.0529534f, -76.1407f, -0.265276f, 0.0507719f, 0.0643044f, 0.0384832f,
-0.103245f, -0.340972f, 0.101934f, -0.810811f, 0.176158f, 0.0424459f, -0.0387817f, -0.0235996f, -0.0740556f, -0.0270029f,
0.469658f, 0.0248864f, -0.10734f, -0.143827f, -0.0457131f, 0.00882177f, -0.0552371f, -0.00485851f, 0.314295f, 0.360431f,
0.779219f, -0.142152f, 0.0394297f, 0.160772f, -0.707623f, -0.0787085f, 0.110355f, -0.415958f, -0.385088f, -0.272224f,
-0.608236f, 1.07106f, -1.27037f, 2.27722f, 6.3688f, -1.55108f, -0.141848f, 0.448877f, -0.563447f, -2.31403f,
0.519837f, -3.33262f, -0.126443f, -0.0943922f, 0.0265837f, -0.120077f, -1.49918f, -0.817726f, -0.0495854f, -0.0230782f,
0.0620709f, 0.0113266f, -0.255811f, -0.0735781f, -0.0638952f, -0.0224014f, 0.117076f, 0.0393216f, 0.051997f, 0.0330763f,
-0.09543f, -0.204965f, 0.00454999f, 0.0554974f, -0.16251f, -0.110796f, 0.0211117f, -0.0197258f, 0.0187461f, 0.0125183f,
-0.573836f, 0.258764f, 0.19895f, 0.0219289f, -0.376757f, 0.14876f, 0.0920565f, -0.342475f, 0.135272f, -0.168155f,
-0.508578f, -0.0767061f, -0.654512f, 4.48901f, 3.38949f, -0.033423f, -0.0604611f, -0.128835f, 0.664947f, -0.144997f,
-2.34533f, -11.0766f, 4.35799f, 1.66794f, -0.0513934f, 2.27649f, 1.28663f, 0.841217f, -2.42807f, 0.0230471f,
-0.0685787f, -0.0112154f, 0.000464661f, -0.234848f, -0.338596f, 0.226709f, -0.0374803f, 0.155436f, 0.0400342f, -0.184686f,
-0.142242f, -0.167476f, -0.140324f, -0.104829f, -0.104195f, 0.128488f, -0.0939518f, -0.0578559f, 0.0265967f, -0.0999322f,
0.0110351f, -0.112668f, 0.0872292f, -0.170777f, -0.0876985f, -0.0322768f, -0.322994f, -0.189371f, -0.738069f, -0.0754914f,
0.123348f, -0.156758f, 0.199038f, -0.056107f, 0.899269f, 0.214717f, -0.093728f, -0.695741f, 0.0899298f, -2.06188f,
0.0820197f, -1.295f, 0.0295294f, 2.27577f, -0.940993f, -0.273719f, -0.896977f, 0.130553f, 0.134638f, 1.29355f,
-0.0100104f, -0.111541f, -0.132193f, -0.11037f, 0.0371375f, 0.00520749f, -0.0324224f, 0.00530451f, 0.0192385f, 0.00328708f,
-0.0180172f, -0.0105591f, 0.0197043f, 0.04099f, -0.0538671f, 0.0250838f, 0.0053365f, -0.0177321f, 0.00618789f, 0.00525364f,
-0.102347f, -0.0470742f, 0.178034f, -0.267772f, -0.105789f, 0.00104596f, -0.0360459f, 0.0402403f, -0.0406351f, 0.0136883f,
-0.105376f, 0.0623262f, -0.042906f, 0.176528f, -0.160076f, 0.0880722f, -0.0197449f, 0.089938f, 0.0100456f, -0.0475638f,
-2.28483f, -1.92619f, 0.218149f, 9.67107f, 3.30399f, -0.73267f, 0.037433f, -0.146551f, -0.230221f, -3.06489f,
-1.75951f, 0.129671f, 0.118305f, 0.140766f, 0.0678099f, -1.40194f, 0.0198483f, 0.0397953f, -0.0190239f, 0.0470715f,
0.00313175f, -0.0144533f, -0.0310217f, -0.0245139f, 0.136948f, -0.131363f, -0.191721f, -0.0176224f, -0.0480352f, -0.221799f,
0.150137f, 0.112326f, -0.0755033f, -0.280984f, -0.249342f, -0.26794f, -0.0292615f, 0.0612127f, -0.129877f, 0.00628332f,
-0.681657f, 0.0315246f, 0.294968f, 0.0407062f, 0.282759f, -0.085918f, 0.0175379f, 0.0541011f, -0.0810874f, -0.380809f,
-0.344185f, -7.32828f, -0.220036f, -0.560418f, -1.87191f, -0.222056f, -0.508859f, -0.473369f, 0.484958f, -2.28411f,
-7.10132f, 0.0139516f,
/* output layer */ /* output layer */
8.55144, 2.0822, 0.240592, 1.26638, 0.0309585, 3.90017, 1.71789, -1.43372, -2.70839, 1.77107,
-1.09841, 0.861549, -1.53704, 1.07356, 4.39194, 5.48006, 1.44661, 2.01134, -1.88383, -3.64958,
-2.60476, 0.375094, 0.122941, 0.00326393, 0.777163, -1.26351, 0.779421, 2.11357, 3.10409, 1.68846,
-2.03171, -0.944556, 4.02958, -0.260741, 0.556385, -4.46197, -1.61455, 3.59832, 2.43531, -1.26458,
-0.220568, -1.77121, -0.858706, -1.52023, -0.784162, 0.417941, 1.47437, 2.16635, -1.909, -0.828869,
0.345948, -0.0488489, -0.323381, -0.752573, 0.517346, 1.38805, -2.67975, -0.110044, 1.95596, 0.697931,
0.876475, -1.44056, -0.382276, -1.55409, }; -0.313226, -0.889315, 0.283236, 0.946102, };
/* Layer sizes of the network whose weights are listed above (line unchanged by
   the diff, so it is shown twice). The generator sizes the hidden-layer table as
   (topo[0]+1)*topo[1] and the output-layer table as (topo[1]+1)*topo[2]. */
static const int topo[3] = {25, 16, 2}; static const int topo[3] = {25, 16, 2};

View file

@ -138,13 +138,16 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp
for (s=0;s<nbSamples;s++) for (s=0;s<nbSamples;s++)
{ {
float *in, *out; float *in, *out;
float inp[inDim];
in = inputs+s*inDim; in = inputs+s*inDim;
out = outputs + s*outDim; out = outputs + s*outDim;
for (j=0;j<inDim;j++)
inp[j] = in[j];
for (i=0;i<hiddenDim;i++) for (i=0;i<hiddenDim;i++)
{ {
double sum = W0[i*(inDim+1)]; double sum = W0[i*(inDim+1)];
for (j=0;j<inDim;j++) for (j=0;j<inDim;j++)
sum += W0[i*(inDim+1)+j+1]*in[j]; sum += W0[i*(inDim+1)+j+1]*inp[j];
hidden[i] = tansig_approx(sum); hidden[i] = tansig_approx(sum);
} }
for (i=0;i<outDim;i++) for (i=0;i<outDim;i++)
@ -156,14 +159,14 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp
error[i] = out[i] - netOut[i]; error[i] = out[i] - netOut[i];
if (out[i] == 0) error[i] *= .0; if (out[i] == 0) error[i] *= .0;
error_rate[i] += fabs(error[i])>1; error_rate[i] += fabs(error[i])>1;
if (i==0) error[i] *= 3; if (i==0) error[i] *= 5;
rms += error[i]*error[i]; rms += error[i]*error[i];
/*error[i] = error[i]/(1+fabs(error[i]));*/ /*error[i] = error[i]/(1+fabs(error[i]));*/
} }
/* Back-propagate error */ /* Back-propagate error */
for (i=0;i<outDim;i++) for (i=0;i<outDim;i++)
{ {
float grad = 1-netOut[i]*netOut[i]; double grad = 1-netOut[i]*netOut[i];
W1_grad[i*(hiddenDim+1)] += error[i]*grad; W1_grad[i*(hiddenDim+1)] += error[i]*grad;
for (j=0;j<hiddenDim;j++) for (j=0;j<hiddenDim;j++)
W1_grad[i*(hiddenDim+1)+j+1] += grad*error[i]*hidden[j]; W1_grad[i*(hiddenDim+1)+j+1] += grad*error[i]*hidden[j];
@ -177,7 +180,7 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp
grad *= 1-hidden[i]*hidden[i]; grad *= 1-hidden[i]*hidden[i];
W0_grad[i*(inDim+1)] += grad; W0_grad[i*(inDim+1)] += grad;
for (j=0;j<inDim;j++) for (j=0;j<inDim;j++)
W0_grad[i*(inDim+1)+j+1] += grad*in[j]; W0_grad[i*(inDim+1)+j+1] += grad*inp[j];
} }
} }
return rms; return rms;
@ -232,8 +235,6 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
int inDim, outDim, hiddenDim; int inDim, outDim, hiddenDim;
int *topo; int *topo;
double *W0, *W1, *best_W0, *best_W1; double *W0, *W1, *best_W0, *best_W1;
double *W0_old, *W1_old;
double *W0_old2, *W1_old2;
double *W0_grad, *W1_grad; double *W0_grad, *W1_grad;
double *W0_oldgrad, *W1_oldgrad; double *W0_oldgrad, *W1_oldgrad;
double *W0_rate, *W1_rate; double *W0_rate, *W1_rate;
@ -256,10 +257,6 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
W1 = net->weights[1]; W1 = net->weights[1];
best_W0 = net->best_weights[0]; best_W0 = net->best_weights[0];
best_W1 = net->best_weights[1]; best_W1 = net->best_weights[1];
W0_old = malloc(W0_size*sizeof(double));
W1_old = malloc(W1_size*sizeof(double));
W0_old2 = malloc(W0_size*sizeof(double));
W1_old2 = malloc(W1_size*sizeof(double));
W0_grad = malloc(W0_size*sizeof(double)); W0_grad = malloc(W0_size*sizeof(double));
W1_grad = malloc(W1_size*sizeof(double)); W1_grad = malloc(W1_size*sizeof(double));
W0_oldgrad = malloc(W0_size*sizeof(double)); W0_oldgrad = malloc(W0_size*sizeof(double));
@ -268,12 +265,8 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
W1_rate = malloc(W1_size*sizeof(double)); W1_rate = malloc(W1_size*sizeof(double));
best_W0_rate = malloc(W0_size*sizeof(double)); best_W0_rate = malloc(W0_size*sizeof(double));
best_W1_rate = malloc(W1_size*sizeof(double)); best_W1_rate = malloc(W1_size*sizeof(double));
memcpy(W0_old, W0, W0_size*sizeof(double));
memcpy(W0_old2, W0, W0_size*sizeof(double));
memset(W0_grad, 0, W0_size*sizeof(double)); memset(W0_grad, 0, W0_size*sizeof(double));
memset(W0_oldgrad, 0, W0_size*sizeof(double)); memset(W0_oldgrad, 0, W0_size*sizeof(double));
memcpy(W1_old, W1, W1_size*sizeof(double));
memcpy(W1_old2, W1, W1_size*sizeof(double));
memset(W1_grad, 0, W1_size*sizeof(double)); memset(W1_grad, 0, W1_size*sizeof(double));
memset(W1_oldgrad, 0, W1_size*sizeof(double)); memset(W1_oldgrad, 0, W1_size*sizeof(double));
@ -378,8 +371,6 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
/*if (W0_rate[i] > .01) /*if (W0_rate[i] > .01)
W0_rate[i] = .01;*/ W0_rate[i] = .01;*/
W0_oldgrad[i] = W0_grad[i]; W0_oldgrad[i] = W0_grad[i];
W0_old2[i] = W0_old[i];
W0_old[i] = W0[i];
W0[i] += W0_grad[i]*W0_rate[i]; W0[i] += W0_grad[i]*W0_rate[i];
} }
for (i=0;i<W1_size;i++) for (i=0;i<W1_size;i++)
@ -394,8 +385,6 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
if (W1_rate[i] < 1e-15) if (W1_rate[i] < 1e-15)
W1_rate[i] = 1e-15; W1_rate[i] = 1e-15;
W1_oldgrad[i] = W1_grad[i]; W1_oldgrad[i] = W1_grad[i];
W1_old2[i] = W1_old[i];
W1_old[i] = W1[i];
W1[i] += W1_grad[i]*W1_rate[i]; W1[i] += W1_grad[i]*W1_rate[i];
} }
mean_rate /= (topo[0]+1)*topo[1] + (topo[1]+1)*topo[2]; mean_rate /= (topo[0]+1)*topo[1] + (topo[1]+1)*topo[2];
@ -413,12 +402,14 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
pthread_join(thread[i], NULL); pthread_join(thread[i], NULL);
fprintf (stderr, "joined %d\n", i); fprintf (stderr, "joined %d\n", i);
} }
free(W0_old);
free(W1_old);
free(W0_grad); free(W0_grad);
free(W0_oldgrad);
free(W1_grad); free(W1_grad);
free(W1_oldgrad);
free(W0_rate); free(W0_rate);
free(best_W0_rate);
free(W1_rate); free(W1_rate);
free(best_W1_rate);
return best_rms; return best_rms;
} }
@ -476,22 +467,29 @@ int main(int argc, char **argv)
fprintf (stderr, "Got %d samples\n", nbSamples); fprintf (stderr, "Got %d samples\n", nbSamples);
net = mlp_init(topo, 3, inputs, outputs, nbSamples); net = mlp_init(topo, 3, inputs, outputs, nbSamples);
rms = mlp_train_backprop(net, inputs, outputs, nbSamples, nbEpoch, 1); rms = mlp_train_backprop(net, inputs, outputs, nbSamples, nbEpoch, 1);
printf ("#ifdef HAVE_CONFIG_H\n");
printf ("#include \"config.h\"\n");
printf ("#endif\n\n");
printf ("#include \"mlp.h\"\n\n"); printf ("#include \"mlp.h\"\n\n");
printf ("/* RMS error was %f, seed was %u */\n\n", rms, seed); printf ("/* RMS error was %f, seed was %u */\n\n", rms, seed);
printf ("static const float weights[%d] = {\n", (topo[0]+1)*topo[1] + (topo[1]+1)*topo[2]); printf ("static const float weights[%d] = {\n", (topo[0]+1)*topo[1] + (topo[1]+1)*topo[2]);
printf ("\n/* hidden layer */\n"); printf ("\n/* hidden layer */\n");
for (i=0;i<(topo[0]+1)*topo[1];i++) for (i=0;i<(topo[0]+1)*topo[1];i++)
{ {
printf ("%gf, ", net->weights[0][i]); printf ("%gf,", net->weights[0][i]);
if (i%5==4) if (i%5==4)
printf("\n"); printf("\n");
else
printf(" ");
} }
printf ("\n/* output layer */\n"); printf ("\n/* output layer */\n");
for (i=0;i<(topo[1]+1)*topo[2];i++) for (i=0;i<(topo[1]+1)*topo[2];i++)
{ {
printf ("%g, ", net->weights[1][i]); printf ("%g,", net->weights[1][i]);
if (i%5==4) if (i%5==4)
printf("\n"); printf("\n");
else
printf(" ");
} }
printf ("};\n\n"); printf ("};\n\n");
printf ("static const int topo[3] = {%d, %d, %d};\n\n", topo[0], topo[1], topo[2]); printf ("static const int topo[3] = {%d, %d, %d};\n\n", topo[0], topo[1], topo[2]);

View file

@ -263,7 +263,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat
st->bandwidth = OPUS_BANDWIDTH_FULLBAND; st->bandwidth = OPUS_BANDWIDTH_FULLBAND;
#ifndef DISABLE_FLOAT_API #ifndef DISABLE_FLOAT_API
tonality_analysis_init(&st->analysis); tonality_analysis_init(&st->analysis, st->Fs);
#endif #endif
return OPUS_OK; return OPUS_OK;
@ -577,73 +577,52 @@ static opus_int32 user_bitrate_to_bitrate(OpusEncoder *st, int frame_size, int m
#else #else
#define PCM2VAL(x) SCALEIN(x) #define PCM2VAL(x) SCALEIN(x)
#endif #endif
/* downmix_float(), shown as a side-by-side diff: unchanged lines appear twice,
   removed and added lines appear once each.

   Both versions read the float buffer _x, indexed as (frame*C + channel), starting
   at frame `offset`, and write `subframe` samples to the output buffer (named
   `sub` before the change, `y` after). Each sample is passed through PCM2VAL().
     - channel c1 is always copied first;
     - if c2 > -1, channel c2 is added to it;
     - if c2 == -2, channels 1..C-1 are all added to it.

   The removed tail multiplied every output sample by a scale factor:
   (1<<SIG_SHIFT) under FIXED_POINT or 1.f otherwise, divided by C when c2 == -2
   or by 2 when c2 > -1. The new version returns the plain sum with no scaling.
   NOTE(review): presumably the caller now normalizes the result - confirm. */
void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C)
void downmix_float(const void *_x, opus_val32 *y, int subframe, int offset, int c1, int c2, int C)
{ {
const float *x; const float *x;
opus_val32 scale;
int j; int j;
x = (const float *)_x; x = (const float *)_x;
for (j=0;j<subframe;j++) for (j=0;j<subframe;j++)
sub[j] = PCM2VAL(x[(j+offset)*C+c1]); y[j] = PCM2VAL(x[(j+offset)*C+c1]);
if (c2>-1) if (c2>-1)
{ {
for (j=0;j<subframe;j++) for (j=0;j<subframe;j++)
sub[j] += PCM2VAL(x[(j+offset)*C+c2]); y[j] += PCM2VAL(x[(j+offset)*C+c2]);
} else if (c2==-2) } else if (c2==-2)
{ {
int c; int c;
for (c=1;c<C;c++) for (c=1;c<C;c++)
{ {
for (j=0;j<subframe;j++) for (j=0;j<subframe;j++)
sub[j] += PCM2VAL(x[(j+offset)*C+c]); y[j] += PCM2VAL(x[(j+offset)*C+c]);
} }
} }
/* Removed by this change: per-sample rescaling of the downmixed output. */
#ifdef FIXED_POINT
scale = (1<<SIG_SHIFT);
#else
scale = 1.f;
#endif
if (c2==-2)
scale /= C;
else if (c2>-1)
scale /= 2;
for (j=0;j<subframe;j++)
sub[j] *= scale;
} }
#endif #endif
/* downmix_int(), shown as a side-by-side diff: unchanged lines appear twice,
   removed and added lines appear once each.

   Both versions read the opus_int16 buffer _x, indexed as (frame*C + channel),
   starting at frame `offset`, and write `subframe` samples to the output buffer
   (named `sub` before the change, `y` after). Samples are used as-is, with no
   PCM2VAL() conversion.
     - channel c1 is always copied first;
     - if c2 > -1, channel c2 is added to it;
     - if c2 == -2, channels 1..C-1 are all added to it.

   The removed tail multiplied every output sample by a scale factor:
   (1<<SIG_SHIFT) under FIXED_POINT or 1.f/32768 otherwise, divided by C when
   c2 == -2 or by 2 when c2 > -1. The new version returns the plain sum with no
   scaling.
   NOTE(review): presumably the caller now normalizes the result - confirm. */
void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C) void downmix_int(const void *_x, opus_val32 *y, int subframe, int offset, int c1, int c2, int C)
{ {
const opus_int16 *x; const opus_int16 *x;
opus_val32 scale;
int j; int j;
x = (const opus_int16 *)_x; x = (const opus_int16 *)_x;
for (j=0;j<subframe;j++) for (j=0;j<subframe;j++)
sub[j] = x[(j+offset)*C+c1]; y[j] = x[(j+offset)*C+c1];
if (c2>-1) if (c2>-1)
{ {
for (j=0;j<subframe;j++) for (j=0;j<subframe;j++)
sub[j] += x[(j+offset)*C+c2]; y[j] += x[(j+offset)*C+c2];
} else if (c2==-2) } else if (c2==-2)
{ {
int c; int c;
for (c=1;c<C;c++) for (c=1;c<C;c++)
{ {
for (j=0;j<subframe;j++) for (j=0;j<subframe;j++)
sub[j] += x[(j+offset)*C+c]; y[j] += x[(j+offset)*C+c];
} }
} }
/* Removed by this change: per-sample rescaling of the downmixed output. */
#ifdef FIXED_POINT
scale = (1<<SIG_SHIFT);
#else
scale = 1.f/32768;
#endif
if (c2==-2)
scale /= C;
else if (c2>-1)
scale /= 2;
for (j=0;j<subframe;j++)
sub[j] *= scale;
} }
opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs) opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs)
@ -866,7 +845,9 @@ static int is_digital_silence(const opus_val16* pcm, int frame_size, int channel
{ {
int silence = 0; int silence = 0;
opus_val32 sample_max = 0; opus_val32 sample_max = 0;
#ifdef MLP_TRAINING
return 0;
#endif
sample_max = celt_maxabs16(pcm, frame_size*channels); sample_max = celt_maxabs16(pcm, frame_size*channels);
#ifdef FIXED_POINT #ifdef FIXED_POINT
@ -1131,9 +1112,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
#ifndef DISABLE_FLOAT_API #ifndef DISABLE_FLOAT_API
analysis_info.valid = 0; analysis_info.valid = 0;
#ifdef FIXED_POINT #ifdef FIXED_POINT
if (st->silk_mode.complexity >= 10 && st->Fs==48000) if (st->silk_mode.complexity >= 10 && st->Fs>=16000)
#else #else
if (st->silk_mode.complexity >= 7 && st->Fs==48000) if (st->silk_mode.complexity >= 7 && st->Fs>=16000)
#endif #endif
{ {
if (is_digital_silence(pcm, frame_size, st->channels, lsb_depth)) if (is_digital_silence(pcm, frame_size, st->channels, lsb_depth))