Adds support for delayed decision

Variable duration option renamed to OPUS_SET_EXPERT_FRAME_DURATION,
with new API.
Also moves up the analysis to avoid having to do int->float conversion
on large buffers.
This commit is contained in:
Jean-Marc Valin 2013-02-20 04:08:04 -05:00
parent 10a34a5dd6
commit 51f4a32ec2
7 changed files with 395 additions and 122 deletions

View file

@ -139,10 +139,81 @@ static inline float fast_atan2f(float y, float x) {
}
}
void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth)
/* Copies one entry of the circular tonal->info[] buffer into info_out for a
   frame of length `len`, advances the read cursor (read_pos/read_subframe),
   and then overwrites info_out->music_prob with a sum taken over
   tonal->pmusic[] and tonal->pspeech[].

   tonal    : analysis state; read_pos and read_subframe are modified.
   info_out : destination for the copied entry.
   len      : frame length (presumably in samples -- confirm with caller);
              the cursor moves one sub-frame per 120 units of len and one
              buffer slot per four sub-frames. */
void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
{
#if 1
   int slot = tonal->read_pos;
   int lookahead = tonal->write_pos - tonal->read_pos;
   int split;
   int k;
   float acc = 0;

   /* Reader-to-writer distance, taken modulo the buffer size. */
   if (lookahead < 0)
      lookahead += DETECT_SIZE;

   /* Frames longer than 480 use the following slot, provided the reader
      has not already caught up with the writer. */
   if (len > 480 && slot != tonal->write_pos)
      slot = (slot+1 == DETECT_SIZE) ? 0 : slot+1;

   /* Never read the slot at write_pos: step back one entry instead,
      wrapping around to the end of the buffer when needed. */
   if (slot == tonal->write_pos)
      slot--;
   if (slot < 0)
      slot = DETECT_SIZE-1;

   OPUS_COPY(info_out, &tonal->info[slot], 1);

   /* Advance the read cursor: four sub-frames make up one slot. */
   tonal->read_subframe += len/120;
   for (; tonal->read_subframe >= 4; tonal->read_subframe -= 4)
      tonal->read_pos++;
   if (tonal->read_pos >= DETECT_SIZE)
      tonal->read_pos -= DETECT_SIZE;

   /* Compensate for the delay in the features themselves.
      FIXME: Need a better estimate than the 10 I just made up */
   lookahead = IMAX(lookahead-10, 0);

   /* Accumulate pmusic[] below the split index and pspeech[] from the split
      index onwards, in increasing index order. */
   split = DETECT_SIZE - lookahead;
   for (k = 0; k < DETECT_SIZE; k++)
      acc += (k < split) ? tonal->pmusic[k] : tonal->pspeech[k];

   info_out->music_prob = acc;
#else
   /* Disabled alternative: no look-ahead, report "invalid" when the reader
      has caught up with the writer. */
   if (tonal->read_pos == tonal->write_pos)
   {
      info_out->valid = 0;
      return;
   }

   OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1);

   tonal->read_subframe += len/480;
   for (; tonal->read_subframe >= 4; tonal->read_subframe -= 4)
      tonal->read_pos++;
   if (tonal->read_pos >= DETECT_SIZE)
      tonal->read_pos -= DETECT_SIZE;

   /* If the reader landed on the writer, park it on the last sub-frame of
      the slot just before write_pos. */
   if (tonal->read_pos == tonal->write_pos)
   {
      tonal->read_pos = (tonal->write_pos > 0) ? tonal->write_pos-1 : DETECT_SIZE-1;
      tonal->read_subframe = 3;
   }
#endif
}
void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
{
int i, b;
const CELTMode *mode;
const kiss_fft_state *kfft;
kiss_fft_cpx in[480], out[480];
int N = 480, N2=240;
@ -171,8 +242,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
float maxE = 0;
float noise_floor;
int remaining;
celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
AnalysisInfo *info;
tonal->last_transition++;
alpha = 1.f/IMIN(20, 1+tonal->count);
@ -181,23 +251,19 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
if (tonal->count<4)
tonal->music_prob = .5;
kfft = mode->mdct.kfft[0];
kfft = celt_mode->mdct.kfft[0];
if (tonal->count==0)
tonal->mem_fill = 240;
if (C==1)
{
for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
tonal->inmem[i+tonal->mem_fill] = x[i];
} else {
for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
tonal->inmem[i+tonal->mem_fill] = x[2*i]+x[2*i+1];
}
downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C);
if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
{
tonal->mem_fill += len;
/* Don't have enough to update the analysis */
return;
}
info = &tonal->info[tonal->write_pos++];
if (tonal->write_pos>=DETECT_SIZE)
tonal->write_pos-=DETECT_SIZE;
for (i=0;i<N2;i++)
{
@ -209,15 +275,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
}
OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
if (C==1)
{
for (i=0;i<remaining;i++)
tonal->inmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i];
} else {
for (i=0;i<remaining;i++)
tonal->inmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)]
+ x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1];
}
downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C);
tonal->mem_fill = 240 + remaining;
opus_fft(kfft, in, out);
@ -450,13 +508,49 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
tau = .00005f;
beta = .1f;
max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition);
max_certainty = 0;
p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
p0 *= (float)pow(1-frame_prob, beta);
p1 *= (float)pow(frame_prob, beta);
tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1)));
info->music_prob = tonal->music_prob;
/*printf("%f %f\n", frame_prob, info->music_prob);*/
info->music_prob = frame_prob;
float psum=1e-20;
float speech0 = (float)pow(1-frame_prob, beta);
float music0 = (float)pow(frame_prob, beta);
if (tonal->count==1)
{
tonal->pspeech[0]=.5;
tonal->pmusic [0]=.5;
}
float s0, m0;
s0 = tonal->pspeech[0] + tonal->pspeech[1];
m0 = tonal->pmusic [0] + tonal->pmusic [1];
tonal->pspeech[0] = s0*(1-tau)*speech0;
tonal->pmusic [0] = m0*(1-tau)*music0;
for (i=1;i<DETECT_SIZE-1;i++)
{
tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
}
tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
for (i=0;i<DETECT_SIZE;i++)
psum += tonal->pspeech[i] + tonal->pmusic[i];
psum = 1.f/psum;
for (i=0;i<DETECT_SIZE;i++)
{
tonal->pspeech[i] *= psum;
tonal->pmusic [i] *= psum;
}
psum = tonal->pmusic[0];
for (i=1;i<DETECT_SIZE;i++)
psum += tonal->pspeech[i];
/*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/
}
if (tonal->last_music != (tonal->music_prob>.5f))
tonal->last_transition=0;
@ -484,4 +578,48 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
/*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
info->noisiness = frame_noisiness;
info->valid = 1;
if (info_out!=NULL)
OPUS_COPY(info_out, info, 1);
}
/* Feeds the not-yet-analysed part of the input to tonality_analysis() in
   chunks of at most 480 samples, selects the frame size to encode, and
   fetches the matching analysis result with tonality_get_info().

   analysis          : analysis state; analysis_offset records how many input
                       samples past the selected frame were already analysed.
   celt_mode         : forwarded to tonality_analysis().
   pcm               : input forwarded to optimize_framesize().
   analysis_pcm      : input forwarded to tonality_analysis().
   frame_size        : number of samples available; clamped so it cannot
                       overflow/wrap the analysis buffer.
   variable_duration : frame-duration setting; OPUS_FRAMESIZE_VARIABLE selects
                       the optimize_framesize() path, anything else goes
                       through frame_size_select().
   C, Fs, bitrate_bps, delay_compensation, lsb_depth, downmix :
                       forwarded unchanged to the helpers that need them.
   analysis_info     : output; marked invalid, then filled in by
                       tonality_get_info().

   Returns the selected frame size, or -1 when the selection is negative. */
int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
                 const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
                 int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
{
   int offset;
   int pcm_len;

   /* Avoid overflow/wrap-around of the analysis buffer */
   frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);

   /* The first analysis_offset samples of this buffer were already analysed
      by the previous call, so only the remainder is new. Start reading at
      that position: starting at 0 would analyse the old head of the buffer
      again and never reach its tail. */
   pcm_len = frame_size - analysis->analysis_offset;
   offset = analysis->analysis_offset;
   do {
      tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix);
      offset += 480;
      pcm_len -= 480;
   } while (pcm_len>0);
   analysis->analysis_offset = frame_size;

   if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
   {
      int LM;
      LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps,
            analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
      /* Shrink until the frame fits in the available input. The guard
         frame_size >= Fs/200 means LM==0 always fits, so LM stays >= 0. */
      while ((Fs/400<<LM)>frame_size)
         LM--;
      frame_size = (Fs/400<<LM);
   } else {
      frame_size = frame_size_select(frame_size, variable_duration, Fs);
   }
   if (frame_size<0)
      return -1;

   /* Whatever was analysed beyond the selected frame carries over to the
      next call. */
   analysis->analysis_offset -= frame_size;

   /* Only perform analysis up to 20-ms frames. Longer ones will be split if
      they're in CELT-only mode. */
   analysis_info->valid = 0;
   tonality_get_info(analysis, analysis_info, frame_size);

   return frame_size;
}