From 7ebacf430a465d000d97d6d9015f8f6061af8804 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Tue, 13 Nov 2012 02:24:07 -0500 Subject: [PATCH 01/15] Moves analysis to the beginning of opus_encode() --- src/analysis.c | 18 ++++++++++-------- src/analysis.h | 1 + src/opus_encoder.c | 45 ++++++++++++++++++++++----------------------- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/analysis.c b/src/analysis.c index 22a8fa79..08975992 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -185,19 +185,21 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc for (i=0;iinmem[i]); + in[i].i = MULT16_16(w, x[i]); + in[N-i-1].r = MULT16_16(w, x[N2-i-1]); + in[N-i-1].i = MULT16_16(w, x[N-i-1]); + tonal->inmem[i] = x[N2+i]; } } else { for (i=0;iinmem[i]); + in[i].i = MULT16_16(w, x[2*i]+x[2*i+1]); + in[N-i-1].r = MULT16_16(w, x[2*(N2-i-1)]+x[2*(N2-i-1)+1]); + in[N-i-1].i = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]); + tonal->inmem[i] = x[2*(N2+i)]+x[2*(N2+i)+1]; } } opus_fft(kfft, in, out); diff --git a/src/analysis.h b/src/analysis.h index bf8ad40a..951ded50 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -36,6 +36,7 @@ typedef struct { float angle[240]; float d_angle[240]; float d2_angle[240]; + float inmem[240]; float prev_band_tonality[NB_TBANDS]; float prev_tonality; float E[NB_FRAMES][NB_TBANDS]; diff --git a/src/opus_encoder.c b/src/opus_encoder.c index ec7d6e7b..87f0147a 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -592,18 +592,32 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ lsb_depth = IMIN(lsb_depth, st->lsb_depth); #ifndef FIXED_POINT - perform_analysis = st->silk_mode.complexity >= 7 && frame_size >= st->Fs/100 && st->Fs==48000; + /* Only perform analysis for 10- and 20-ms frames. We don't have enough buffering for shorter + ones and longer ones will be split if they're in CELT-only mode. 
*/ + perform_analysis = st->silk_mode.complexity >= 7 + && (frame_size >= st->Fs/100 || frame_size >= st->Fs/50) + && st->Fs==48000; + if (perform_analysis) + { + int nb_analysis_frames; + nb_analysis_frames = frame_size/(st->Fs/100); + for (i=0;ianalysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, st->channels, lsb_depth); + if (st->signal_type == OPUS_AUTO) + st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); + st->detected_bandwidth = analysis_info.opus_bandwidth; + } else { + analysis_info.valid = 0; + st->voice_ratio = -1; + st->detected_bandwidth = 0; + } #endif + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) delay_compensation = 0; else delay_compensation = st->delay_compensation; - if (perform_analysis) - { - total_buffer = IMAX(st->Fs/200, delay_compensation); - } else { - total_buffer = delay_compensation; - } + total_buffer = delay_compensation; extra_buffer = total_buffer-delay_compensation; st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes); @@ -975,22 +989,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs); } -#ifndef FIXED_POINT - if (perform_analysis) - { - int nb_analysis_frames; - nb_analysis_frames = frame_size/(st->Fs/100); - for (i=0;ianalysis, &analysis_info, celt_enc, pcm_buf+i*(st->Fs/100)*st->channels, st->channels, lsb_depth); - if (st->signal_type == OPUS_AUTO) - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - st->detected_bandwidth = analysis_info.opus_bandwidth; - } else { - analysis_info.valid = 0; - st->voice_ratio = -1; - st->detected_bandwidth = 0; - } -#endif + /* SILK processing */ HB_gain = Q15ONE; From 48ac122141c317964fae2987eaea161c46538717 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Wed, 14 Nov 2012 02:39:27 -0500 Subject: [PATCH 02/15] Makes analysis usable for all frame sizes --- 
celt/celt_encoder.c | 2 +- src/analysis.c | 55 +++++++++++++++++++++++++++++---------------- src/analysis.h | 10 +++++---- src/opus_encoder.c | 18 +++++++-------- 4 files changed, 51 insertions(+), 34 deletions(-) diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 0d92c1ff..fd811360 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1274,7 +1274,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, prefilter_tapset = st->tapset_decision; pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes); - if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && st->analysis.tonality > .3 + if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3) && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period)) pitch_change = 1; if (pf_on==0) diff --git a/src/analysis.c b/src/analysis.c index 08975992..6b07890a 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -139,7 +139,7 @@ static inline float fast_atan2f(float y, float x) { } } -void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int C, int lsb_depth) +void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth) { int i, b; const CELTMode *mode; @@ -170,6 +170,8 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc int bandwidth=0; float maxE = 0; float noise_floor; + int remaining; + celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode)); tonal->last_transition++; @@ -180,28 +182,43 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc if (tonal->count<4) tonal->music_prob = .5; kfft = mode->mdct.kfft[0]; + if (tonal->count==0) + tonal->mem_fill = 240; if (C==1) { - for (i=0;iinmem[i]); - 
in[i].i = MULT16_16(w, x[i]); - in[N-i-1].r = MULT16_16(w, x[N2-i-1]); - in[N-i-1].i = MULT16_16(w, x[N-i-1]); - tonal->inmem[i] = x[N2+i]; - } + for (i=0;imem_fill);i++) + tonal->inmem[i+tonal->mem_fill] = x[i]; } else { - for (i=0;iinmem[i]); - in[i].i = MULT16_16(w, x[2*i]+x[2*i+1]); - in[N-i-1].r = MULT16_16(w, x[2*(N2-i-1)]+x[2*(N2-i-1)+1]); - in[N-i-1].i = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]); - tonal->inmem[i] = x[2*(N2+i)]+x[2*(N2+i)+1]; - } + for (i=0;imem_fill);i++) + tonal->inmem[i+tonal->mem_fill] = x[2*i]+x[2*i+1]; } + if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) + { + tonal->mem_fill += len; + /* Don't have enough to update the analysis */ + return; + } + + for (i=0;iinmem[i]); + in[i].i = MULT16_16(w, tonal->inmem[N2+i]); + in[N-i-1].r = MULT16_16(w, tonal->inmem[N-i-1]); + in[N-i-1].i = MULT16_16(w, tonal->inmem[N+N2-i-1]); + } + OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); + remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); + if (C==1) + { + for (i=0;iinmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i]; + } else { + for (i=0;iinmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)] + + x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1]; + } + tonal->mem_fill = 240 + remaining; opus_fft(kfft, in, out); for (i=1;ilsb_depth); #ifndef FIXED_POINT - /* Only perform analysis for 10- and 20-ms frames. We don't have enough buffering for shorter - ones and longer ones will be split if they're in CELT-only mode. */ - perform_analysis = st->silk_mode.complexity >= 7 - && (frame_size >= st->Fs/100 || frame_size >= st->Fs/50) - && st->Fs==48000; + /* Only perform analysis up to 20-ms frames. Longer ones will be split if + they're in CELT-only mode. 
*/ + perform_analysis = st->silk_mode.complexity >= 7 && frame_size <= st->Fs/50 && st->Fs==48000; if (perform_analysis) { - int nb_analysis_frames; - nb_analysis_frames = frame_size/(st->Fs/100); - for (i=0;ianalysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, st->channels, lsb_depth); - if (st->signal_type == OPUS_AUTO) + analysis_info.valid = 0; + tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth); + if (frame_size > st->Fs/100) + tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); + if (analysis_info.valid && st->signal_type == OPUS_AUTO) st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); st->detected_bandwidth = analysis_info.opus_bandwidth; } else { From 2a5f0565b8b037c0d32fc471544e7d32a9f010fe Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 19 Nov 2012 23:17:06 -0500 Subject: [PATCH 03/15] Running transient_analysis() even for 2.5 ms frames This means 2.5 ms frames can now use a higher bitrate for transients. 
--- celt/celt_encoder.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index fd811360..1b5426a7 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1298,16 +1298,18 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, isTransient = 0; shortBlocks = 0; + if (st->complexity >= 1) + { + isTransient = transient_analysis(in, N+st->overlap, CC, + &tf_estimate, &tf_chan); + } if (LM>0 && ec_tell(enc)+3<=total_bits) { - if (st->complexity >= 1) - { - isTransient = transient_analysis(in, N+st->overlap, CC, - &tf_estimate, &tf_chan); - if (isTransient) - shortBlocks = M; - } + if (isTransient) + shortBlocks = M; ec_enc_bit_logp(enc, isTransient, 3); + } else { + isTransient = 0; } ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */ From e85a6f5cbee9edd7ceb10e05db6652872cf8d8d2 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 19 Nov 2012 23:21:43 -0500 Subject: [PATCH 04/15] Makes opus_demo robust to the encoder using variable frame duration Also, the encode+decode mode now produces an output of the same size as the original. 
--- src/opus_demo.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/opus_demo.c b/src/opus_demo.c index 09b12a33..ae6ccf38 100644 --- a/src/opus_demo.c +++ b/src/opus_demo.c @@ -221,6 +221,8 @@ int main(int argc, char *argv[]) short *in, *out; int application=OPUS_APPLICATION_AUDIO; double bits=0.0, bits_max=0.0, bits_act=0.0, bits2=0.0, nrg; + double tot_samples=0; + opus_uint64 tot_in, tot_out; int bandwidth=-1; const char *bandwidth_string; int lost = 0, lost_prev = 1; @@ -239,6 +241,8 @@ int main(int argc, char *argv[]) int curr_mode=0; int curr_mode_count=0; int mode_switch_time = 48000; + int nb_encoded; + int remaining=0; if (argc < 5 ) { @@ -246,6 +250,7 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + tot_in=tot_out=0; fprintf(stderr, "%s\n", opus_get_version_string()); args = 1; @@ -617,22 +622,28 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3])); frame_size = mode_list[curr_mode][2]; } - err = fread(fbytes, sizeof(short)*channels, frame_size, fin); + err = fread(fbytes, sizeof(short)*channels, frame_size-remaining, fin); curr_read = err; + tot_in += curr_read; for(i=0;i0 && rand()%100 < packet_loss_perc); @@ -703,6 +715,11 @@ int main(int argc, char *argv[]) } if (output_samples>0) { + if (!decode_only && tot_out + output_samples > tot_in) + { + stop=1; + output_samples = tot_in-tot_out; + } if (output_samples>skip) { int i; for(i=0;i<(output_samples-skip)*channels;i++) @@ -716,6 +733,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "Error writing.\n"); return EXIT_FAILURE; } + tot_out += output_samples-skip; } if (output_samples Date: Thu, 22 Nov 2012 13:11:43 -0500 Subject: [PATCH 05/15] First attempt at varying the frame size depending on the audio (float only) The search is based on minimizing the bitrate increase due to transients by considering we can reduce the "transient boost" by reducing the frame size, at the cost 
of increasing the normal overhead. --- include/opus_defines.h | 29 ++++- src/opus_demo.c | 7 ++ src/opus_encoder.c | 257 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 281 insertions(+), 12 deletions(-) diff --git a/include/opus_defines.h b/include/opus_defines.h index cdde061a..e9434aab 100644 --- a/include/opus_defines.h +++ b/include/opus_defines.h @@ -148,8 +148,9 @@ extern "C" { #define OPUS_GET_GAIN_REQUEST 4045 /* Should have been 4035 */ #define OPUS_SET_LSB_DEPTH_REQUEST 4036 #define OPUS_GET_LSB_DEPTH_REQUEST 4037 - #define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039 +#define OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST 4040 +#define OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST 4041 /* Don't use 4045, it's already taken by OPUS_GET_GAIN_REQUEST */ @@ -525,6 +526,32 @@ extern "C" { * @param[out] x opus_int32 *: Number of samples (at current sampling rate). * @hideinitializer */ #define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x) + +/** Configures the encoder's use of variable duration frames. + * When enabled, the encoder is free to use a shorter frame size than the one + * requested in the opus_encode*() call. It is then the user's responsibility + * to verify how much audio was encoded by checking the ToC byte of the encoded + * packet. The part of the audio that was not encoded needs to be resent to the + * encoder for the next call. Do not use this option unless you really + * know what you are doing. + * @see OPUS_GET_EXPERT_VARIABLE_DURATION + * @param[in] x opus_int32: Allowed values: + *
+ * <dl><dt>0</dt><dd>Disable variable duration (default).</dd>
+ * <dt>1</dt><dd>Enable variable duration.</dd>
+ * </dl>
+ * @hideinitializer */ +#define OPUS_SET_EXPERT_VARIABLE_DURATION(x) OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured use of variable duration frames. + * @see OPUS_SET_EXPERT_VARIABLE_DURATION + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl><dt>0</dt><dd>variable duration disabled (default).</dd>
+ * <dt>1</dt><dd>variable duration enabled.</dd>
+ * </dl>
+ * @hideinitializer */ +#define OPUS_GET_EXPERT_VARIABLE_DURATION(x) OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int_ptr(x) + /**@}*/ /** @defgroup opus_genericctls Generic CTLs diff --git a/src/opus_demo.c b/src/opus_demo.c index ae6ccf38..6538aad6 100644 --- a/src/opus_demo.c +++ b/src/opus_demo.c @@ -53,6 +53,7 @@ void print_usage( char* argv[] ) fprintf(stderr, "-d : only runs the decoder (reads the bit-stream as input)\n" ); fprintf(stderr, "-cbr : enable constant bitrate; default: variable bitrate\n" ); fprintf(stderr, "-cvbr : enable constrained variable bitrate; default: unconstrained\n" ); + fprintf(stderr, "-variable-duration : enable frames of variable duration (experts only); default: disabled\n" ); fprintf(stderr, "-bandwidth : audio bandwidth (from narrowband to fullband); default: sampling rate\n" ); fprintf(stderr, "-framesize <2.5|5|10|20|40|60> : frame size in ms; default: 20 \n" ); fprintf(stderr, "-max_payload : maximum payload size in bytes, default: 1024\n" ); @@ -243,6 +244,7 @@ int main(int argc, char *argv[]) int mode_switch_time = 48000; int nb_encoded; int remaining=0; + int variable_duration=0; if (argc < 5 ) { @@ -379,6 +381,10 @@ int main(int argc, char *argv[]) check_encoder_option(decode_only, "-cvbr"); cvbr = 1; args++; + } else if( strcmp( argv[ args ], "-variable-duration" ) == 0 ) { + check_encoder_option(decode_only, "-variable-duration"); + variable_duration = 1; + args++; } else if( strcmp( argv[ args ], "-dtx") == 0 ) { check_encoder_option(decode_only, "-dtx"); use_dtx = 1; @@ -504,6 +510,7 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip)); opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16)); + opus_encoder_ctl(enc, OPUS_SET_EXPERT_VARIABLE_DURATION(variable_duration)); } if (!encode_only) { diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 3d8684c4..e55eabda 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -67,6 +67,7 @@ struct OpusEncoder { opus_int32 Fs; int 
use_vbr; int vbr_constraint; + int variable_duration; opus_int32 bitrate_bps; opus_int32 user_bitrate_bps; int lsb_depth; @@ -88,6 +89,7 @@ struct OpusEncoder { int first; opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; #ifndef FIXED_POINT + opus_val32 subframe_mem[3]; TonalityAnalysisState analysis; int detected_bandwidth; #endif @@ -535,6 +537,205 @@ static opus_int32 user_bitrate_to_bitrate(OpusEncoder *st, int frame_size, int m return st->user_bitrate_bps; } +#ifndef FIXED_POINT +/* Don't use more than 60 ms for the frame size analysis */ +#define MAX_DYNAMIC_FRAMESIZE 24 +/* Estimates how much the bitrate will be boosted based on the sub-frame energy */ +static float transient_boost(const float *E, const float *E_1, int LM, int maxM) +{ + int i; + int M; + float sumE=0, sumE_1=0; + float metric; + + M = IMIN(maxM, (1<10 ? 1 : 0;*/ + /*return MAX16(0,1-exp(-.25*(metric-2.)));*/ + return MIN16(1,sqrt(MAX16(0,.05*(metric-2)))); +} + +/* Viterbi decoding trying to find the best frame size combination using look-ahead + + State numbering: + 0: unused + 1: 2.5 ms + 2: 5 ms (#1) + 3: 5 ms (#2) + 4: 10 ms (#1) + 5: 10 ms (#2) + 6: 10 ms (#3) + 7: 10 ms (#4) + 8: 20 ms (#1) + 9: 20 ms (#2) + 10: 20 ms (#3) + 11: 20 ms (#4) + 12: 20 ms (#5) + 13: 20 ms (#6) + 14: 20 ms (#7) + 15: 20 ms (#8) +*/ +static int transient_viterbi(const float *E, const float *E_1, int N, int frame_cost, int rate) +{ + int i; + float cost[MAX_DYNAMIC_FRAMESIZE][16]; + int states[MAX_DYNAMIC_FRAMESIZE][16]; + float best_cost; + int best_state; + + for (i=0;i<16;i++) + { + /* Impossible state */ + states[0][i] = -1; + cost[0][i] = 1e10; + } + for (i=0;i<4;i++) + { + cost[0][1<=0;i--) + { + /*printf("%d ", best_state);*/ + best_state = states[i][best_state]; + } + /*printf("%d\n", best_state);*/ + return best_state; +} + +static int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, + int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering) +{ + int N; + int i; + float 
e[MAX_DYNAMIC_FRAMESIZE+4]; + float e_1[MAX_DYNAMIC_FRAMESIZE+3]; + float memx; + int bestLM=0; + int subframe; + int pos; + + subframe = Fs/400; + e[0]=mem[0]; + e_1[0]=1./(EPSILON+mem[0]); + if (buffering) + { + /* Consider the CELT delay when not in restricted-lowdelay */ + /* We assume the buffering is between 2.5 and 5 ms */ + int offset = 2*subframe - buffering; + celt_assert(offset>=0 && offset <= subframe); + x += C*offset; + len -= offset; + e[1]=mem[1]; + e_1[1]=1./(EPSILON+mem[1]); + e[2]=mem[2]; + e_1[2]=1./(EPSILON+mem[2]); + pos = 3; + } else { + pos=1; + } + N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE); + memx = x[0]; + for (i=0;irangeFinal = 0; - if (400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs && + if ((!st->variable_duration && 400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs && 50*frame_size != st->Fs && 25*frame_size != st->Fs && 50*frame_size != 3*st->Fs) - { - RESTORE_STACK; - return OPUS_BAD_ARG; - } - if (max_data_bytes<=0) + || (400*frame_size < st->Fs) + || max_data_bytes<=0 + ) { RESTORE_STACK; return OPUS_BAD_ARG; } silk_enc = (char*)st+st->silk_enc_offset; celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; lsb_depth = IMIN(lsb_depth, st->lsb_depth); + orig_frame_size = IMIN(frame_size,st->Fs/50); + if (st->variable_duration) + { + int LM = 3; +#ifndef FIXED_POINT + LM = optimize_framesize(pcm, frame_size, st->channels, st->Fs, st->bitrate_bps, + st->analysis.prev_tonality, st->subframe_mem, delay_compensation); +#endif + while ((st->Fs/400<frame_size) + LM--; + frame_size = (st->Fs/400<application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) - delay_compensation = 0; - else - delay_compensation = st->delay_compensation; total_buffer = delay_compensation; extra_buffer = total_buffer-delay_compensation; st->bitrate_bps = 
user_bitrate_to_bitrate(st, frame_size, max_data_bytes); @@ -1196,9 +1408,18 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ } else { if (st->use_vbr) { + opus_int32 bonus=0; +#ifndef FIXED_POINT + if (orig_frame_size != frame_size) + { + bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size); + if (analysis_info.valid) + bonus = bonus*(1.f+.5*analysis_info.tonality); + } +#endif celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint)); - celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps)); + celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps+bonus)); nb_compr_bytes = max_data_bytes-1-redundancy_bytes; } else { nb_compr_bytes = bytes_target; @@ -1741,6 +1962,20 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) *value = st->lsb_depth; } break; + case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<0 || value>1) + goto bad_arg; + st->variable_duration = value; + } + break; + case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + *value = st->variable_duration; + } + break; case OPUS_RESET_STATE: { void *silk_enc; From 854796481543539f2cda745861c0cd398a1a78de Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 26 Nov 2012 03:08:15 -0500 Subject: [PATCH 06/15] Re-enable analysis for 40- and 60-ms frames --- src/opus_encoder.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index e55eabda..f57fa52e 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -810,20 +810,23 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ #ifndef FIXED_POINT /* Only perform analysis up to 20-ms frames. Longer ones will be split if they're in CELT-only mode. 
*/ - perform_analysis = st->silk_mode.complexity >= 7 && frame_size <= st->Fs/50 && st->Fs==48000; - if (perform_analysis) + analysis_info.valid = 0; + perform_analysis = st->silk_mode.complexity >= 7 && st->Fs==48000; + if (!perform_analysis) + { + st->voice_ratio = -1; + st->detected_bandwidth = 0; + } else if (frame_size <= st->Fs/50) { - analysis_info.valid = 0; tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth); if (frame_size > st->Fs/100) tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); - if (analysis_info.valid && st->signal_type == OPUS_AUTO) - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - st->detected_bandwidth = analysis_info.opus_bandwidth; - } else { - analysis_info.valid = 0; - st->voice_ratio = -1; - st->detected_bandwidth = 0; + if (analysis_info.valid) + { + if (st->signal_type == OPUS_AUTO) + st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); + st->detected_bandwidth = analysis_info.opus_bandwidth; + } } #endif @@ -1160,6 +1163,14 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ RESTORE_STACK; return ret; } + /* Perform analysis for 40-60 ms frames */ + if (perform_analysis && frame_size > st->Fs/50) + { + int nb_analysis = frame_size/(st->Fs/100); + for (i=0;ianalysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); + st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); + } curr_bandwidth = st->bandwidth; From f548a5a35d4e808eba5224084034c09421edcc6a Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 26 Nov 2012 23:20:01 -0500 Subject: [PATCH 07/15] Makes variable framesize less aggressive at lower rates --- src/opus_encoder.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index f57fa52e..c4a46557 100644 --- a/src/opus_encoder.c +++ 
b/src/opus_encoder.c @@ -590,6 +590,10 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_ float best_cost; int best_state; + /* Makes variable framesize less aggressive at lower bitrates, but I can't + find any valid theoretical justification for this (other than it seems + to help) */ + frame_cost *= 720/rate; for (i=0;i<16;i++) { /* Impossible state */ From 744836604644fbb94409592069238088852db599 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 17 Dec 2012 16:23:42 -0500 Subject: [PATCH 08/15] Multistream support for variable frame duration Also fixes a bug with stereo streams where the initial memory was only using the left channel. --- src/opus_encoder.c | 35 ++++++------- src/opus_multistream_encoder.c | 89 ++++++++++++++++++++++++++++------ src/opus_private.h | 2 + 3 files changed, 95 insertions(+), 31 deletions(-) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index c4a46557..4c0840ff 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -665,19 +665,21 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_ return best_state; } -static int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, +int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering) { int N; - int i; + int i, c; float e[MAX_DYNAMIC_FRAMESIZE+4]; float e_1[MAX_DYNAMIC_FRAMESIZE+3]; float memx; int bestLM=0; int subframe; int pos; + VARDECL(opus_val16, sub); subframe = Fs/400; + ALLOC(sub, subframe, opus_val16); e[0]=mem[0]; e_1[0]=1./(EPSILON+mem[0]); if (buffering) @@ -698,27 +700,26 @@ static int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs } N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE); memx = x[0]; + for (c=1;cvariable_duration && orig_frame_size != frame_size) { bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size); if (analysis_info.valid) diff --git 
a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index db9fc785..d048f535 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -40,6 +40,9 @@ struct OpusMSEncoder { ChannelLayout layout; int bitrate; + int variable_duration; + opus_int32 bitrate_bps; + opus_val32 subframe_mem[3]; /* Encoder states go here */ }; @@ -193,10 +196,38 @@ static int opus_multistream_encode_native VARDECL(opus_val16, buf); unsigned char tmp_data[MS_FRAME_TMP]; OpusRepacketizer rp; + int orig_frame_size; + int coded_channels; + opus_int32 channel_rate; ALLOC_STACK; ptr = (char*)st + align(sizeof(OpusMSEncoder)); opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs)); + + if (400*frame_size < Fs) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + orig_frame_size = IMIN(frame_size,Fs/50); + if (st->variable_duration) + { + int LM = 3; + int channels; + opus_int32 delay_compensation; + + channels = st->layout.nb_streams + st->layout.nb_coupled_streams; + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation)); + delay_compensation -= Fs/400; +#ifndef FIXED_POINT + LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps, + 0.f, st->subframe_mem, delay_compensation); +#endif + while ((Fs/400<frame_size) + LM--; + frame_size = (Fs/400<layout.nb_streams + st->layout.nb_coupled_streams; + channel_rate = st->bitrate_bps / coded_channels; +#ifndef FIXED_POINT + if (st->variable_duration && orig_frame_size != frame_size) + { + opus_int32 bonus; + bonus = 60*(48000/frame_size-48000/orig_frame_size); + channel_rate += bonus; + } +#endif + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + for (s=0;slayout.nb_streams;s++) + { + OpusEncoder *enc; + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + opus_encoder_ctl(enc, OPUS_SET_BITRATE(channel_rate * (s < st->layout.nb_coupled_streams ? 
2 : 1))); + } + + ptr = (char*)st + align(sizeof(OpusMSEncoder)); /* Counting ToC */ tot_size = 0; for (s=0;slayout.nb_streams;s++) @@ -378,20 +434,8 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) { case OPUS_SET_BITRATE_REQUEST: { - int chan, s; opus_int32 value = va_arg(ap, opus_int32); - chan = st->layout.nb_streams + st->layout.nb_coupled_streams; - value /= chan; - for (s=0;slayout.nb_streams;s++) - { - OpusEncoder *enc; - enc = (OpusEncoder*)ptr; - if (s < st->layout.nb_coupled_streams) - ptr += align(coupled_size); - else - ptr += align(mono_size); - opus_encoder_ctl(enc, request, value * (s < st->layout.nb_coupled_streams ? 2 : 1)); - } + st->bitrate_bps = value; } break; case OPUS_GET_BITRATE_REQUEST: @@ -504,7 +548,21 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) } *value = (OpusEncoder*)ptr; } - break; + break; + case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<0 || value>1) + goto bad_arg; + st->variable_duration = value; + } + break; + case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + *value = st->variable_duration; + } + break; default: ret = OPUS_UNIMPLEMENTED; break; @@ -512,6 +570,9 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) 
va_end(ap); return ret; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; } void opus_multistream_encoder_destroy(OpusMSEncoder *st) diff --git a/src/opus_private.h b/src/opus_private.h index 977f4a25..94de0033 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -81,6 +81,8 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev); #define OPUS_SET_FORCE_MODE_REQUEST 11002 #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x) +int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, + int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering); int encode_size(int size, unsigned char *data); From 95561be6dedc77a95a217989ee93255f48f91530 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 17 Dec 2012 17:54:01 -0500 Subject: [PATCH 09/15] Better handling of the multistream bitrate Now supports OPUS_AUTO and OPUS_BITRATE_MAX --- src/opus_multistream_encoder.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index d048f535..1b41c651 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -39,7 +39,6 @@ struct OpusMSEncoder { ChannelLayout layout; - int bitrate; int variable_duration; opus_int32 bitrate_bps; opus_val32 subframe_mem[3]; @@ -105,6 +104,7 @@ int opus_multistream_encoder_init( st->layout.nb_streams = streams; st->layout.nb_coupled_streams = coupled_streams; + st->bitrate_bps = OPUS_AUTO; for (i=0;ilayout.nb_channels;i++) st->layout.mapping[i] = mapping[i]; if (!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout)) @@ -249,7 +249,15 @@ static int opus_multistream_encode_native /* Compute bitrate allocation between streams (this could be a lot better) */ coded_channels = st->layout.nb_streams + st->layout.nb_coupled_streams; - channel_rate = st->bitrate_bps / coded_channels; + if (st->bitrate_bps==OPUS_AUTO) + { + channel_rate = 
Fs+60*Fs/orig_frame_size; + } else if (st->bitrate_bps==OPUS_BITRATE_MAX) + { + channel_rate = 300000; + } else { + channel_rate = st->bitrate_bps/coded_channels; + } #ifndef FIXED_POINT if (st->variable_duration && orig_frame_size != frame_size) { @@ -435,6 +443,8 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) case OPUS_SET_BITRATE_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); + if (value<0 && value!=OPUS_AUTO && value!=OPUS_BITRATE_MAX) + goto bad_arg; st->bitrate_bps = value; } break; From bb43b8b69d3e06ec2609a6040513952cb472d742 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Mon, 17 Dec 2012 18:02:56 -0500 Subject: [PATCH 10/15] No need for extra_buffer anymore --- src/opus_encoder.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 4c0840ff..bcfe6b47 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -769,7 +769,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int curr_bandwidth; opus_val16 HB_gain; opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */ - int extra_buffer, total_buffer; + int total_buffer; int perform_analysis=0; int orig_frame_size; #ifndef FIXED_POINT @@ -836,7 +836,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ #endif total_buffer = delay_compensation; - extra_buffer = total_buffer-delay_compensation; st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes); frame_rate = st->Fs/frame_size; @@ -1450,7 +1449,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_SILK_ONLY && st->mode != st->prev_mode && st->prev_mode > 0) { for (i=0;ichannels*st->Fs/400;i++) - tmp_prefill[i] = st->delay_buffer[(extra_buffer+st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i]; + tmp_prefill[i] = 
st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i]; } for (i=0;ichannels*(st->encoder_buffer-(frame_size+total_buffer));i++) @@ -1464,7 +1463,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ const CELTMode *celt_mode; celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode)); - gain_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels, + gain_fade(pcm_buf, pcm_buf, st->prev_HB_gain, HB_gain, celt_mode->overlap, frame_size, st->channels, celt_mode->window, st->Fs); } st->prev_HB_gain = HB_gain; @@ -1486,7 +1485,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ g1 *= (1.f/16384); g2 *= (1.f/16384); #endif - stereo_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels, g1, g2, celt_mode->overlap, + stereo_fade(pcm_buf, pcm_buf, g1, g2, celt_mode->overlap, frame_size, st->channels, celt_mode->window, st->Fs); st->hybrid_stereo_width_Q14 = st->silk_mode.stereoWidth_Q14; } @@ -1540,7 +1539,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int err; celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(0)); celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0)); - err = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL); + err = celt_encode_with_ec(celt_enc, pcm_buf, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL); if (err < 0) { RESTORE_STACK; @@ -1570,7 +1569,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (perform_analysis) celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info)); #endif - ret = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, frame_size, NULL, nb_compr_bytes, &enc); + ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc); if (ret < 0) { RESTORE_STACK; @@ -1593,9 +1592,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const 
opus_val16 *pcm, int frame_ celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(0)); /* NOTE: We could speed this up slightly (at the expense of code size) by just adding a function that prefills the buffer */ - celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2-N4), N4, dummy, 2, NULL); + celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2-N4), N4, dummy, 2, NULL); - err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL); + err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL); if (err < 0) { RESTORE_STACK; From 10a34a5dd66ff45538ac3843eab7802d260e160f Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Thu, 20 Dec 2012 00:23:01 -0500 Subject: [PATCH 11/15] Making multistream variable duration work for both the float and int API --- src/opus_encoder.c | 45 +++++++++++++++++++++++++--------- src/opus_multistream_encoder.c | 9 ++++--- src/opus_private.h | 7 +++++- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index bcfe6b47..19778a40 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -665,11 +665,36 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_ return best_state; } +void downmix_float(const void *_x, float *sub, int subframe, int i, int C) +{ + const float *x; + int c, j; + x = (const float *)_x; + for (j=0;jchannels, st->Fs, st->bitrate_bps, - st->analysis.prev_tonality, st->subframe_mem, delay_compensation); + st->analysis.prev_tonality, st->subframe_mem, delay_compensation, downmix_float); #endif while ((st->Fs/400<frame_size) LM--; @@ -1167,6 +1187,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ RESTORE_STACK; return ret; } +#ifndef FIXED_POINT /* Perform analysis for 40-60 ms frames */ if (perform_analysis && frame_size > st->Fs/50) { 
@@ -1175,7 +1196,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); } - +#endif curr_bandwidth = st->bandwidth; /* Chooses the appropriate mode for speech diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index 1b41c651..c5fb3669 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -185,6 +185,9 @@ static int opus_multistream_encode_native unsigned char *data, opus_int32 max_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , downmix_func downmix +#endif ) { opus_int32 Fs; @@ -221,7 +224,7 @@ static int opus_multistream_encode_native delay_compensation -= Fs/400; #ifndef FIXED_POINT LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps, - 0.f, st->subframe_mem, delay_compensation); + 0.f, st->subframe_mem, delay_compensation, downmix); #endif while ((Fs/400<frame_size) LM--; @@ -410,7 +413,7 @@ int opus_multistream_encode_float ) { return opus_multistream_encode_native(st, opus_copy_channel_in_float, - pcm, frame_size, data, max_data_bytes, 24); + pcm, frame_size, data, max_data_bytes, 24, downmix_float); } int opus_multistream_encode( @@ -422,7 +425,7 @@ int opus_multistream_encode( ) { return opus_multistream_encode_native(st, opus_copy_channel_in_short, - pcm, frame_size, data, max_data_bytes, 16); + pcm, frame_size, data, max_data_bytes, 16, downmix_int); } #endif diff --git a/src/opus_private.h b/src/opus_private.h index 94de0033..33a982e5 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -81,8 +81,13 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev); #define OPUS_SET_FORCE_MODE_REQUEST 11002 #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x) +typedef void (*downmix_func)(const void *, float *, int, 
int, int); +void downmix_float(const void *_x, float *sub, int subframe, int i, int C); +void downmix_int(const void *_x, float *sub, int subframe, int i, int C); + int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, - int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering); + int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering, + void (*downmix)(const void *, float *, int, int, int)); int encode_size(int size, unsigned char *data); From 51f4a32ec2b62fd7c53e7b901fefd38ff95e7cc2 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Wed, 20 Feb 2013 04:08:04 -0500 Subject: [PATCH 12/15] Adds support for delayed decision Variable duration option renamed to OPUS_SET_EXPERT_FRAME_DURATION, with new API. Also moves up the analysis to avoid having to do int->float conversion on large buffers. --- include/opus_defines.h | 17 ++- src/analysis.c | 184 +++++++++++++++++++++++++++---- src/analysis.h | 22 +++- src/opus_demo.c | 33 +++++- src/opus_encoder.c | 195 +++++++++++++++++++++------------ src/opus_multistream_encoder.c | 51 ++++++--- src/opus_private.h | 15 ++- 7 files changed, 395 insertions(+), 122 deletions(-) diff --git a/include/opus_defines.h b/include/opus_defines.h index e9434aab..203144a7 100644 --- a/include/opus_defines.h +++ b/include/opus_defines.h @@ -149,8 +149,8 @@ extern "C" { #define OPUS_SET_LSB_DEPTH_REQUEST 4036 #define OPUS_GET_LSB_DEPTH_REQUEST 4037 #define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039 -#define OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST 4040 -#define OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST 4041 +#define OPUS_SET_EXPERT_FRAME_DURATION_REQUEST 4040 +#define OPUS_GET_EXPERT_FRAME_DURATION_REQUEST 4041 /* Don't use 4045, it's already taken by OPUS_GET_GAIN_REQUEST */ @@ -186,6 +186,15 @@ extern "C" { #define OPUS_BANDWIDTH_SUPERWIDEBAND 1104 /**<12 kHz bandpass @hideinitializer*/ #define OPUS_BANDWIDTH_FULLBAND 1105 /**<20 kHz bandpass @hideinitializer*/ +#define OPUS_FRAMESIZE_ARG 5000 /**< 
Select frame size from the argument (default) */ +#define OPUS_FRAMESIZE_2_5_MS 5001 /**< Use 2.5 ms frames */ +#define OPUS_FRAMESIZE_5_MS 5002 /**< Use 5 ms frames */ +#define OPUS_FRAMESIZE_10_MS 5003 /**< Use 10 ms frames */ +#define OPUS_FRAMESIZE_20_MS 5004 /**< Use 20 ms frames */ +#define OPUS_FRAMESIZE_40_MS 5005 /**< Use 40 ms frames */ +#define OPUS_FRAMESIZE_60_MS 5006 /**< Use 60 ms frames */ +#define OPUS_FRAMESIZE_VARIABLE 5010 /**< Optimize the frame size dynamically */ + /**@}*/ @@ -541,7 +550,7 @@ extern "C" { *
1
Enable variable duration.
* * @hideinitializer */ -#define OPUS_SET_EXPERT_VARIABLE_DURATION(x) OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int(x) +#define OPUS_SET_EXPERT_FRAME_DURATION(x) OPUS_SET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int(x) /** Gets the encoder's configured use of variable duration frames. * @see OPUS_SET_EXPERT_VARIABLE_DURATION * @param[out] x opus_int32 *: Returns one of the following values: @@ -550,7 +559,7 @@ extern "C" { *
1
variable duration enabled.
* * @hideinitializer */ -#define OPUS_GET_EXPERT_VARIABLE_DURATION(x) OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int_ptr(x) +#define OPUS_GET_EXPERT_FRAME_DURATION(x) OPUS_GET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int_ptr(x) /**@}*/ diff --git a/src/analysis.c b/src/analysis.c index 6b07890a..54005d3a 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -139,10 +139,81 @@ static inline float fast_atan2f(float y, float x) { } } -void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth) +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) +{ +#if 1 + int pos; + int curr_lookahead; + float psum; + int i; + + pos = tonal->read_pos; + curr_lookahead = tonal->write_pos-tonal->read_pos; + if (curr_lookahead<0) + curr_lookahead += DETECT_SIZE; + + if (len > 480 && pos != tonal->write_pos) + { + pos++; + if (pos==DETECT_SIZE) + pos=0; + } + if (pos == tonal->write_pos) + pos--; + if (pos<0) + pos = DETECT_SIZE-1; + OPUS_COPY(info_out, &tonal->info[pos], 1); + tonal->read_subframe += len/120; + while (tonal->read_subframe>=4) + { + tonal->read_subframe -= 4; + tonal->read_pos++; + } + if (tonal->read_pos>=DETECT_SIZE) + tonal->read_pos-=DETECT_SIZE; + + /* Compensate for the delay in the features themselves. 
+ FIXME: Need a better estimate the 10 I just made up */ + curr_lookahead = IMAX(curr_lookahead-10, 0); + + psum=0; + for (i=0;ipmusic[i]; + for (;ipspeech[i]; + /*printf("%f %f\n", psum, info_out->music_prob);*/ + + info_out->music_prob = psum; +#else + /* If data not available, return invalid */ + if (tonal->read_pos==tonal->write_pos) + { + info_out->valid=0; + return; + } + + OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1); + tonal->read_subframe += len/480; + while (tonal->read_subframe>=4) + { + tonal->read_subframe -= 4; + tonal->read_pos++; + } + if (tonal->read_pos>=DETECT_SIZE) + tonal->read_pos-=DETECT_SIZE; + if (tonal->read_pos == tonal->write_pos) + { + tonal->read_pos = tonal->write_pos-1; + if (tonal->read_pos<0) + tonal->read_pos=DETECT_SIZE-1; + tonal->read_subframe = 3; + } +#endif +} + +void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix) { int i, b; - const CELTMode *mode; const kiss_fft_state *kfft; kiss_fft_cpx in[480], out[480]; int N = 480, N2=240; @@ -171,8 +242,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc float maxE = 0; float noise_floor; int remaining; - - celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode)); + AnalysisInfo *info; tonal->last_transition++; alpha = 1.f/IMIN(20, 1+tonal->count); @@ -181,23 +251,19 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc if (tonal->count<4) tonal->music_prob = .5; - kfft = mode->mdct.kfft[0]; + kfft = celt_mode->mdct.kfft[0]; if (tonal->count==0) tonal->mem_fill = 240; - if (C==1) - { - for (i=0;imem_fill);i++) - tonal->inmem[i+tonal->mem_fill] = x[i]; - } else { - for (i=0;imem_fill);i++) - tonal->inmem[i+tonal->mem_fill] = x[2*i]+x[2*i+1]; - } + downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C); if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) { 
tonal->mem_fill += len; /* Don't have enough to update the analysis */ return; } + info = &tonal->info[tonal->write_pos++]; + if (tonal->write_pos>=DETECT_SIZE) + tonal->write_pos-=DETECT_SIZE; for (i=0;iinmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); - if (C==1) - { - for (i=0;iinmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i]; - } else { - for (i=0;iinmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)] - + x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1]; - } + downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C); tonal->mem_fill = 240 + remaining; opus_fft(kfft, in, out); @@ -450,13 +508,49 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc tau = .00005f; beta = .1f; max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition); + max_certainty = 0; p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; p0 *= (float)pow(1-frame_prob, beta); p1 *= (float)pow(frame_prob, beta); tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1))); info->music_prob = tonal->music_prob; - /*printf("%f %f\n", frame_prob, info->music_prob);*/ + info->music_prob = frame_prob; + + float psum=1e-20; + float speech0 = (float)pow(1-frame_prob, beta); + float music0 = (float)pow(frame_prob, beta); + if (tonal->count==1) + { + tonal->pspeech[0]=.5; + tonal->pmusic [0]=.5; + } + float s0, m0; + s0 = tonal->pspeech[0] + tonal->pspeech[1]; + m0 = tonal->pmusic [0] + tonal->pmusic [1]; + tonal->pspeech[0] = s0*(1-tau)*speech0; + tonal->pmusic [0] = m0*(1-tau)*music0; + for (i=1;ipspeech[i] = tonal->pspeech[i+1]*speech0; + tonal->pmusic [i] = tonal->pmusic [i+1]*music0; + } + tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; + tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; + + for (i=0;ipspeech[i] + tonal->pmusic[i]; + psum = 1.f/psum; + for (i=0;ipspeech[i] *= psum; + tonal->pmusic [i] 
*= psum; + } + psum = tonal->pmusic[0]; + for (i=1;ipspeech[i]; + + /*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/ } if (tonal->last_music != (tonal->music_prob>.5f)) tonal->last_transition=0; @@ -484,4 +578,48 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ info->noisiness = frame_noisiness; info->valid = 1; + if (info_out!=NULL) + OPUS_COPY(info_out, info, 1); +} + +int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm, + const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) +{ + int offset; + int pcm_len; + + /* Avoid overflow/wrap-around of the analysis buffer */ + frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size); + + pcm_len = frame_size - analysis->analysis_offset; + offset = 0; + do { + tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix); + offset += 480; + pcm_len -= 480; + } while (pcm_len>0); + analysis->analysis_offset = frame_size; + + if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200) + { + int LM = 3; + LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps, + analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix); + while ((Fs/400<frame_size) + LM--; + frame_size = (Fs/400<analysis_offset -= frame_size; + + /* Only perform analysis up to 20-ms frames. Longer ones will be split if + they're in CELT-only mode. 
*/ + analysis_info->valid = 0; + tonality_get_info(analysis, analysis_info, frame_size); + + return frame_size; } diff --git a/src/analysis.h b/src/analysis.h index 6f3689da..37a8bf40 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -28,10 +28,16 @@ #ifndef ANALYSIS_H #define ANALYSIS_H +#include "celt.h" +#include "opus_private.h" + #define NB_FRAMES 8 #define NB_TBANDS 18 #define NB_TOT_BANDS 21 #define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */ + +#define DETECT_SIZE 200 + typedef struct { float angle[240]; float d_angle[240]; @@ -55,9 +61,23 @@ typedef struct { int last_transition; int count; int opus_bandwidth; + opus_val32 subframe_mem[3]; + int analysis_offset; + float pspeech[DETECT_SIZE]; + float pmusic[DETECT_SIZE]; + int write_pos; + int read_pos; + int read_subframe; + AnalysisInfo info[DETECT_SIZE]; } TonalityAnalysisState; void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, - CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth); + const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix); + +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len); + +int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm, + const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info); #endif diff --git a/src/opus_demo.c b/src/opus_demo.c index 6538aad6..a0acb0cd 100644 --- a/src/opus_demo.c +++ b/src/opus_demo.c @@ -244,7 +244,8 @@ int main(int argc, char *argv[]) int mode_switch_time = 48000; int nb_encoded; int remaining=0; - int variable_duration=0; + int variable_duration=OPUS_FRAMESIZE_ARG; + int delayed_decision=0; if (argc < 5 ) { @@ -313,7 +314,7 @@ int main(int argc, char *argv[]) forcechannels = OPUS_AUTO; use_dtx = 0; packet_loss_perc = 0; - max_frame_size = 
960*6; + max_frame_size = 2*48000; curr_read=0; while( args < argc - 2 ) { @@ -383,7 +384,11 @@ int main(int argc, char *argv[]) args++; } else if( strcmp( argv[ args ], "-variable-duration" ) == 0 ) { check_encoder_option(decode_only, "-variable-duration"); - variable_duration = 1; + variable_duration = OPUS_FRAMESIZE_VARIABLE; + args++; + } else if( strcmp( argv[ args ], "-delayed-decision" ) == 0 ) { + check_encoder_option(decode_only, "-delayed-decision"); + delayed_decision = 1; args++; } else if( strcmp( argv[ args ], "-dtx") == 0 ) { check_encoder_option(decode_only, "-dtx"); @@ -510,7 +515,7 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip)); opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16)); - opus_encoder_ctl(enc, OPUS_SET_EXPERT_VARIABLE_DURATION(variable_duration)); + opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); } if (!encode_only) { @@ -566,6 +571,26 @@ int main(int argc, char *argv[]) if ( use_inbandfec ) { data[1] = (unsigned char*)calloc(max_payload_bytes,sizeof(char)); } + if(delayed_decision) + { + if (variable_duration!=OPUS_FRAMESIZE_VARIABLE) + { + if (frame_size==sampling_rate/400) + variable_duration = OPUS_FRAMESIZE_2_5_MS; + else if (frame_size==sampling_rate/200) + variable_duration = OPUS_FRAMESIZE_5_MS; + else if (frame_size==sampling_rate/100) + variable_duration = OPUS_FRAMESIZE_10_MS; + else if (frame_size==sampling_rate/50) + variable_duration = OPUS_FRAMESIZE_20_MS; + else if (frame_size==sampling_rate/25) + variable_duration = OPUS_FRAMESIZE_40_MS; + else + variable_duration = OPUS_FRAMESIZE_60_MS; + opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); + } + frame_size = 2*48000; + } while (!stop) { if (delayed_celt) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 19778a40..3cee88b3 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -89,9 +89,9 @@ struct OpusEncoder { int first; opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; #ifndef 
FIXED_POINT - opus_val32 subframe_mem[3]; TonalityAnalysisState analysis; - int detected_bandwidth; + int detected_bandwidth; + int analysis_offset; #endif opus_uint32 rangeFinal; }; @@ -215,6 +215,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat st->voice_ratio = -1; st->encoder_buffer = st->Fs/100; st->lsb_depth = 24; + st->variable_duration = OPUS_FRAMESIZE_ARG; /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead + 1.5 ms for SILK resamplers and stereo prediction) */ @@ -665,28 +666,28 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_ return best_state; } -void downmix_float(const void *_x, float *sub, int subframe, int i, int C) +void downmix_float(const void *_x, float *sub, int subframe, int offset, int C) { const float *x; int c, j; x = (const float *)_x; for (j=0;j= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS) + new_size = IMAX(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS)); + else + return -1; + if (new_size>frame_size) + return -1; + if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs && + 50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs) + return -1; + return new_size; +} + opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, - unsigned char *data, opus_int32 out_data_bytes, int lsb_depth) + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , AnalysisInfo *analysis_info +#endif + ) { void *silk_enc; CELTEncoder *celt_enc; @@ -790,11 +817,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ opus_val16 HB_gain; opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */ int total_buffer; - int perform_analysis=0; - int orig_frame_size; -#ifndef FIXED_POINT - AnalysisInfo analysis_info; -#endif VARDECL(opus_val16, tmp_prefill); ALLOC_STACK; @@ -820,38 +842,15 @@ opus_int32 opus_encode_native(OpusEncoder *st, 
const opus_val16 *pcm, int frame_ lsb_depth = IMIN(lsb_depth, st->lsb_depth); - orig_frame_size = IMIN(frame_size,st->Fs/50); - if (st->variable_duration) - { - int LM = 3; + st->voice_ratio = -1; + #ifndef FIXED_POINT - LM = optimize_framesize(pcm, frame_size, st->channels, st->Fs, st->bitrate_bps, - st->analysis.prev_tonality, st->subframe_mem, delay_compensation, downmix_float); -#endif - while ((st->Fs/400<frame_size) - LM--; - frame_size = (st->Fs/400<silk_mode.complexity >= 7 && st->Fs==48000; - if (!perform_analysis) + st->detected_bandwidth = 0; + if (analysis_info->valid) { - st->voice_ratio = -1; - st->detected_bandwidth = 0; - } else if (frame_size <= st->Fs/50) - { - tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth); - if (frame_size > st->Fs/100) - tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); - if (analysis_info.valid) - { - if (st->signal_type == OPUS_AUTO) - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - st->detected_bandwidth = analysis_info.opus_bandwidth; - } + if (st->signal_type == OPUS_AUTO) + st->voice_ratio = (int)floor(.5+100*(1-analysis_info->music_prob)); + st->detected_bandwidth = analysis_info->opus_bandwidth; } #endif @@ -1161,7 +1160,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */ if (to_celt && i==nb_frames-1) st->user_forced_mode = MODE_CELT_ONLY; - tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth); + tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth +#ifndef FIXED_POINT + , analysis_info +#endif + ); if (tmp_len<0) { RESTORE_STACK; @@ -1187,16 +1190,6 @@ opus_int32 
opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ RESTORE_STACK; return ret; } -#ifndef FIXED_POINT - /* Perform analysis for 40-60 ms frames */ - if (perform_analysis && frame_size > st->Fs/50) - { - int nb_analysis = frame_size/(st->Fs/100); - for (i=0;ianalysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - } -#endif curr_bandwidth = st->bandwidth; /* Chooses the appropriate mode for speech @@ -1446,11 +1439,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ { opus_int32 bonus=0; #ifndef FIXED_POINT - if (st->variable_duration && orig_frame_size != frame_size) + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != st->Fs/50) { - bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size); - if (analysis_info.valid) - bonus = bonus*(1.f+.5*analysis_info.tonality); + bonus = (40*st->stream_channels+40)*(st->Fs/frame_size-50); + if (analysis_info->valid) + bonus = bonus*(1.f+.5*analysis_info->tonality); } #endif celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); @@ -1587,8 +1580,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (ec_tell(&enc) <= 8*nb_compr_bytes) { #ifndef FIXED_POINT - if (perform_analysis) - celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info)); + celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(analysis_info)); #endif ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc); if (ret < 0) @@ -1688,6 +1680,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, VARDECL(opus_int16, in); ALLOC_STACK; + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); if(frame_size<0) { RESTORE_STACK; @@ -1707,6 +1700,12 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, opus_int32 opus_encode(OpusEncoder *st, const 
opus_int16 *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes) { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16); } @@ -1715,21 +1714,74 @@ opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int frame_size, unsigned char *data, opus_int32 max_data_bytes) { int i, ret; + const CELTMode *celt_mode; + int delay_compensation; + int lsb_depth; VARDECL(float, in); + AnalysisInfo analysis_info; ALLOC_STACK; + opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode)); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(16, st->lsb_depth); + + analysis_info.valid = 0; + if (st->silk_mode.complexity >= 7 && st->Fs==48000) + { + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset, + frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_int, &analysis_info); + } else { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + } + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + ALLOC(in, frame_size*st->channels, float); for (i=0;ichannels;i++) in[i] = (1.0f/32768)*pcm[i]; - ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16); + ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16, &analysis_info); RESTORE_STACK; return ret; } opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes) { - return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24); + const CELTMode *celt_mode; + int delay_compensation; + int lsb_depth; + AnalysisInfo analysis_info; + + opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode)); + if 
(st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(24, st->lsb_depth); + + analysis_info.valid = 0; + if (st->silk_mode.complexity >= 7 && st->Fs==48000) + { + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset, + frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_float, &analysis_info); + } else { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + } + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24, &analysis_info); } #endif @@ -1998,15 +2050,13 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) *value = st->lsb_depth; } break; - case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); - if (value<0 || value>1) - goto bad_arg; st->variable_duration = value; } break; - case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 *value = va_arg(ap, opus_int32*); *value = st->variable_duration; @@ -2041,6 +2091,15 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) 
st->user_forced_mode = value; } break; + + case CELT_GET_MODE_REQUEST: + { + const CELTMode ** value = va_arg(ap, const CELTMode**); + if (value==0) + goto bad_arg; + celt_encoder_ctl(celt_enc, CELT_GET_MODE(value)); + } + break; default: /* fprintf(stderr, "unknown opus_encoder_ctl() request: %d", request);*/ ret = OPUS_UNIMPLEMENTED; diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index c5fb3669..c6204185 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -36,8 +36,10 @@ #include #include "float_cast.h" #include "os_support.h" +#include "analysis.h" struct OpusMSEncoder { + TonalityAnalysisState analysis; ChannelLayout layout; int variable_duration; opus_int32 bitrate_bps; @@ -105,6 +107,7 @@ int opus_multistream_encoder_init( st->layout.nb_coupled_streams = coupled_streams; st->bitrate_bps = OPUS_AUTO; + st->variable_duration = OPUS_FRAMESIZE_ARG; for (i=0;ilayout.nb_channels;i++) st->layout.mapping[i] = mapping[i]; if (!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout)) @@ -187,6 +190,7 @@ static int opus_multistream_encode_native int lsb_depth #ifndef FIXED_POINT , downmix_func downmix + , const void *pcm_analysis #endif ) { @@ -202,10 +206,15 @@ static int opus_multistream_encode_native int orig_frame_size; int coded_channels; opus_int32 channel_rate; + opus_int32 complexity; + AnalysisInfo analysis_info; + const CELTMode *celt_mode; ALLOC_STACK; ptr = (char*)st + align(sizeof(OpusMSEncoder)); opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs)); + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_COMPLEXITY(&complexity)); + opus_encoder_ctl((OpusEncoder*)ptr, CELT_GET_MODE(&celt_mode)); if (400*frame_size < Fs) { @@ -213,24 +222,24 @@ static int opus_multistream_encode_native return OPUS_BAD_ARG; } orig_frame_size = IMIN(frame_size,Fs/50); - if (st->variable_duration) +#ifndef FIXED_POINT + analysis_info.valid = 0; + if (complexity >= 7 && Fs==48000) { - int LM = 3; - int 
channels; opus_int32 delay_compensation; + int channels; channels = st->layout.nb_streams + st->layout.nb_coupled_streams; opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation)); delay_compensation -= Fs/400; -#ifndef FIXED_POINT - LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps, - 0.f, st->subframe_mem, delay_compensation, downmix); -#endif - while ((Fs/400<frame_size) - LM--; - frame_size = (Fs/400<analysis, celt_mode, pcm, pcm_analysis, + frame_size, st->variable_duration, channels, Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix, &analysis_info); + } else +#endif + { + frame_size = frame_size_select(frame_size, st->variable_duration, Fs); + } /* Validate frame_size before using it to allocate stack space. This mirrors the checks in opus_encode[_float](). */ if (400*frame_size != Fs && 200*frame_size != Fs && @@ -262,10 +271,10 @@ static int opus_multistream_encode_native channel_rate = st->bitrate_bps/coded_channels; } #ifndef FIXED_POINT - if (st->variable_duration && orig_frame_size != frame_size) + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != Fs/50) { opus_int32 bonus; - bonus = 60*(48000/frame_size-48000/orig_frame_size); + bonus = 60*(Fs/frame_size-50); channel_rate += bonus; } #endif @@ -313,7 +322,11 @@ static int opus_multistream_encode_native /* Reserve three bytes for the last stream and four for the others */ curr_max -= IMAX(0,4*(st->layout.nb_streams-s-1)-1); curr_max = IMIN(curr_max,MS_FRAME_TMP); - len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth); + len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth +#ifndef FIXED_POINT + , &analysis_info +#endif + ); if (len<0) { RESTORE_STACK; @@ -412,8 +425,9 @@ int opus_multistream_encode_float opus_int32 max_data_bytes ) { + int channels = st->layout.nb_streams + st->layout.nb_coupled_streams; return opus_multistream_encode_native(st, opus_copy_channel_in_float, - pcm, 
frame_size, data, max_data_bytes, 24, downmix_float); + pcm, frame_size, data, max_data_bytes, 24, downmix_float, pcm+channels*st->analysis.analysis_offset); } int opus_multistream_encode( @@ -424,8 +438,9 @@ int opus_multistream_encode( opus_int32 max_data_bytes ) { + int channels = st->layout.nb_streams + st->layout.nb_coupled_streams; return opus_multistream_encode_native(st, opus_copy_channel_in_short, - pcm, frame_size, data, max_data_bytes, 16, downmix_int); + pcm, frame_size, data, max_data_bytes, 16, downmix_int, pcm+channels*st->analysis.analysis_offset); } #endif @@ -562,7 +577,7 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) *value = (OpusEncoder*)ptr; } break; - case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); if (value<0 || value>1) @@ -570,7 +585,7 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) st->variable_duration = value; } break; - case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 *value = va_arg(ap, opus_int32*); *value = st->variable_duration; diff --git a/src/opus_private.h b/src/opus_private.h index 33a982e5..1da5748b 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -31,6 +31,7 @@ #include "arch.h" #include "opus.h" +#include "celt.h" struct OpusRepacketizer { unsigned char toc; @@ -82,17 +83,23 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev); #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x) typedef void (*downmix_func)(const void *, float *, int, int, int); -void downmix_float(const void *_x, float *sub, int subframe, int i, int C); -void downmix_int(const void *_x, float *sub, int subframe, int i, int C); +void downmix_float(const void *_x, float *sub, int subframe, int offset, int C); +void downmix_int(const void *_x, float *sub, int subframe, int offset, int C); int 
optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering, - void (*downmix)(const void *, float *, int, int, int)); + downmix_func downmix); int encode_size(int size, unsigned char *data); +opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs); + opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, - unsigned char *data, opus_int32 out_data_bytes, int lsb_depth); + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , AnalysisInfo *analysis_info +#endif + ); int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited, int *packet_offset); From 74f36b56dd8ad176fecd3344fd27ed9a99a44221 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Wed, 20 Feb 2013 22:31:49 -0500 Subject: [PATCH 13/15] oops s/IMAX/IMIN/ --- src/opus_encoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 3cee88b3..7cff8428 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -773,7 +773,7 @@ opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_ else if (variable_duration == OPUS_FRAMESIZE_VARIABLE) new_size = Fs/50; else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS) - new_size = IMAX(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS)); + new_size = IMIN(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS)); else return -1; if (new_size>frame_size) From 742aac10568839e08e5ee9fe3e92ba587c63e374 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Fri, 22 Feb 2013 16:44:56 -0500 Subject: [PATCH 14/15] Adds silence probability to speech/music detector Avoids biasing the decision when it's all silence/noise. 
--- src/analysis.c | 54 +++++------------ src/mlp_data.c | 153 ++++++++++++++++++++++++++++-------------------- src/mlp_train.c | 21 ++++--- 3 files changed, 119 insertions(+), 109 deletions(-) diff --git a/src/analysis.c b/src/analysis.c index 54005d3a..34ea5107 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -141,7 +141,6 @@ static inline float fast_atan2f(float y, float x) { void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) { -#if 1 int pos; int curr_lookahead; float psum; @@ -184,31 +183,6 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int /*printf("%f %f\n", psum, info_out->music_prob);*/ info_out->music_prob = psum; -#else - /* If data not available, return invalid */ - if (tonal->read_pos==tonal->write_pos) - { - info_out->valid=0; - return; - } - - OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1); - tonal->read_subframe += len/480; - while (tonal->read_subframe>=4) - { - tonal->read_subframe -= 4; - tonal->read_pos++; - } - if (tonal->read_pos>=DETECT_SIZE) - tonal->read_pos-=DETECT_SIZE; - if (tonal->read_pos == tonal->write_pos) - { - tonal->read_pos = tonal->write_pos-1; - if (tonal->read_pos<0) - tonal->read_pos=DETECT_SIZE-1; - tonal->read_subframe = 3; - } -#endif } void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix) @@ -234,7 +208,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con float slope=0; float frame_stationarity; float relativeE; - float frame_prob; + float frame_probs[2]; float alpha, alphaE, alphaE2; float frame_loudness; float bandwidth_mask; @@ -494,32 +468,34 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con features[24] = tonal->lowECount; #ifndef FIXED_POINT - mlp_process(&net, features, &frame_prob); - frame_prob = .5f*(frame_prob+1); + mlp_process(&net, 
features, frame_probs); + frame_probs[0] = .5f*(frame_probs[0]+1); /* Curve fitting between the MLP probability and the actual probability */ - frame_prob = .01f + 1.21f*frame_prob*frame_prob - .23f*(float)pow(frame_prob, 10); + frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10); + frame_probs[1] = .5*frame_probs[1]+.5; + frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5; - /*printf("%f\n", frame_prob);*/ + /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ { float tau, beta; float p0, p1; float max_certainty; /* One transition every 3 minutes */ - tau = .00005f; - beta = .1f; + tau = .00005f*frame_probs[1]; + beta = .05f; max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition); max_certainty = 0; p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; - p0 *= (float)pow(1-frame_prob, beta); - p1 *= (float)pow(frame_prob, beta); + p0 *= (float)pow(1-frame_probs[0], beta); + p1 *= (float)pow(frame_probs[0], beta); tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1))); info->music_prob = tonal->music_prob; - info->music_prob = frame_prob; + info->music_prob = frame_probs[0]; float psum=1e-20; - float speech0 = (float)pow(1-frame_prob, beta); - float music0 = (float)pow(frame_prob, beta); + float speech0 = (float)pow(1-frame_probs[0], beta); + float music0 = (float)pow(frame_probs[0], beta); if (tonal->count==1) { tonal->pspeech[0]=.5; @@ -550,7 +526,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con for (i=1;ipspeech[i]; - /*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/ + /*printf("%f\n", psum);*/ } if (tonal->last_music != (tonal->music_prob>.5f)) tonal->last_transition=0; diff --git a/src/mlp_data.c b/src/mlp_data.c index 5c13ca40..9085b85f 100644 --- a/src/mlp_data.c +++ b/src/mlp_data.c @@ -3,74 +3,103 @@ #include "mlp.h" -/* RMS error was 0.179835, seed 
was 1322103961 */ +/* RMS error was 0.138320, seed was 1361535663 */ -static const float weights[271] = { +static const float weights[422] = { /* hidden layer */ -1.55597f, -0.0739792f, -0.0646761f, -0.099531f, -0.0794943f, -0.0180174f, -0.0391354f, 0.0508224f, -0.0160169f, -0.0773263f, --0.0300002f, -0.0865361f, 0.124477f, -0.28648f, -0.0860702f, --0.518949f, -0.0873341f, -0.235393f, -0.907833f, -0.383573f, -0.535388f, -0.57944f, 0.98116f, 0.8482f, 1.12426f, --3.23721f, -0.647072f, -0.0265139f, 0.0711052f, -0.00125666f, --0.0396181f, -0.44282f, -0.510495f, -0.201865f, 0.0134336f, --0.167205f, -0.155406f, 0.00041678f, -0.00468705f, -0.0233224f, -0.264279f, -0.301375f, 0.00234895f, 0.0144741f, -0.137535f, -0.200323f, 0.0192027f, 3.19818f, 2.03495f, 0.705517f, --4.6025f, -0.11485f, -0.792716f, 0.150714f, 0.10608f, -0.240633f, 0.0690698f, 0.0695297f, 0.124819f, 0.0501433f, -0.0460952f, 0.147639f, 0.10327f, 0.158007f, 0.113714f, -0.0276191f, 0.0680749f, -0.130012f, 0.0796126f, 0.133067f, -0.51495f, 0.747578f, -0.128742f, 5.98112f, -1.16698f, --0.276492f, -1.73549f, -3.90234f, 2.01489f, -0.040118f, --0.113002f, -0.146751f, -0.113569f, 0.0534873f, 0.0989832f, -0.0872875f, 0.049266f, 0.0367557f, -0.00889148f, -0.0648461f, --0.00190352f, 0.0143773f, 0.0259364f, -0.0592133f, -0.0672924f, -0.1399f, -0.0987886f, -0.347402f, 0.101326f, -0.0680876f, -0.469186f, 0.246922f, 10.4017f, 3.44846f, -0.662725f, --0.0328208f, -0.0561274f, -0.0167744f, 0.00044282f, -0.0457645f, --0.0408314f, -0.013113f, -0.0373873f, -0.0474122f, -0.0273745f, --0.0308505f, 0.000582959f, -0.0421135f, 0.464859f, 0.196842f, -0.320538f, 0.0435528f, -0.200168f, 0.266475f, -0.0853727f, -1.20397f, 0.711542f, -1.04397f, -1.47759f, 1.26768f, -0.446958f, 0.266477f, -0.30802f, 0.28431f, -0.118541f, -0.00836345f, 0.0689026f, -0.0137996f, -0.0395417f, 0.26982f, --0.206255f, 0.16066f, 0.114757f, 0.359587f, -0.106503f, --0.0948534f, 0.175358f, -0.122966f, -0.0056675f, 0.483848f, --0.134916f, -0.427567f, -0.140172f, 
-1.0866f, -2.73921f, -0.549843f, 0.17685f, 0.0010675f, -0.00137386f, 0.0884424f, --0.0698736f, -0.00174136f, 0.0718775f, -0.0396849f, 0.0448056f, -0.0577853f, -0.0372353f, 0.134599f, 0.0260656f, 0.140322f, -0.22704f, -0.020568f, -0.0142424f, -0.21723f, -0.997704f, --0.884573f, -0.163495f, 2.33617f, 0.224142f, 0.19635f, --0.957387f, 0.144678f, 1.47035f, -0.00700498f, -0.0472309f, --0.0137848f, -0.0189145f, 0.00856479f, 0.0316965f, 0.00613373f, -0.00209807f, 0.00270964f, -0.0490206f, 0.0105712f, -0.0465045f, --0.0381532f, -0.0985268f, -0.108297f, 0.0146409f, -0.0040718f, --0.0698572f, -0.380568f, -0.230479f, 3.98917f, 0.457652f, --1.02355f, -7.4435f, -0.475314f, 1.61743f, 0.0254017f, --0.00791293f, 0.047217f, 0.0220995f, -0.0304311f, 0.0052168f, --0.0404054f, -0.0230293f, 0.00169229f, -0.0138178f, 0.0043137f, --0.0598088f, -0.133601f, 0.0555138f, -0.177358f, -0.159856f, --0.137281f, 0.108051f, -0.305973f, 0.393775f, 0.0747287f, -0.783993f, -0.875086f, 1.06862f, 0.340519f, -0.352681f, --0.0830912f, -0.100017f, 0.0729085f, -0.00829403f, 0.027489f, --0.0779597f, 0.082286f, -0.164181f, -0.41519f, 0.00282335f, --0.29573f, 0.125571f, 0.726935f, 0.392137f, 0.491348f, -0.0723196f, -0.0259758f, -0.0636332f, -0.452384f, -0.000225974f, --2.34001f, 2.45211f, -0.544628f, 5.62944f, -3.44507f, +-0.0941125f, -0.302976f, -0.603555f, -0.19393f, -0.185983f, +-0.601617f, -0.0465317f, -0.114563f, -0.103599f, -0.618938f, +-0.317859f, -0.169949f, -0.0702885f, 0.148065f, 0.409524f, +0.548432f, 0.367649f, -0.494393f, 0.764306f, -1.83957f, +0.170849f, 12.786f, -1.08848f, -1.27284f, -16.2606f, +24.1773f, -5.57454f, -0.17276f, -0.163388f, -0.224421f, +-0.0948944f, -0.0728695f, -0.26557f, -0.100283f, -0.0515459f, +-0.146142f, -0.120674f, -0.180655f, 0.12857f, 0.442138f, +-0.493735f, 0.167767f, 0.206699f, -0.197567f, 0.417999f, +1.50364f, -0.773341f, -10.0401f, 0.401872f, 2.97966f, +15.2165f, -1.88905f, -1.19254f, 0.0285397f, -0.00405139f, +0.0707565f, 0.00825699f, -0.0927269f, -0.010393f, 
-0.00428882f, +-0.00489743f, -0.0709731f, -0.00255992f, 0.0395619f, 0.226424f, +0.0325231f, 0.162175f, -0.100118f, 0.485789f, 0.12697f, +0.285937f, 0.0155637f, 0.10546f, 3.05558f, 1.15059f, +-1.00904f, -1.83088f, 3.31766f, -3.42516f, -0.119135f, +-0.0405654f, 0.00690068f, 0.0179877f, -0.0382487f, 0.00597941f, +-0.0183611f, 0.00190395f, -0.144322f, -0.0435671f, 0.000990594f, +0.221087f, 0.142405f, 0.484066f, 0.404395f, 0.511955f, +-0.237255f, 0.241742f, 0.35045f, -0.699428f, 10.3993f, +2.6507f, -2.43459f, -4.18838f, 1.05928f, 1.71067f, +0.00667811f, -0.0721335f, -0.0397346f, 0.0362704f, -0.11496f, +-0.0235776f, 0.0082161f, -0.0141741f, -0.0329699f, -0.0354253f, +0.00277404f, -0.290654f, -1.14767f, -0.319157f, -0.686544f, +0.36897f, 0.478899f, 0.182579f, -0.411069f, 0.881104f, +-4.60683f, 1.4697f, 0.335845f, -1.81905f, -30.1699f, +5.55225f, 0.0019508f, -0.123576f, -0.0727332f, -0.0641597f, +-0.0534458f, -0.108166f, -0.0937368f, -0.0697883f, -0.0275475f, +-0.192309f, -0.110074f, 0.285375f, -0.405597f, 0.0926724f, +-0.287881f, -0.851193f, -0.099493f, -0.233764f, -1.2852f, +1.13611f, 3.12168f, -0.0699f, -1.86216f, 2.65292f, +-7.31036f, 2.44776f, -0.00111802f, -0.0632786f, -0.0376296f, +-0.149851f, 0.142963f, 0.184368f, 0.123433f, 0.0756158f, +0.117312f, 0.0933395f, 0.0692163f, 0.0842592f, 0.0704683f, +0.0589963f, 0.0942205f, -0.448862f, 0.0262677f, 0.270352f, +-0.262317f, 0.172586f, 2.00227f, -0.159216f, 0.038422f, +10.2073f, 4.15536f, -2.3407f, -0.0550265f, 0.00964792f, +-0.141336f, 0.0274501f, 0.0343921f, -0.0487428f, 0.0950172f, +-0.00775017f, -0.0372492f, -0.00548121f, -0.0663695f, 0.0960506f, +-0.200008f, -0.0412827f, 0.58728f, 0.0515787f, 0.337254f, +0.855024f, 0.668371f, -0.114904f, -3.62962f, -0.467477f, +-0.215472f, 2.61537f, 0.406117f, -1.36373f, 0.0425394f, +0.12208f, 0.0934502f, 0.123055f, 0.0340935f, -0.142466f, +0.035037f, -0.0490666f, 0.0733208f, 0.0576672f, 0.123984f, +-0.0517194f, -0.253018f, 0.590565f, 0.145849f, 0.315185f, +0.221534f, -0.149081f, 
0.216161f, -0.349575f, 24.5664f, +-0.994196f, 0.614289f, -18.7905f, -2.83277f, -0.716801f, +-0.347201f, 0.479515f, -0.246027f, 0.0758683f, 0.137293f, +-0.17781f, 0.118751f, -0.00108329f, -0.237334f, 0.355732f, +-0.12991f, -0.0547627f, -0.318576f, -0.325524f, 0.180494f, +-0.0625604f, 0.141219f, 0.344064f, 0.37658f, -0.591772f, +5.8427f, -0.38075f, 0.221894f, -1.41934f, -1.87943e+06f, +1.34114f, 0.0283355f, -0.0447856f, -0.0211466f, -0.0256927f, +0.0139618f, 0.0207934f, -0.0107666f, 0.0110969f, 0.0586069f, +-0.0253545f, -0.0328433f, 0.11872f, -0.216943f, 0.145748f, +0.119808f, -0.0915211f, -0.120647f, -0.0787719f, -0.143644f, +-0.595116f, -1.152f, -1.25335f, -1.17092f, 4.34023f, +-975268.f, -1.37033f, -0.0401123f, 0.210602f, -0.136656f, +0.135962f, -0.0523293f, 0.0444604f, 0.0143928f, 0.00412666f, +-0.0193003f, 0.218452f, -0.110204f, -2.02563f, 0.918238f, +-2.45362f, 1.19542f, -0.061362f, -1.92243f, 0.308111f, +0.49764f, 0.912356f, 0.209272f, -2.34525f, 2.19326f, +-6.47121f, 1.69771f, -0.725123f, 0.0118929f, 0.0377944f, +0.0554003f, 0.0226452f, -0.0704421f, -0.0300309f, 0.0122978f, +-0.0041782f, -0.0686612f, 0.0313115f, 0.039111f, 0.364111f, +-0.0945548f, 0.0229876f, -0.17414f, 0.329795f, 0.114714f, +0.30022f, 0.106997f, 0.132355f, 5.79932f, 0.908058f, +-0.905324f, -3.3561f, 0.190647f, 0.184211f, -0.673648f, +0.231807f, -0.0586222f, 0.230752f, -0.438277f, 0.245857f, +-0.17215f, 0.0876383f, -0.720512f, 0.162515f, 0.0170571f, +0.101781f, 0.388477f, 1.32931f, 1.08548f, -0.936301f, +-2.36958f, -6.71988f, -3.44376f, 2.13818f, 14.2318f, +4.91459f, -3.09052f, -9.69191f, -0.768234f, 1.79604f, +0.0549653f, 0.163399f, 0.0797025f, 0.0343933f, -0.0555876f, +-0.00505673f, 0.0187258f, 0.0326628f, 0.0231486f, 0.15573f, +0.0476223f, -0.254824f, 1.60155f, -0.801221f, 2.55496f, +0.737629f, -1.36249f, -0.695463f, -2.44301f, -1.73188f, +3.95279f, 1.89068f, 0.486087f, -11.3343f, 3.9416e+06f, /* output layer */ --3.13835f, 0.994751f, 0.444901f, 1.59518f, 1.23665f, -3.37012f, -1.34606f, 
1.99131f, 1.33476f, 1.3885f, -1.12559f, }; +-0.381439, 0.12115, -0.906927, 2.93878, 1.6388, +0.882811, 0.874344, 1.21726, -0.874545, 0.321706, +0.785055, 0.946558, -0.575066, -3.46553, 0.884905, +0.0924047, -9.90712, 0.391338, 0.160103, -2.04954, +4.1455, 0.0684029, -0.144761, -0.285282, 0.379244, +-1.1584, -0.0277241, -9.85, -4.82386, 3.71333, +3.87308, 3.52558, }; -static const int topo[3] = {25, 10, 1}; +static const int topo[3] = {25, 15, 2}; const MLP net = { - 3, - topo, - weights + 3, + topo, + weights }; - diff --git a/src/mlp_train.c b/src/mlp_train.c index 5fbbff08..2e9568ba 100644 --- a/src/mlp_train.c +++ b/src/mlp_train.c @@ -106,6 +106,7 @@ MLPTrain * mlp_init(int *topo, int nbLayers, float *inputs, float *outputs, int } #define MAX_NEURONS 100 +#define MAX_OUT 10 double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamples, double *W0_grad, double *W1_grad, double *error_rate) { @@ -120,7 +121,8 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp double netOut[MAX_NEURONS]; double error[MAX_NEURONS]; - *error_rate = 0; + for (i=0;itopo; inDim = net->topo[0]; hiddenDim = net->topo[1]; @@ -153,7 +155,7 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp netOut[i] = tansig_approx(sum); error[i] = out[i] - netOut[i]; rms += error[i]*error[i]; - *error_rate += fabs(error[i])>1; + error_rate[i] += fabs(error[i])>1; /*error[i] = error[i]/(1+fabs(error[i]));*/ } /* Back-propagate error */ @@ -194,7 +196,7 @@ struct GradientArg { double *W0_grad; double *W1_grad; double rms; - double error_rate; + double error_rate[MAX_OUT]; }; void *gradient_thread_process(void *_arg) @@ -213,7 +215,7 @@ void *gradient_thread_process(void *_arg) sem_wait(&sem_begin[arg->id]); if (arg->done) break; - arg->rms = compute_gradient(arg->net, arg->inputs, arg->outputs, arg->nbSamples, arg->W0_grad, arg->W1_grad, &arg->error_rate); + arg->rms = compute_gradient(arg->net, arg->inputs, 
arg->outputs, arg->nbSamples, arg->W0_grad, arg->W1_grad, arg->error_rate); sem_post(&sem_end[arg->id]); } fprintf(stderr, "done\n"); @@ -295,7 +297,7 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam for (e=0;e Date: Thu, 28 Feb 2013 15:30:51 -0500 Subject: [PATCH 15/15] Makes the speech/music probability estimation mode conservative This is done using an adaptive beta and an estimate of the speech and music detection confidence --- src/analysis.c | 39 +++++++++++++++++++++++++++++++++------ src/analysis.h | 4 ++++ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/src/analysis.c b/src/analysis.c index 34ea5107..14b2246c 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -180,6 +180,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int psum += tonal->pmusic[i]; for (;ipspeech[i]; + psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; /*printf("%f %f\n", psum, info_out->music_prob);*/ info_out->music_prob = psum; @@ -479,19 +480,22 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con { float tau, beta; float p0, p1; - float max_certainty; /* One transition every 3 minutes */ tau = .00005f*frame_probs[1]; beta = .05f; - max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition); - max_certainty = 0; + if (1) { + /* Adapt beta based on how "unexpected" the new prob is */ + float p, q; + p = MAX16(.05f,MIN16(.95f,frame_probs[0])); + q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); + beta = .01+.05*ABS16(p-q)/(p*(1-q)+q*(1-p)); + } p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; p0 *= (float)pow(1-frame_probs[0], beta); p1 *= (float)pow(frame_probs[0], beta); - tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1))); + tonal->music_prob = p1/(p0+p1); info->music_prob = tonal->music_prob; - info->music_prob = frame_probs[0]; float psum=1e-20; float 
speech0 = (float)pow(1-frame_probs[0], beta); @@ -526,7 +530,30 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con for (i=1;ipspeech[i]; - /*printf("%f\n", psum);*/ + /* Estimate our confidence in the speech/music decisions */ + if (frame_probs[1]>.75) + { + if (tonal->music_prob>.9) + { + float adapt; + adapt = 1.f/(++tonal->music_confidence_count); + tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); + tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); + } + if (tonal->music_prob<.1) + { + float adapt; + adapt = 1.f/(++tonal->speech_confidence_count); + tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); + tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); + } + } else { + if (tonal->music_confidence_count==0) + tonal->music_confidence = .9; + if (tonal->speech_confidence_count==0) + tonal->speech_confidence = .1; + } + psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum)); } if (tonal->last_music != (tonal->music_prob>.5f)) tonal->last_transition=0; diff --git a/src/analysis.h b/src/analysis.h index 37a8bf40..7b17118c 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -65,6 +65,10 @@ typedef struct { int analysis_offset; float pspeech[DETECT_SIZE]; float pmusic[DETECT_SIZE]; + float speech_confidence; + float music_confidence; + int speech_confidence_count; + int music_confidence_count; int write_pos; int read_pos; int read_subframe;