From 7ebacf430a465d000d97d6d9015f8f6061af8804 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Tue, 13 Nov 2012 02:24:07 -0500
Subject: [PATCH 01/15] Moves analysis to the beginning of opus_encode()

---
 src/analysis.c     | 18 ++++++++++--------
 src/analysis.h     |  1 +
 src/opus_encoder.c | 45 ++++++++++++++++++++++-----------------------
 3 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/analysis.c b/src/analysis.c
index 22a8fa79..08975992 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -185,19 +185,21 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
        for (i=0;i<N2;i++)
        {
           float w = analysis_window[i];
-          in[i].r = MULT16_16(w, x[i]);
-          in[i].i = MULT16_16(w, x[N-N2+i]);
-          in[N-i-1].r = MULT16_16(w, x[N-i-1]);
-          in[N-i-1].i = MULT16_16(w, x[2*N-N2-i-1]);
+          in[i].r = MULT16_16(w, tonal->inmem[i]);
+          in[i].i = MULT16_16(w, x[i]);
+          in[N-i-1].r = MULT16_16(w, x[N2-i-1]);
+          in[N-i-1].i = MULT16_16(w, x[N-i-1]);
+          tonal->inmem[i] = x[N2+i];
        }
     } else {
        for (i=0;i<N2;i++)
        {
           float w = analysis_window[i];
-          in[i].r = MULT16_16(w, x[2*i]+x[2*i+1]);
-          in[i].i = MULT16_16(w, x[2*(N-N2+i)]+x[2*(N-N2+i)+1]);
-          in[N-i-1].r = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]);
-          in[N-i-1].i = MULT16_16(w, x[2*(2*N-N2-i-1)]+x[2*(2*N-N2-i-1)+1]);
+          in[i].r = MULT16_16(w, tonal->inmem[i]);
+          in[i].i = MULT16_16(w, x[2*i]+x[2*i+1]);
+          in[N-i-1].r = MULT16_16(w, x[2*(N2-i-1)]+x[2*(N2-i-1)+1]);
+          in[N-i-1].i = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]);
+          tonal->inmem[i] = x[2*(N2+i)]+x[2*(N2+i)+1];
        }
     }
     opus_fft(kfft, in, out);
diff --git a/src/analysis.h b/src/analysis.h
index bf8ad40a..951ded50 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -36,6 +36,7 @@ typedef struct {
    float angle[240];
    float d_angle[240];
    float d2_angle[240];
+   float inmem[240];
    float prev_band_tonality[NB_TBANDS];
    float prev_tonality;
    float E[NB_FRAMES][NB_TBANDS];
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index ec7d6e7b..87f0147a 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -592,18 +592,32 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     lsb_depth = IMIN(lsb_depth, st->lsb_depth);
 
 #ifndef FIXED_POINT
-    perform_analysis = st->silk_mode.complexity >= 7 && frame_size >= st->Fs/100 && st->Fs==48000;
+    /* Only perform analysis for 10- and 20-ms frames. We don't have enough buffering for shorter
+       ones and longer ones will be split if they're in CELT-only mode. */
+    perform_analysis = st->silk_mode.complexity >= 7
+                       && (frame_size >= st->Fs/100 || frame_size >= st->Fs/50)
+                       && st->Fs==48000;
+    if (perform_analysis)
+    {
+       int nb_analysis_frames;
+       nb_analysis_frames = frame_size/(st->Fs/100);
+       for (i=0;i<nb_analysis_frames;i++)
+          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, st->channels, lsb_depth);
+       if (st->signal_type == OPUS_AUTO)
+          st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
+       st->detected_bandwidth = analysis_info.opus_bandwidth;
+    } else {
+       analysis_info.valid = 0;
+       st->voice_ratio = -1;
+       st->detected_bandwidth = 0;
+    }
 #endif
+
     if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
        delay_compensation = 0;
     else
        delay_compensation = st->delay_compensation;
-    if (perform_analysis)
-    {
-       total_buffer = IMAX(st->Fs/200, delay_compensation);
-    } else {
-       total_buffer = delay_compensation;
-    }
+    total_buffer = delay_compensation;
     extra_buffer = total_buffer-delay_compensation;
     st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes);
 
@@ -975,22 +989,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
     }
 
-#ifndef FIXED_POINT
-    if (perform_analysis)
-    {
-       int nb_analysis_frames;
-       nb_analysis_frames = frame_size/(st->Fs/100);
-       for (i=0;i<nb_analysis_frames;i++)
-          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm_buf+i*(st->Fs/100)*st->channels, st->channels, lsb_depth);
-       if (st->signal_type == OPUS_AUTO)
-          st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
-       st->detected_bandwidth = analysis_info.opus_bandwidth;
-    } else {
-       analysis_info.valid = 0;
-       st->voice_ratio = -1;
-       st->detected_bandwidth = 0;
-    }
-#endif
+
 
     /* SILK processing */
     HB_gain = Q15ONE;

From 48ac122141c317964fae2987eaea161c46538717 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Wed, 14 Nov 2012 02:39:27 -0500
Subject: [PATCH 02/15] Makes analysis usable for all frame sizes

---
 celt/celt_encoder.c |  2 +-
 src/analysis.c      | 55 +++++++++++++++++++++++++++++----------------
 src/analysis.h      | 10 +++++----
 src/opus_encoder.c  | 18 +++++++--------
 4 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 0d92c1ff..fd811360 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1274,7 +1274,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
 
       prefilter_tapset = st->tapset_decision;
       pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes);
-      if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && st->analysis.tonality > .3
+      if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3)
             && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period))
          pitch_change = 1;
       if (pf_on==0)
diff --git a/src/analysis.c b/src/analysis.c
index 08975992..6b07890a 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -139,7 +139,7 @@ static inline float fast_atan2f(float y, float x) {
    }
 }
 
-void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int C, int lsb_depth)
+void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth)
 {
     int i, b;
     const CELTMode *mode;
@@ -170,6 +170,8 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
     int bandwidth=0;
     float maxE = 0;
     float noise_floor;
+    int remaining;
+
     celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
 
     tonal->last_transition++;
@@ -180,28 +182,43 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
     if (tonal->count<4)
        tonal->music_prob = .5;
     kfft = mode->mdct.kfft[0];
+    if (tonal->count==0)
+       tonal->mem_fill = 240;
     if (C==1)
     {
-       for (i=0;i<N2;i++)
-       {
-          float w = analysis_window[i];
-          in[i].r = MULT16_16(w, tonal->inmem[i]);
-          in[i].i = MULT16_16(w, x[i]);
-          in[N-i-1].r = MULT16_16(w, x[N2-i-1]);
-          in[N-i-1].i = MULT16_16(w, x[N-i-1]);
-          tonal->inmem[i] = x[N2+i];
-       }
+       for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
+          tonal->inmem[i+tonal->mem_fill] = x[i];
     } else {
-       for (i=0;i<N2;i++)
-       {
-          float w = analysis_window[i];
-          in[i].r = MULT16_16(w, tonal->inmem[i]);
-          in[i].i = MULT16_16(w, x[2*i]+x[2*i+1]);
-          in[N-i-1].r = MULT16_16(w, x[2*(N2-i-1)]+x[2*(N2-i-1)+1]);
-          in[N-i-1].i = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]);
-          tonal->inmem[i] = x[2*(N2+i)]+x[2*(N2+i)+1];
-       }
+       for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
+          tonal->inmem[i+tonal->mem_fill] = x[2*i]+x[2*i+1];
     }
+    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
+    {
+       tonal->mem_fill += len;
+       /* Don't have enough to update the analysis */
+       return;
+    }
+
+    for (i=0;i<N2;i++)
+    {
+       float w = analysis_window[i];
+       in[i].r = MULT16_16(w, tonal->inmem[i]);
+       in[i].i = MULT16_16(w, tonal->inmem[N2+i]);
+       in[N-i-1].r = MULT16_16(w, tonal->inmem[N-i-1]);
+       in[N-i-1].i = MULT16_16(w, tonal->inmem[N+N2-i-1]);
+    }
+    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
+    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
+    if (C==1)
+    {
+       for (i=0;i<remaining;i++)
+          tonal->inmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i];
+    } else {
+       for (i=0;i<remaining;i++)
+          tonal->inmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)]
+                              + x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1];
+    }
+    tonal->mem_fill = 240 + remaining;
     opus_fft(kfft, in, out);
 
     for (i=1;i<N2;i++)
diff --git a/src/analysis.h b/src/analysis.h
index 951ded50..6f3689da 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -31,16 +31,18 @@
 #define NB_FRAMES 8
 #define NB_TBANDS 18
 #define NB_TOT_BANDS 21
-
+#define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */
 typedef struct {
    float angle[240];
    float d_angle[240];
    float d2_angle[240];
-   float inmem[240];
+   float inmem[ANALYSIS_BUF_SIZE];
+   int   mem_fill;                      /* number of usable samples in the buffer */
    float prev_band_tonality[NB_TBANDS];
    float prev_tonality;
    float E[NB_FRAMES][NB_TBANDS];
-   float lowE[NB_TBANDS], highE[NB_TBANDS];
+   float lowE[NB_TBANDS];
+   float highE[NB_TBANDS];
    float meanE[NB_TOT_BANDS];
    float mem[32];
    float cmean[8];
@@ -56,6 +58,6 @@ typedef struct {
 } TonalityAnalysisState;
 
 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info,
-     CELTEncoder *celt_enc, const opus_val16 *x, int C, int lsb_depth);
+     CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth);
 
 #endif
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 87f0147a..3d8684c4 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -592,18 +592,16 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     lsb_depth = IMIN(lsb_depth, st->lsb_depth);
 
 #ifndef FIXED_POINT
-    /* Only perform analysis for 10- and 20-ms frames. We don't have enough buffering for shorter
-       ones and longer ones will be split if they're in CELT-only mode. */
-    perform_analysis = st->silk_mode.complexity >= 7
-                       && (frame_size >= st->Fs/100 || frame_size >= st->Fs/50)
-                       && st->Fs==48000;
+    /* Only perform analysis up to 20-ms frames. Longer ones will be split if
+       they're in CELT-only mode. */
+    perform_analysis = st->silk_mode.complexity >= 7 && frame_size <= st->Fs/50 && st->Fs==48000;
     if (perform_analysis)
     {
-       int nb_analysis_frames;
-       nb_analysis_frames = frame_size/(st->Fs/100);
-       for (i=0;i<nb_analysis_frames;i++)
-          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, st->channels, lsb_depth);
-       if (st->signal_type == OPUS_AUTO)
+       analysis_info.valid = 0;
+       tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth);
+       if (frame_size > st->Fs/100)
+          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth);
+       if (analysis_info.valid && st->signal_type == OPUS_AUTO)
           st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
        st->detected_bandwidth = analysis_info.opus_bandwidth;
     } else {

From 2a5f0565b8b037c0d32fc471544e7d32a9f010fe Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 19 Nov 2012 23:17:06 -0500
Subject: [PATCH 03/15] Running transient_analysis() even for 2.5 ms frames

This means 2.5 ms frames can now use a higher bitrate for transients.
---
 celt/celt_encoder.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index fd811360..1b5426a7 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1298,16 +1298,18 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
 
    isTransient = 0;
    shortBlocks = 0;
+   if (st->complexity >= 1)
+   {
+      isTransient = transient_analysis(in, N+st->overlap, CC,
+            &tf_estimate, &tf_chan);
+   }
    if (LM>0 && ec_tell(enc)+3<=total_bits)
    {
-      if (st->complexity >= 1)
-      {
-         isTransient = transient_analysis(in, N+st->overlap, CC,
-                  &tf_estimate, &tf_chan);
-         if (isTransient)
-            shortBlocks = M;
-      }
+      if (isTransient)
+         shortBlocks = M;
       ec_enc_bit_logp(enc, isTransient, 3);
+   } else {
+      isTransient = 0;
    }
 
    ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */

From e85a6f5cbee9edd7ceb10e05db6652872cf8d8d2 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 19 Nov 2012 23:21:43 -0500
Subject: [PATCH 04/15] Makes opus_demo robust to the encoder using variable
 frame duration

Also, the encode+decode mode now produces an output of the same size
as the original.
---
 src/opus_demo.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/opus_demo.c b/src/opus_demo.c
index 09b12a33..ae6ccf38 100644
--- a/src/opus_demo.c
+++ b/src/opus_demo.c
@@ -221,6 +221,8 @@ int main(int argc, char *argv[])
     short *in, *out;
     int application=OPUS_APPLICATION_AUDIO;
     double bits=0.0, bits_max=0.0, bits_act=0.0, bits2=0.0, nrg;
+    double tot_samples=0;
+    opus_uint64 tot_in, tot_out;
     int bandwidth=-1;
     const char *bandwidth_string;
     int lost = 0, lost_prev = 1;
@@ -239,6 +241,8 @@ int main(int argc, char *argv[])
     int curr_mode=0;
     int curr_mode_count=0;
     int mode_switch_time = 48000;
+    int nb_encoded;
+    int remaining=0;
 
     if (argc < 5 )
     {
@@ -246,6 +250,7 @@ int main(int argc, char *argv[])
        return EXIT_FAILURE;
     }
 
+    tot_in=tot_out=0;
     fprintf(stderr, "%s\n", opus_get_version_string());
 
     args = 1;
@@ -617,22 +622,28 @@ int main(int argc, char *argv[])
                 opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3]));
                 frame_size = mode_list[curr_mode][2];
             }
-            err = fread(fbytes, sizeof(short)*channels, frame_size, fin);
+            err = fread(fbytes, sizeof(short)*channels, frame_size-remaining, fin);
             curr_read = err;
+            tot_in += curr_read;
             for(i=0;i<curr_read*channels;i++)
             {
                 opus_int32 s;
                 s=fbytes[2*i+1]<<8|fbytes[2*i];
                 s=((s&0xFFFF)^0x8000)-0x8000;
-                in[i]=s;
+                in[i+remaining*channels]=s;
             }
-            if (curr_read < frame_size)
+            if (curr_read+remaining < frame_size)
             {
-                for (i=curr_read*channels;i<frame_size*channels;i++)
+                for (i=(curr_read+remaining)*channels;i<frame_size*channels;i++)
                    in[i] = 0;
-                stop = 1;
+                if (encode_only || decode_only)
+                   stop = 1;
             }
             len[toggle] = opus_encode(enc, in, frame_size, data[toggle], max_payload_bytes);
+            nb_encoded = opus_packet_get_samples_per_frame(data[toggle], sampling_rate)*opus_packet_get_nb_frames(data[toggle], len[toggle]);
+            remaining = frame_size-nb_encoded;
+            for(i=0;i<remaining*channels;i++)
+               in[i] = in[nb_encoded*channels+i];
             if (sweep_bps!=0)
             {
                bitrate_bps += sweep_bps;
@@ -681,6 +692,7 @@ int main(int argc, char *argv[])
                fprintf(stderr, "Error writing.\n");
                return EXIT_FAILURE;
             }
+            tot_samples += nb_encoded;
         } else {
             int output_samples;
             lost = len[toggle]==0 || (packet_loss_perc>0 && rand()%100 < packet_loss_perc);
@@ -703,6 +715,11 @@ int main(int argc, char *argv[])
                 }
                 if (output_samples>0)
                 {
+                    if (!decode_only && tot_out + output_samples > tot_in)
+                    {
+                       stop=1;
+                       output_samples  = tot_in-tot_out;
+                    }
                     if (output_samples>skip) {
                        int i;
                        for(i=0;i<(output_samples-skip)*channels;i++)
@@ -716,6 +733,7 @@ int main(int argc, char *argv[])
                           fprintf(stderr, "Error writing.\n");
                           return EXIT_FAILURE;
                        }
+                       tot_out += output_samples-skip;
                     }
                     if (output_samples<skip) skip -= output_samples;
                     else skip = 0;
@@ -723,6 +741,7 @@ int main(int argc, char *argv[])
                    fprintf(stderr, "error decoding frame: %s\n",
                                    opus_strerror(output_samples));
                 }
+                tot_samples += output_samples;
             }
         }
 
@@ -767,7 +786,7 @@ int main(int argc, char *argv[])
         toggle = (toggle + use_inbandfec) & 1;
     }
     fprintf (stderr, "average bitrate:             %7.3f kb/s\n",
-                     1e-3*bits*sampling_rate/(frame_size*(double)count));
+                     1e-3*bits*sampling_rate/tot_samples);
     fprintf (stderr, "maximum bitrate:             %7.3f kb/s\n",
                      1e-3*bits_max*sampling_rate/frame_size);
     if (!decode_only)

From 49583ed672f5a685a5f4f88eb98648eb5d5e7ae6 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 22 Nov 2012 13:11:43 -0500
Subject: [PATCH 05/15] First attempt at varying the frame size depending on
 the audio (float only)

The search minimizes the bitrate increase caused by transients: reducing the
frame size lowers the "transient boost", at the cost of increasing the normal
per-frame overhead.
---
 include/opus_defines.h |  29 ++++-
 src/opus_demo.c        |   7 ++
 src/opus_encoder.c     | 257 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 281 insertions(+), 12 deletions(-)

diff --git a/include/opus_defines.h b/include/opus_defines.h
index cdde061a..e9434aab 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -148,8 +148,9 @@ extern "C" {
 #define OPUS_GET_GAIN_REQUEST                4045 /* Should have been 4035 */
 #define OPUS_SET_LSB_DEPTH_REQUEST           4036
 #define OPUS_GET_LSB_DEPTH_REQUEST           4037
-
 #define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039
+#define OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST 4040
+#define OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST 4041
 
 /* Don't use 4045, it's already taken by OPUS_GET_GAIN_REQUEST */
 
@@ -525,6 +526,32 @@ extern "C" {
   * @param[out] x <tt>opus_int32 *</tt>: Number of samples (at current sampling rate).
   * @hideinitializer */
 #define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x)
+
+/** Configures the encoder's use of variable duration frames.
+  * When enabled, the encoder is free to use a shorter frame size than the one
+  * requested in the opus_encode*() call. It is then the user's responsibility
+  * to verify how much audio was encoded by checking the ToC byte of the encoded
+  * packet. The part of the audio that was not encoded needs to be resent to the
+  * encoder for the next call. Do not use this option unless you <b>really</b>
+  * know what you are doing.
+  * @see OPUS_GET_EXPERT_VARIABLE_DURATION
+  * @param[in] x <tt>opus_int32</tt>: Allowed values:
+  * <dl>
+  * <dt>0</dt><dd>Disable variable duration (default).</dd>
+  * <dt>1</dt><dd>Enable variable duration.</dd>
+  * </dl>
+  * @hideinitializer */
+#define OPUS_SET_EXPERT_VARIABLE_DURATION(x) OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int(x)
+/** Gets the encoder's configured use of variable duration frames.
+  * @see OPUS_SET_EXPERT_VARIABLE_DURATION
+  * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
+  * <dl>
+  * <dt>0</dt><dd>variable duration disabled (default).</dd>
+  * <dt>1</dt><dd>variable duration enabled.</dd>
+  * </dl>
+  * @hideinitializer */
+#define OPUS_GET_EXPERT_VARIABLE_DURATION(x) OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int_ptr(x)
+
 /**@}*/
 
 /** @defgroup opus_genericctls Generic CTLs
diff --git a/src/opus_demo.c b/src/opus_demo.c
index ae6ccf38..6538aad6 100644
--- a/src/opus_demo.c
+++ b/src/opus_demo.c
@@ -53,6 +53,7 @@ void print_usage( char* argv[] )
     fprintf(stderr, "-d                   : only runs the decoder (reads the bit-stream as input)\n" );
     fprintf(stderr, "-cbr                 : enable constant bitrate; default: variable bitrate\n" );
     fprintf(stderr, "-cvbr                : enable constrained variable bitrate; default: unconstrained\n" );
+    fprintf(stderr, "-variable-duration   : enable frames of variable duration (experts only); default: disabled\n" );
     fprintf(stderr, "-bandwidth <NB|MB|WB|SWB|FB> : audio bandwidth (from narrowband to fullband); default: sampling rate\n" );
     fprintf(stderr, "-framesize <2.5|5|10|20|40|60> : frame size in ms; default: 20 \n" );
     fprintf(stderr, "-max_payload <bytes> : maximum payload size in bytes, default: 1024\n" );
@@ -243,6 +244,7 @@ int main(int argc, char *argv[])
     int mode_switch_time = 48000;
     int nb_encoded;
     int remaining=0;
+    int variable_duration=0;
 
     if (argc < 5 )
     {
@@ -379,6 +381,10 @@ int main(int argc, char *argv[])
             check_encoder_option(decode_only, "-cvbr");
             cvbr = 1;
             args++;
+        } else if( strcmp( argv[ args ], "-variable-duration" ) == 0 ) {
+            check_encoder_option(decode_only, "-variable-duration");
+            variable_duration = 1;
+            args++;
         } else if( strcmp( argv[ args ], "-dtx") == 0 ) {
             check_encoder_option(decode_only, "-dtx");
             use_dtx = 1;
@@ -504,6 +510,7 @@ int main(int argc, char *argv[])
 
        opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip));
        opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16));
+       opus_encoder_ctl(enc, OPUS_SET_EXPERT_VARIABLE_DURATION(variable_duration));
     }
     if (!encode_only)
     {
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 3d8684c4..e55eabda 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -67,6 +67,7 @@ struct OpusEncoder {
     opus_int32   Fs;
     int          use_vbr;
     int          vbr_constraint;
+    int          variable_duration;
     opus_int32   bitrate_bps;
     opus_int32   user_bitrate_bps;
     int          lsb_depth;
@@ -88,6 +89,7 @@ struct OpusEncoder {
     int          first;
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef FIXED_POINT
+    opus_val32   subframe_mem[3];
     TonalityAnalysisState analysis;
     int                   detected_bandwidth;
 #endif
@@ -535,6 +537,205 @@ static opus_int32 user_bitrate_to_bitrate(OpusEncoder *st, int frame_size, int m
     return st->user_bitrate_bps;
 }
 
+#ifndef FIXED_POINT
+/* Don't use more than 60 ms for the frame size analysis */
+#define MAX_DYNAMIC_FRAMESIZE 24
+/* Estimates how much the bitrate will be boosted based on the sub-frame energy */
+static float transient_boost(const float *E, const float *E_1, int LM, int maxM)
+{
+   int i;
+   int M;
+   float sumE=0, sumE_1=0;
+   float metric;
+
+   M = IMIN(maxM, (1<<LM)+1);
+   for (i=0;i<M;i++)
+   {
+      sumE += E[i];
+      sumE_1 += E_1[i];
+   }
+   metric = sumE*sumE_1/(M*M);
+   /*if (LM==3)
+      printf("%f\n", metric);*/
+   /*return metric>10 ? 1 : 0;*/
+   /*return MAX16(0,1-exp(-.25*(metric-2.)));*/
+   return MIN16(1,sqrt(MAX16(0,.05*(metric-2))));
+}
+
+/* Viterbi decoding trying to find the best frame size combination using look-ahead
+
+   State numbering:
+    0: unused
+    1:  2.5 ms
+    2:  5 ms (#1)
+    3:  5 ms (#2)
+    4: 10 ms (#1)
+    5: 10 ms (#2)
+    6: 10 ms (#3)
+    7: 10 ms (#4)
+    8: 20 ms (#1)
+    9: 20 ms (#2)
+   10: 20 ms (#3)
+   11: 20 ms (#4)
+   12: 20 ms (#5)
+   13: 20 ms (#6)
+   14: 20 ms (#7)
+   15: 20 ms (#8)
+*/
+static int transient_viterbi(const float *E, const float *E_1, int N, int frame_cost, int rate)
+{
+   int i;
+   float cost[MAX_DYNAMIC_FRAMESIZE][16];
+   int states[MAX_DYNAMIC_FRAMESIZE][16];
+   float best_cost;
+   int best_state;
+
+   for (i=0;i<16;i++)
+   {
+      /* Impossible state */
+      states[0][i] = -1;
+      cost[0][i] = 1e10;
+   }
+   for (i=0;i<4;i++)
+   {
+      cost[0][1<<i] = frame_cost + rate*(1<<i)*transient_boost(E, E_1, i, N+1);
+      states[0][1<<i] = i;
+   }
+   for (i=1;i<N;i++)
+   {
+      int j;
+
+      /* Follow continuations */
+      for (j=2;j<16;j++)
+      {
+         cost[i][j] = cost[i-1][j-1];
+         states[i][j] = j-1;
+      }
+
+      /* New frames */
+      for(j=0;j<4;j++)
+      {
+         int k;
+         float min_cost;
+         float curr_cost;
+         states[i][1<<j] = 1;
+         min_cost = cost[i-1][1];
+         for(k=1;k<4;k++)
+         {
+            float tmp = cost[i-1][(1<<(k+1))-1];
+            if (tmp < min_cost)
+            {
+               states[i][1<<j] = (1<<(k+1))-1;
+               min_cost = tmp;
+            }
+         }
+         curr_cost = frame_cost+rate*(1<<j)*transient_boost(E+i, E_1+i, j, N-i+1);
+         cost[i][1<<j] = min_cost;
+         /* If part of the frame is outside the analysis window, only count part of the cost */
+         if (N-i < (1<<j))
+            cost[i][1<<j] += curr_cost*(float)(N-i)/(1<<j);
+         else
+            cost[i][1<<j] += curr_cost;
+      }
+   }
+
+   best_state=1;
+   best_cost = cost[N-1][1];
+   /* Find best end state (doesn't force a frame to end at N-1) */
+   for (i=2;i<16;i++)
+   {
+      if (cost[N-1][i]<best_cost)
+      {
+         best_cost = cost[N-1][i];
+         best_state = i;
+      }
+   }
+
+   /* Follow transitions back */
+   for (i=N-1;i>=0;i--)
+   {
+      /*printf("%d ", best_state);*/
+      best_state = states[i][best_state];
+   }
+   /*printf("%d\n", best_state);*/
+   return best_state;
+}
+
+static int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
+                int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering)
+{
+   int N;
+   int i;
+   float e[MAX_DYNAMIC_FRAMESIZE+4];
+   float e_1[MAX_DYNAMIC_FRAMESIZE+3];
+   float memx;
+   int bestLM=0;
+   int subframe;
+   int pos;
+
+   subframe = Fs/400;
+   e[0]=mem[0];
+   e_1[0]=1./(EPSILON+mem[0]);
+   if (buffering)
+   {
+      /* Consider the CELT delay when not in restricted-lowdelay */
+      /* We assume the buffering is between 2.5 and 5 ms */
+      int offset = 2*subframe - buffering;
+      celt_assert(offset>=0 && offset <= subframe);
+      x += C*offset;
+      len -= offset;
+      e[1]=mem[1];
+      e_1[1]=1./(EPSILON+mem[1]);
+      e[2]=mem[2];
+      e_1[2]=1./(EPSILON+mem[2]);
+      pos = 3;
+   } else {
+      pos=1;
+   }
+   N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE);
+   memx = x[0];
+   for (i=0;i<N;i++)
+   {
+      float tmp;
+      float tmpx;
+      int j;
+      tmp=EPSILON;
+      if (C==1)
+      {
+         for (j=0;j<subframe;j++)
+         {
+            tmpx = x[subframe*i+j];
+            tmp += (tmpx-memx)*(tmpx-memx);
+            memx = tmpx;
+         }
+      } else {
+         for (j=0;j<subframe;j++)
+         {
+            tmpx = x[(subframe*i+j)*2]+x[(subframe*i+j)*2+1];
+            tmp += (tmpx-memx)*(tmpx-memx);
+            memx = tmpx;
+         }
+      }
+      e[i+pos] = tmp;
+      e_1[i+pos] = 1.f/tmp;
+   }
+   /* Hack to get 20 ms working with APPLICATION_AUDIO
+      The real problem is that the corresponding memory needs to use 1.5 ms
+      from this frame and 1 ms from the next frame */
+   e[i+pos] = e[i+pos-1];
+   if (buffering)
+      N=IMIN(MAX_DYNAMIC_FRAMESIZE, N+2);
+   bestLM = transient_viterbi(e, e_1, N, (1.f+.5*tonality)*(40*C+40), bitrate/400);
+   mem[0] = e[1<<bestLM];
+   if (buffering)
+   {
+      mem[1] = e[(1<<bestLM)+1];
+      mem[2] = e[(1<<bestLM)+2];
+   }
+   return bestLM;
+}
+#endif
+
 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
                 unsigned char *data, opus_int32 out_data_bytes, int lsb_depth)
 {
@@ -565,6 +766,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */
     int extra_buffer, total_buffer;
     int perform_analysis=0;
+    int orig_frame_size;
 #ifndef FIXED_POINT
     AnalysisInfo analysis_info;
 #endif
@@ -575,22 +777,36 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     max_data_bytes = IMIN(1276, out_data_bytes);
 
     st->rangeFinal = 0;
-    if (400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs &&
+    if ((!st->variable_duration && 400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs &&
          50*frame_size != st->Fs &&  25*frame_size != st->Fs &&  50*frame_size != 3*st->Fs)
-    {
-       RESTORE_STACK;
-       return OPUS_BAD_ARG;
-    }
-    if (max_data_bytes<=0)
+         || (400*frame_size < st->Fs)
+         || max_data_bytes<=0
+         )
     {
        RESTORE_STACK;
        return OPUS_BAD_ARG;
     }
     silk_enc = (char*)st+st->silk_enc_offset;
     celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset);
+    if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+       delay_compensation = 0;
+    else
+       delay_compensation = st->delay_compensation;
 
     lsb_depth = IMIN(lsb_depth, st->lsb_depth);
 
+    orig_frame_size = IMIN(frame_size,st->Fs/50);
+    if (st->variable_duration)
+    {
+       int LM = 3;
+#ifndef FIXED_POINT
+       LM = optimize_framesize(pcm, frame_size, st->channels, st->Fs, st->bitrate_bps,
+             st->analysis.prev_tonality, st->subframe_mem, delay_compensation);
+#endif
+       while ((st->Fs/400<<LM)>frame_size)
+          LM--;
+       frame_size = (st->Fs/400<<LM);
+    }
 #ifndef FIXED_POINT
     /* Only perform analysis up to 20-ms frames. Longer ones will be split if
        they're in CELT-only mode. */
@@ -611,10 +827,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     }
 #endif
 
-    if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
-       delay_compensation = 0;
-    else
-       delay_compensation = st->delay_compensation;
     total_buffer = delay_compensation;
     extra_buffer = total_buffer-delay_compensation;
     st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes);
@@ -1196,9 +1408,18 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         } else {
             if (st->use_vbr)
             {
+                opus_int32 bonus=0;
+#ifndef FIXED_POINT
+                if (orig_frame_size != frame_size)
+                {
+                   bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size);
+                   if (analysis_info.valid)
+                      bonus = bonus*(1.f+.5*analysis_info.tonality);
+                }
+#endif
                 celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1));
                 celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint));
-                celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps));
+                celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps+bonus));
                 nb_compr_bytes = max_data_bytes-1-redundancy_bytes;
             } else {
                 nb_compr_bytes = bytes_target;
@@ -1741,6 +1962,20 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             *value = st->lsb_depth;
         }
         break;
+        case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST:
+        {
+            opus_int32 value = va_arg(ap, opus_int32);
+            if (value<0 || value>1)
+               goto bad_arg;
+            st->variable_duration = value;
+        }
+        break;
+        case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST:
+        {
+            opus_int32 *value = va_arg(ap, opus_int32*);
+            *value = st->variable_duration;
+        }
+        break;
         case OPUS_RESET_STATE:
         {
            void *silk_enc;

From 854796481543539f2cda745861c0cd398a1a78de Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 26 Nov 2012 03:08:15 -0500
Subject: [PATCH 06/15] Re-enable analysis for 40- and 60-ms frames

---
 src/opus_encoder.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index e55eabda..f57fa52e 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -810,20 +810,23 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 #ifndef FIXED_POINT
     /* Only perform analysis up to 20-ms frames. Longer ones will be split if
        they're in CELT-only mode. */
-    perform_analysis = st->silk_mode.complexity >= 7 && frame_size <= st->Fs/50 && st->Fs==48000;
-    if (perform_analysis)
+    analysis_info.valid = 0;
+    perform_analysis = st->silk_mode.complexity >= 7 && st->Fs==48000;
+    if (!perform_analysis)
+    {
+       st->voice_ratio = -1;
+       st->detected_bandwidth = 0;
+    } else if (frame_size <= st->Fs/50)
     {
-       analysis_info.valid = 0;
        tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth);
        if (frame_size > st->Fs/100)
           tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth);
-       if (analysis_info.valid && st->signal_type == OPUS_AUTO)
-          st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
-       st->detected_bandwidth = analysis_info.opus_bandwidth;
-    } else {
-       analysis_info.valid = 0;
-       st->voice_ratio = -1;
-       st->detected_bandwidth = 0;
+       if (analysis_info.valid)
+       {
+          if (st->signal_type == OPUS_AUTO)
+             st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
+          st->detected_bandwidth = analysis_info.opus_bandwidth;
+       }
     }
 #endif
 
@@ -1160,6 +1163,14 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        RESTORE_STACK;
        return ret;
     }
+    /* Perform analysis for 40-60 ms frames */
+    if (perform_analysis && frame_size > st->Fs/50)
+    {
+       int nb_analysis = frame_size/(st->Fs/100);
+       for (i=0;i<nb_analysis;i++)
+          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth);
+       st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
+    }
 
     curr_bandwidth = st->bandwidth;
 

From f548a5a35d4e808eba5224084034c09421edcc6a Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 26 Nov 2012 23:20:01 -0500
Subject: [PATCH 07/15] Makes variable framesize less aggressive at lower rates

---
 src/opus_encoder.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index f57fa52e..c4a46557 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -590,6 +590,10 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_
    float best_cost;
    int best_state;
 
+   /* Makes variable framesize less aggressive at lower bitrates, but I can't
+      find any valid theoretical justification for this (other than it seems
+      to help) */
+   frame_cost *= 720/rate;
    for (i=0;i<16;i++)
    {
       /* Impossible state */

From 744836604644fbb94409592069238088852db599 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 17 Dec 2012 16:23:42 -0500
Subject: [PATCH 08/15] Multistream support for variable frame duration

Also fixes a bug with stereo streams where the initial memory was only
using the left channel.
---
 src/opus_encoder.c             | 35 ++++++-------
 src/opus_multistream_encoder.c | 89 ++++++++++++++++++++++++++++------
 src/opus_private.h             |  2 +
 3 files changed, 95 insertions(+), 31 deletions(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index c4a46557..4c0840ff 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -665,19 +665,21 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_
    return best_state;
 }
 
-static int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
+int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
                 int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering)
 {
    int N;
-   int i;
+   int i, c;
    float e[MAX_DYNAMIC_FRAMESIZE+4];
    float e_1[MAX_DYNAMIC_FRAMESIZE+3];
    float memx;
    int bestLM=0;
    int subframe;
    int pos;
+   VARDECL(opus_val16, sub);
 
    subframe = Fs/400;
+   ALLOC(sub, subframe, opus_val16);
    e[0]=mem[0];
    e_1[0]=1./(EPSILON+mem[0]);
    if (buffering)
@@ -698,27 +700,26 @@ static int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs
    }
    N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE);
    memx = x[0];
+   for (c=1;c<C;c++)
+      memx += x[c];
    for (i=0;i<N;i++)
    {
       float tmp;
       float tmpx;
       int j;
       tmp=EPSILON;
-      if (C==1)
+
+      for (j=0;j<subframe;j++)
+         sub[j] = x[(subframe*i+j)*C];
+      for (c=1;c<C;c++)
+         for (j=0;j<subframe;j++)
+            sub[j] += x[(subframe*i+j)*C+c];
+
+      for (j=0;j<subframe;j++)
       {
-         for (j=0;j<subframe;j++)
-         {
-            tmpx = x[subframe*i+j];
-            tmp += (tmpx-memx)*(tmpx-memx);
-            memx = tmpx;
-         }
-      } else {
-         for (j=0;j<subframe;j++)
-         {
-            tmpx = x[(subframe*i+j)*2]+x[(subframe*i+j)*2+1];
-            tmp += (tmpx-memx)*(tmpx-memx);
-            memx = tmpx;
-         }
+         tmpx = sub[j];
+         tmp += (tmpx-memx)*(tmpx-memx);
+         memx = tmpx;
       }
       e[i+pos] = tmp;
       e_1[i+pos] = 1.f/tmp;
@@ -1425,7 +1426,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
             {
                 opus_int32 bonus=0;
 #ifndef FIXED_POINT
-                if (orig_frame_size != frame_size)
+                if (st->variable_duration && orig_frame_size != frame_size)
                 {
                    bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size);
                    if (analysis_info.valid)
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index db9fc785..d048f535 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -40,6 +40,9 @@
 struct OpusMSEncoder {
    ChannelLayout layout;
    int bitrate;
+   int variable_duration;
+   opus_int32 bitrate_bps;
+   opus_val32 subframe_mem[3];
    /* Encoder states go here */
 };
 
@@ -193,10 +196,38 @@ static int opus_multistream_encode_native
    VARDECL(opus_val16, buf);
    unsigned char tmp_data[MS_FRAME_TMP];
    OpusRepacketizer rp;
+   int orig_frame_size;
+   int coded_channels;
+   opus_int32 channel_rate;
    ALLOC_STACK;
 
    ptr = (char*)st + align(sizeof(OpusMSEncoder));
    opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs));
+
+   if (400*frame_size < Fs)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+   orig_frame_size = IMIN(frame_size,Fs/50);
+   if (st->variable_duration)
+   {
+      int LM = 3;
+      int channels;
+      opus_int32 delay_compensation;
+
+      channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
+      opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation));
+      delay_compensation -= Fs/400;
+#ifndef FIXED_POINT
+      LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps,
+            0.f, st->subframe_mem, delay_compensation);
+#endif
+      while ((Fs/400<<LM)>frame_size)
+         LM--;
+      frame_size = (Fs/400<<LM);
+   }
+
    /* Validate frame_size before using it to allocate stack space.
       This mirrors the checks in opus_encode[_float](). */
    if (400*frame_size != Fs && 200*frame_size != Fs &&
@@ -215,6 +246,31 @@ static int opus_multistream_encode_native
       RESTORE_STACK;
       return OPUS_BUFFER_TOO_SMALL;
    }
+
+   /* Compute bitrate allocation between streams (this could be a lot better) */
+   coded_channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
+   channel_rate = st->bitrate_bps / coded_channels;
+#ifndef FIXED_POINT
+   if (st->variable_duration && orig_frame_size != frame_size)
+   {
+      opus_int32 bonus;
+      bonus = 60*(48000/frame_size-48000/orig_frame_size);
+      channel_rate += bonus;
+   }
+#endif
+   ptr = (char*)st + align(sizeof(OpusMSEncoder));
+   for (s=0;s<st->layout.nb_streams;s++)
+   {
+      OpusEncoder *enc;
+      enc = (OpusEncoder*)ptr;
+      if (s < st->layout.nb_coupled_streams)
+         ptr += align(coupled_size);
+      else
+         ptr += align(mono_size);
+      opus_encoder_ctl(enc, OPUS_SET_BITRATE(channel_rate * (s < st->layout.nb_coupled_streams ? 2 : 1)));
+   }
+
+   ptr = (char*)st + align(sizeof(OpusMSEncoder));
    /* Counting ToC */
    tot_size = 0;
    for (s=0;s<st->layout.nb_streams;s++)
@@ -378,20 +434,8 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...)
    {
    case OPUS_SET_BITRATE_REQUEST:
    {
-      int chan, s;
       opus_int32 value = va_arg(ap, opus_int32);
-      chan = st->layout.nb_streams + st->layout.nb_coupled_streams;
-      value /= chan;
-      for (s=0;s<st->layout.nb_streams;s++)
-      {
-         OpusEncoder *enc;
-         enc = (OpusEncoder*)ptr;
-         if (s < st->layout.nb_coupled_streams)
-            ptr += align(coupled_size);
-         else
-            ptr += align(mono_size);
-         opus_encoder_ctl(enc, request, value * (s < st->layout.nb_coupled_streams ? 2 : 1));
-      }
+      st->bitrate_bps = value;
    }
    break;
    case OPUS_GET_BITRATE_REQUEST:
@@ -504,7 +548,21 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...)
       }
       *value = (OpusEncoder*)ptr;
    }
-      break;
+   break;
+   case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST:
+   {
+       opus_int32 value = va_arg(ap, opus_int32);
+       if (value<0 || value>1)
+          goto bad_arg;
+       st->variable_duration = value;
+   }
+   break;
+   case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST:
+   {
+       opus_int32 *value = va_arg(ap, opus_int32*);
+       *value = st->variable_duration;
+   }
+   break;
    default:
       ret = OPUS_UNIMPLEMENTED;
       break;
@@ -512,6 +570,9 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...)
 
    va_end(ap);
    return ret;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
 }
 
 void opus_multistream_encoder_destroy(OpusMSEncoder *st)
diff --git a/src/opus_private.h b/src/opus_private.h
index 977f4a25..94de0033 100644
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -81,6 +81,8 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev);
 #define OPUS_SET_FORCE_MODE_REQUEST    11002
 #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x)
 
+int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
+                int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering);
 
 int encode_size(int size, unsigned char *data);
 

From 95561be6dedc77a95a217989ee93255f48f91530 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 17 Dec 2012 17:54:01 -0500
Subject: [PATCH 09/15] Better handling of the multistream bitrate

Now supports OPUS_AUTO and OPUS_BITRATE_MAX
---
 src/opus_multistream_encoder.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index d048f535..1b41c651 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -39,7 +39,6 @@
 
 struct OpusMSEncoder {
    ChannelLayout layout;
-   int bitrate;
    int variable_duration;
    opus_int32 bitrate_bps;
    opus_val32 subframe_mem[3];
@@ -105,6 +104,7 @@ int opus_multistream_encoder_init(
    st->layout.nb_streams = streams;
    st->layout.nb_coupled_streams = coupled_streams;
 
+   st->bitrate_bps = OPUS_AUTO;
    for (i=0;i<st->layout.nb_channels;i++)
       st->layout.mapping[i] = mapping[i];
    if (!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout))
@@ -249,7 +249,15 @@ static int opus_multistream_encode_native
 
    /* Compute bitrate allocation between streams (this could be a lot better) */
    coded_channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
-   channel_rate = st->bitrate_bps / coded_channels;
+   if (st->bitrate_bps==OPUS_AUTO)
+   {
+      channel_rate = Fs+60*Fs/orig_frame_size;
+   } else if (st->bitrate_bps==OPUS_BITRATE_MAX)
+   {
+      channel_rate = 300000;
+   } else {
+      channel_rate = st->bitrate_bps/coded_channels;
+   }
 #ifndef FIXED_POINT
    if (st->variable_duration && orig_frame_size != frame_size)
    {
@@ -435,6 +443,8 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...)
    case OPUS_SET_BITRATE_REQUEST:
    {
       opus_int32 value = va_arg(ap, opus_int32);
+      if (value<0 && value!=OPUS_AUTO && value!=OPUS_BITRATE_MAX)
+         goto bad_arg;
       st->bitrate_bps = value;
    }
    break;

From bb43b8b69d3e06ec2609a6040513952cb472d742 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 17 Dec 2012 18:02:56 -0500
Subject: [PATCH 10/15] No need for extra_buffer anymore

---
 src/opus_encoder.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 4c0840ff..bcfe6b47 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -769,7 +769,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     int curr_bandwidth;
     opus_val16 HB_gain;
     opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */
-    int extra_buffer, total_buffer;
+    int total_buffer;
     int perform_analysis=0;
     int orig_frame_size;
 #ifndef FIXED_POINT
@@ -836,7 +836,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 #endif
 
     total_buffer = delay_compensation;
-    extra_buffer = total_buffer-delay_compensation;
     st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes);
 
     frame_rate = st->Fs/frame_size;
@@ -1450,7 +1449,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     if (st->mode != MODE_SILK_ONLY && st->mode != st->prev_mode && st->prev_mode > 0)
     {
        for (i=0;i<st->channels*st->Fs/400;i++)
-          tmp_prefill[i] = st->delay_buffer[(extra_buffer+st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i];
+          tmp_prefill[i] = st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i];
     }
 
     for (i=0;i<st->channels*(st->encoder_buffer-(frame_size+total_buffer));i++)
@@ -1464,7 +1463,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        const CELTMode *celt_mode;
 
        celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode));
-       gain_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels,
+       gain_fade(pcm_buf, pcm_buf,
              st->prev_HB_gain, HB_gain, celt_mode->overlap, frame_size, st->channels, celt_mode->window, st->Fs);
     }
     st->prev_HB_gain = HB_gain;
@@ -1486,7 +1485,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
             g1 *= (1.f/16384);
             g2 *= (1.f/16384);
 #endif
-            stereo_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels, g1, g2, celt_mode->overlap,
+            stereo_fade(pcm_buf, pcm_buf, g1, g2, celt_mode->overlap,
                   frame_size, st->channels, celt_mode->window, st->Fs);
             st->hybrid_stereo_width_Q14 = st->silk_mode.stereoWidth_Q14;
         }
@@ -1540,7 +1539,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         int err;
         celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(0));
         celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0));
-        err = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL);
+        err = celt_encode_with_ec(celt_enc, pcm_buf, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL);
         if (err < 0)
         {
            RESTORE_STACK;
@@ -1570,7 +1569,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
            if (perform_analysis)
               celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info));
 #endif
-           ret = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, frame_size, NULL, nb_compr_bytes, &enc);
+           ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc);
            if (ret < 0)
            {
               RESTORE_STACK;
@@ -1593,9 +1592,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(0));
 
         /* NOTE: We could speed this up slightly (at the expense of code size) by just adding a function that prefills the buffer */
-        celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2-N4), N4, dummy, 2, NULL);
+        celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2-N4), N4, dummy, 2, NULL);
 
-        err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL);
+        err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL);
         if (err < 0)
         {
            RESTORE_STACK;

From 10a34a5dd66ff45538ac3843eab7802d260e160f Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 20 Dec 2012 00:23:01 -0500
Subject: [PATCH 11/15] Making multistream variable duration work for both the
 float and int API

---
 src/opus_encoder.c             | 45 +++++++++++++++++++++++++---------
 src/opus_multistream_encoder.c |  9 ++++---
 src/opus_private.h             |  7 +++++-
 3 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index bcfe6b47..19778a40 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -665,11 +665,36 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_
    return best_state;
 }
 
+void downmix_float(const void *_x, float *sub, int subframe, int i, int C)
+{
+   const float *x;
+   int c, j;
+   x = (const float *)_x;
+   for (j=0;j<subframe;j++)
+      sub[j] = x[(subframe*i+j)*C];
+   for (c=1;c<C;c++)
+      for (j=0;j<subframe;j++)
+         sub[j] += x[(subframe*i+j)*C+c];
+}
+
+void downmix_int(const void *_x, float *sub, int subframe, int i, int C)
+{
+   const opus_int16 *x;
+   int c, j;
+   x = (const opus_int16 *)_x;
+   for (j=0;j<subframe;j++)
+      sub[j] = x[(subframe*i+j)*C];
+   for (c=1;c<C;c++)
+      for (j=0;j<subframe;j++)
+         sub[j] += x[(subframe*i+j)*C+c];
+}
+
 int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
-                int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering)
+                int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering,
+                downmix_func downmix)
 {
    int N;
-   int i, c;
+   int i;
    float e[MAX_DYNAMIC_FRAMESIZE+4];
    float e_1[MAX_DYNAMIC_FRAMESIZE+3];
    float memx;
@@ -700,8 +725,6 @@ int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
    }
    N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE);
    memx = x[0];
-   for (c=1;c<C;c++)
-      memx += x[c];
    for (i=0;i<N;i++)
    {
       float tmp;
@@ -709,12 +732,9 @@ int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
       int j;
       tmp=EPSILON;
 
-      for (j=0;j<subframe;j++)
-         sub[j] = x[(subframe*i+j)*C];
-      for (c=1;c<C;c++)
-         for (j=0;j<subframe;j++)
-            sub[j] += x[(subframe*i+j)*C+c];
-
+      downmix(x, sub, subframe, i, C);
+      if (i==0)
+         memx = sub[0];
       for (j=0;j<subframe;j++)
       {
          tmpx = sub[j];
@@ -806,7 +826,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        int LM = 3;
 #ifndef FIXED_POINT
        LM = optimize_framesize(pcm, frame_size, st->channels, st->Fs, st->bitrate_bps,
-             st->analysis.prev_tonality, st->subframe_mem, delay_compensation);
+             st->analysis.prev_tonality, st->subframe_mem, delay_compensation, downmix_float);
 #endif
        while ((st->Fs/400<<LM)>frame_size)
           LM--;
@@ -1167,6 +1187,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        RESTORE_STACK;
        return ret;
     }
+#ifndef FIXED_POINT
     /* Perform analysis for 40-60 ms frames */
     if (perform_analysis && frame_size > st->Fs/50)
     {
@@ -1175,7 +1196,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
           tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth);
        st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
     }
-
+#endif
     curr_bandwidth = st->bandwidth;
 
     /* Chooses the appropriate mode for speech
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index 1b41c651..c5fb3669 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -185,6 +185,9 @@ static int opus_multistream_encode_native
     unsigned char *data,
     opus_int32 max_data_bytes,
     int lsb_depth
+#ifndef FIXED_POINT
+    , downmix_func downmix
+#endif
 )
 {
    opus_int32 Fs;
@@ -221,7 +224,7 @@ static int opus_multistream_encode_native
       delay_compensation -= Fs/400;
 #ifndef FIXED_POINT
       LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps,
-            0.f, st->subframe_mem, delay_compensation);
+            0.f, st->subframe_mem, delay_compensation, downmix);
 #endif
       while ((Fs/400<<LM)>frame_size)
          LM--;
@@ -410,7 +413,7 @@ int opus_multistream_encode_float
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_float,
-      pcm, frame_size, data, max_data_bytes, 24);
+      pcm, frame_size, data, max_data_bytes, 24, downmix_float);
 }
 
 int opus_multistream_encode(
@@ -422,7 +425,7 @@ int opus_multistream_encode(
 )
 {
    return opus_multistream_encode_native(st, opus_copy_channel_in_short,
-      pcm, frame_size, data, max_data_bytes, 16);
+      pcm, frame_size, data, max_data_bytes, 16, downmix_int);
 }
 #endif
 
diff --git a/src/opus_private.h b/src/opus_private.h
index 94de0033..33a982e5 100644
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -81,8 +81,13 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev);
 #define OPUS_SET_FORCE_MODE_REQUEST    11002
 #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x)
 
+typedef void (*downmix_func)(const void *, float *, int, int, int);
+void downmix_float(const void *_x, float *sub, int subframe, int i, int C);
+void downmix_int(const void *_x, float *sub, int subframe, int i, int C);
+
 int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
-                int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering);
+                int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering,
+                void (*downmix)(const void *, float *, int, int, int));
 
 int encode_size(int size, unsigned char *data);
 

From 51f4a32ec2b62fd7c53e7b901fefd38ff95e7cc2 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Wed, 20 Feb 2013 04:08:04 -0500
Subject: [PATCH 12/15] Adds support for delayed decision

Variable duration option renamed to OPUS_SET_EXPERT_FRAME_DURATION,
with new API.
Also moves up the analysis to avoid having to do int->float conversion
on large buffers.
---
 include/opus_defines.h         |  17 ++-
 src/analysis.c                 | 184 +++++++++++++++++++++++++++----
 src/analysis.h                 |  22 +++-
 src/opus_demo.c                |  33 +++++-
 src/opus_encoder.c             | 195 +++++++++++++++++++++------------
 src/opus_multistream_encoder.c |  51 ++++++---
 src/opus_private.h             |  15 ++-
 7 files changed, 395 insertions(+), 122 deletions(-)

diff --git a/include/opus_defines.h b/include/opus_defines.h
index e9434aab..203144a7 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -149,8 +149,8 @@ extern "C" {
 #define OPUS_SET_LSB_DEPTH_REQUEST           4036
 #define OPUS_GET_LSB_DEPTH_REQUEST           4037
 #define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039
-#define OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST 4040
-#define OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST 4041
+#define OPUS_SET_EXPERT_FRAME_DURATION_REQUEST 4040
+#define OPUS_GET_EXPERT_FRAME_DURATION_REQUEST 4041
 
 /* Don't use 4045, it's already taken by OPUS_GET_GAIN_REQUEST */
 
@@ -186,6 +186,15 @@ extern "C" {
 #define OPUS_BANDWIDTH_SUPERWIDEBAND         1104 /**<12 kHz bandpass @hideinitializer*/
 #define OPUS_BANDWIDTH_FULLBAND              1105 /**<20 kHz bandpass @hideinitializer*/
 
+#define OPUS_FRAMESIZE_ARG                   5000 /**< Select frame size from the argument (default) */
+#define OPUS_FRAMESIZE_2_5_MS                5001 /**< Use 2.5 ms frames */
+#define OPUS_FRAMESIZE_5_MS                  5002 /**< Use 5 ms frames */
+#define OPUS_FRAMESIZE_10_MS                 5003 /**< Use 10 ms frames */
+#define OPUS_FRAMESIZE_20_MS                 5004 /**< Use 20 ms frames */
+#define OPUS_FRAMESIZE_40_MS                 5005 /**< Use 40 ms frames */
+#define OPUS_FRAMESIZE_60_MS                 5006 /**< Use 60 ms frames */
+#define OPUS_FRAMESIZE_VARIABLE              5010 /**< Optimize the frame size dynamically */
+
 /**@}*/
 
 
@@ -541,7 +550,7 @@ extern "C" {
   * <dt>1</dt><dd>Enable variable duration.</dd>
   * </dl>
   * @hideinitializer */
-#define OPUS_SET_EXPERT_VARIABLE_DURATION(x) OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int(x)
+#define OPUS_SET_EXPERT_FRAME_DURATION(x) OPUS_SET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int(x)
 /** Gets the encoder's configured use of variable duration frames.
   * @see OPUS_SET_EXPERT_VARIABLE_DURATION
   * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
@@ -550,7 +559,7 @@ extern "C" {
   * <dt>1</dt><dd>variable duration enabled.</dd>
   * </dl>
   * @hideinitializer */
-#define OPUS_GET_EXPERT_VARIABLE_DURATION(x) OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int_ptr(x)
+#define OPUS_GET_EXPERT_FRAME_DURATION(x) OPUS_GET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int_ptr(x)
 
 /**@}*/
 
diff --git a/src/analysis.c b/src/analysis.c
index 6b07890a..54005d3a 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -139,10 +139,81 @@ static inline float fast_atan2f(float y, float x) {
    }
 }
 
-void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth)
+void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
+{
+#if 1
+   int pos;
+   int curr_lookahead;
+   float psum;
+   int i;
+
+   pos = tonal->read_pos;
+   curr_lookahead = tonal->write_pos-tonal->read_pos;
+   if (curr_lookahead<0)
+      curr_lookahead += DETECT_SIZE;
+
+   if (len > 480 && pos != tonal->write_pos)
+   {
+      pos++;
+      if (pos==DETECT_SIZE)
+         pos=0;
+   }
+   if (pos == tonal->write_pos)
+      pos--;
+   if (pos<0)
+      pos = DETECT_SIZE-1;
+   OPUS_COPY(info_out, &tonal->info[pos], 1);
+   tonal->read_subframe += len/120;
+   while (tonal->read_subframe>=4)
+   {
+      tonal->read_subframe -= 4;
+      tonal->read_pos++;
+   }
+   if (tonal->read_pos>=DETECT_SIZE)
+      tonal->read_pos-=DETECT_SIZE;
+
+   /* Compensate for the delay in the features themselves.
+      FIXME: Need a better estimate than the 10 I just made up */
+   curr_lookahead = IMAX(curr_lookahead-10, 0);
+
+   psum=0;
+   for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
+      psum += tonal->pmusic[i];
+   for (;i<DETECT_SIZE;i++)
+      psum += tonal->pspeech[i];
+   /*printf("%f %f\n", psum, info_out->music_prob);*/
+
+   info_out->music_prob = psum;
+#else
+   /* If data not available, return invalid */
+   if (tonal->read_pos==tonal->write_pos)
+   {
+      info_out->valid=0;
+      return;
+   }
+
+   OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1);
+   tonal->read_subframe += len/480;
+   while (tonal->read_subframe>=4)
+   {
+      tonal->read_subframe -= 4;
+      tonal->read_pos++;
+   }
+   if (tonal->read_pos>=DETECT_SIZE)
+      tonal->read_pos-=DETECT_SIZE;
+   if (tonal->read_pos == tonal->write_pos)
+   {
+      tonal->read_pos = tonal->write_pos-1;
+      if (tonal->read_pos<0)
+         tonal->read_pos=DETECT_SIZE-1;
+      tonal->read_subframe = 3;
+   }
+#endif
+}
+
+void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
 {
     int i, b;
-    const CELTMode *mode;
     const kiss_fft_state *kfft;
     kiss_fft_cpx in[480], out[480];
     int N = 480, N2=240;
@@ -171,8 +242,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
     float maxE = 0;
     float noise_floor;
     int remaining;
-
-    celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
+    AnalysisInfo *info;
 
     tonal->last_transition++;
     alpha = 1.f/IMIN(20, 1+tonal->count);
@@ -181,23 +251,19 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
 
     if (tonal->count<4)
        tonal->music_prob = .5;
-    kfft = mode->mdct.kfft[0];
+    kfft = celt_mode->mdct.kfft[0];
     if (tonal->count==0)
        tonal->mem_fill = 240;
-    if (C==1)
-    {
-       for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
-          tonal->inmem[i+tonal->mem_fill] = x[i];
-    } else {
-       for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
-          tonal->inmem[i+tonal->mem_fill] = x[2*i]+x[2*i+1];
-    }
+    downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C);
     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
     {
        tonal->mem_fill += len;
        /* Don't have enough to update the analysis */
        return;
     }
+    info = &tonal->info[tonal->write_pos++];
+    if (tonal->write_pos>=DETECT_SIZE)
+       tonal->write_pos-=DETECT_SIZE;
 
     for (i=0;i<N2;i++)
     {
@@ -209,15 +275,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
     }
     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
-    if (C==1)
-    {
-       for (i=0;i<remaining;i++)
-          tonal->inmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i];
-    } else {
-       for (i=0;i<remaining;i++)
-          tonal->inmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)]
-                              + x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1];
-    }
+    downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C);
     tonal->mem_fill = 240 + remaining;
     opus_fft(kfft, in, out);
 
@@ -450,13 +508,49 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
        tau = .00005f;
        beta = .1f;
        max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition);
+       max_certainty = 0;
        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
        p0 *= (float)pow(1-frame_prob, beta);
        p1 *= (float)pow(frame_prob, beta);
        tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1)));
        info->music_prob = tonal->music_prob;
-       /*printf("%f %f\n", frame_prob, info->music_prob);*/
+       info->music_prob = frame_prob;
+
+       float psum=1e-20;
+       float speech0 = (float)pow(1-frame_prob, beta);
+       float music0  = (float)pow(frame_prob, beta);
+       if (tonal->count==1)
+       {
+          tonal->pspeech[0]=.5;
+          tonal->pmusic [0]=.5;
+       }
+       float s0, m0;
+       s0 = tonal->pspeech[0] + tonal->pspeech[1];
+       m0 = tonal->pmusic [0] + tonal->pmusic [1];
+       tonal->pspeech[0] = s0*(1-tau)*speech0;
+       tonal->pmusic [0] = m0*(1-tau)*music0;
+       for (i=1;i<DETECT_SIZE-1;i++)
+       {
+          tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
+          tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
+       }
+       tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
+       tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
+
+       for (i=0;i<DETECT_SIZE;i++)
+          psum += tonal->pspeech[i] + tonal->pmusic[i];
+       psum = 1.f/psum;
+       for (i=0;i<DETECT_SIZE;i++)
+       {
+          tonal->pspeech[i] *= psum;
+          tonal->pmusic [i] *= psum;
+       }
+       psum = tonal->pmusic[0];
+       for (i=1;i<DETECT_SIZE;i++)
+          psum += tonal->pspeech[i];
+
+       /*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/
     }
     if (tonal->last_music != (tonal->music_prob>.5f))
        tonal->last_transition=0;
@@ -484,4 +578,48 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
     info->noisiness = frame_noisiness;
     info->valid = 1;
+    if (info_out!=NULL)
+       OPUS_COPY(info_out, info, 1);
+}
+
+int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
+                        const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
+                        int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
+{
+   int offset;
+   int pcm_len;
+
+   /* Avoid overflow/wrap-around of the analysis buffer */
+   frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);
+
+   pcm_len = frame_size - analysis->analysis_offset;
+   offset = 0;
+   do {
+      tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix);
+      offset += 480;
+      pcm_len -= 480;
+   } while (pcm_len>0);
+   analysis->analysis_offset = frame_size;
+
+   if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
+   {
+      int LM = 3;
+      LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps,
+            analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
+      while ((Fs/400<<LM)>frame_size)
+         LM--;
+      frame_size = (Fs/400<<LM);
+   } else {
+      frame_size = frame_size_select(frame_size, variable_duration, Fs);
+   }
+   if (frame_size<0)
+      return -1;
+   analysis->analysis_offset -= frame_size;
+
+   /* Only perform analysis up to 20-ms frames. Longer ones will be split if
+      they're in CELT-only mode. */
+   analysis_info->valid = 0;
+   tonality_get_info(analysis, analysis_info, frame_size);
+
+   return frame_size;
 }
diff --git a/src/analysis.h b/src/analysis.h
index 6f3689da..37a8bf40 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -28,10 +28,16 @@
 #ifndef ANALYSIS_H
 #define ANALYSIS_H
 
+#include "celt.h"
+#include "opus_private.h"
+
 #define NB_FRAMES 8
 #define NB_TBANDS 18
 #define NB_TOT_BANDS 21
 #define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */
+
+#define DETECT_SIZE 200
+
 typedef struct {
    float angle[240];
    float d_angle[240];
@@ -55,9 +61,23 @@ typedef struct {
    int last_transition;
    int count;
    int opus_bandwidth;
+   opus_val32   subframe_mem[3];
+   int analysis_offset;
+   float pspeech[DETECT_SIZE];
+   float pmusic[DETECT_SIZE];
+   int write_pos;
+   int read_pos;
+   int read_subframe;
+   AnalysisInfo info[DETECT_SIZE];
 } TonalityAnalysisState;
 
 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info,
-     CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth);
+     const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix);
+
+void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len);
+
+int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
+                        const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
+                        int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info);
 
 #endif
diff --git a/src/opus_demo.c b/src/opus_demo.c
index 6538aad6..a0acb0cd 100644
--- a/src/opus_demo.c
+++ b/src/opus_demo.c
@@ -244,7 +244,8 @@ int main(int argc, char *argv[])
     int mode_switch_time = 48000;
     int nb_encoded;
     int remaining=0;
-    int variable_duration=0;
+    int variable_duration=OPUS_FRAMESIZE_ARG;
+    int delayed_decision=0;
 
     if (argc < 5 )
     {
@@ -313,7 +314,7 @@ int main(int argc, char *argv[])
     forcechannels = OPUS_AUTO;
     use_dtx = 0;
     packet_loss_perc = 0;
-    max_frame_size = 960*6;
+    max_frame_size = 2*48000;
     curr_read=0;
 
     while( args < argc - 2 ) {
@@ -383,7 +384,11 @@ int main(int argc, char *argv[])
             args++;
         } else if( strcmp( argv[ args ], "-variable-duration" ) == 0 ) {
             check_encoder_option(decode_only, "-variable-duration");
-            variable_duration = 1;
+            variable_duration = OPUS_FRAMESIZE_VARIABLE;
+            args++;
+        } else if( strcmp( argv[ args ], "-delayed-decision" ) == 0 ) {
+            check_encoder_option(decode_only, "-delayed-decision");
+            delayed_decision = 1;
             args++;
         } else if( strcmp( argv[ args ], "-dtx") == 0 ) {
             check_encoder_option(decode_only, "-dtx");
@@ -510,7 +515,7 @@ int main(int argc, char *argv[])
 
        opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip));
        opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16));
-       opus_encoder_ctl(enc, OPUS_SET_EXPERT_VARIABLE_DURATION(variable_duration));
+       opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration));
     }
     if (!encode_only)
     {
@@ -566,6 +571,26 @@ int main(int argc, char *argv[])
     if ( use_inbandfec ) {
         data[1] = (unsigned char*)calloc(max_payload_bytes,sizeof(char));
     }
+    if(delayed_decision)
+    {
+       if (variable_duration!=OPUS_FRAMESIZE_VARIABLE)
+       {
+          if (frame_size==sampling_rate/400)
+             variable_duration = OPUS_FRAMESIZE_2_5_MS;
+          else if (frame_size==sampling_rate/200)
+             variable_duration = OPUS_FRAMESIZE_5_MS;
+          else if (frame_size==sampling_rate/100)
+             variable_duration = OPUS_FRAMESIZE_10_MS;
+          else if (frame_size==sampling_rate/50)
+             variable_duration = OPUS_FRAMESIZE_20_MS;
+          else if (frame_size==sampling_rate/25)
+             variable_duration = OPUS_FRAMESIZE_40_MS;
+          else
+             variable_duration = OPUS_FRAMESIZE_60_MS;
+          opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration));
+       }
+       frame_size = 2*48000;
+    }
     while (!stop)
     {
         if (delayed_celt)
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 19778a40..3cee88b3 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -89,9 +89,9 @@ struct OpusEncoder {
     int          first;
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef FIXED_POINT
-    opus_val32   subframe_mem[3];
     TonalityAnalysisState analysis;
-    int                   detected_bandwidth;
+    int          detected_bandwidth;
+    int          analysis_offset;
 #endif
     opus_uint32  rangeFinal;
 };
@@ -215,6 +215,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat
     st->voice_ratio = -1;
     st->encoder_buffer = st->Fs/100;
     st->lsb_depth = 24;
+    st->variable_duration = OPUS_FRAMESIZE_ARG;
 
     /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead 
        + 1.5 ms for SILK resamplers and stereo prediction) */
@@ -665,28 +666,28 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_
    return best_state;
 }
 
-void downmix_float(const void *_x, float *sub, int subframe, int i, int C)
+void downmix_float(const void *_x, float *sub, int subframe, int offset, int C)
 {
    const float *x;
    int c, j;
    x = (const float *)_x;
    for (j=0;j<subframe;j++)
-      sub[j] = x[(subframe*i+j)*C];
+      sub[j] = x[(j+offset)*C];
    for (c=1;c<C;c++)
       for (j=0;j<subframe;j++)
-         sub[j] += x[(subframe*i+j)*C+c];
+         sub[j] += x[(j+offset)*C+c];
 }
 
-void downmix_int(const void *_x, float *sub, int subframe, int i, int C)
+void downmix_int(const void *_x, float *sub, int subframe, int offset, int C)
 {
    const opus_int16 *x;
    int c, j;
    x = (const opus_int16 *)_x;
    for (j=0;j<subframe;j++)
-      sub[j] = x[(subframe*i+j)*C];
+      sub[j] = x[(j+offset)*C];
    for (c=1;c<C;c++)
       for (j=0;j<subframe;j++)
-         sub[j] += x[(subframe*i+j)*C+c];
+         sub[j] += x[(j+offset)*C+c];
 }
 
 int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
@@ -732,7 +733,7 @@ int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
       int j;
       tmp=EPSILON;
 
-      downmix(x, sub, subframe, i, C);
+      downmix(x, sub, subframe, i*subframe, C);
       if (i==0)
          memx = sub[0];
       for (j=0;j<subframe;j++)
@@ -759,10 +760,36 @@ int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
    }
    return bestLM;
 }
+
 #endif
 
+opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs)
+{
+   int new_size;
+   if (frame_size<Fs/400)
+      return -1;
+   if (variable_duration == OPUS_FRAMESIZE_ARG)
+      new_size = frame_size;
+   else if (variable_duration == OPUS_FRAMESIZE_VARIABLE)
+      new_size = Fs/50;
+   else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS)
+      new_size = IMAX(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS));
+   else
+      return -1;
+   if (new_size>frame_size)
+      return -1;
+   if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs &&
+            50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs)
+      return -1;
+   return new_size;
+}
+
 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
-                unsigned char *data, opus_int32 out_data_bytes, int lsb_depth)
+                unsigned char *data, opus_int32 out_data_bytes, int lsb_depth
+#ifndef FIXED_POINT
+                , AnalysisInfo *analysis_info
+#endif
+                )
 {
     void *silk_enc;
     CELTEncoder *celt_enc;
@@ -790,11 +817,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     opus_val16 HB_gain;
     opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */
     int total_buffer;
-    int perform_analysis=0;
-    int orig_frame_size;
-#ifndef FIXED_POINT
-    AnalysisInfo analysis_info;
-#endif
     VARDECL(opus_val16, tmp_prefill);
 
     ALLOC_STACK;
@@ -820,38 +842,15 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
     lsb_depth = IMIN(lsb_depth, st->lsb_depth);
 
-    orig_frame_size = IMIN(frame_size,st->Fs/50);
-    if (st->variable_duration)
-    {
-       int LM = 3;
+    st->voice_ratio = -1;
+
 #ifndef FIXED_POINT
-       LM = optimize_framesize(pcm, frame_size, st->channels, st->Fs, st->bitrate_bps,
-             st->analysis.prev_tonality, st->subframe_mem, delay_compensation, downmix_float);
-#endif
-       while ((st->Fs/400<<LM)>frame_size)
-          LM--;
-       frame_size = (st->Fs/400<<LM);
-    }
-#ifndef FIXED_POINT
-    /* Only perform analysis up to 20-ms frames. Longer ones will be split if
-       they're in CELT-only mode. */
-    analysis_info.valid = 0;
-    perform_analysis = st->silk_mode.complexity >= 7 && st->Fs==48000;
-    if (!perform_analysis)
+    st->detected_bandwidth = 0;
+    if (analysis_info->valid)
     {
-       st->voice_ratio = -1;
-       st->detected_bandwidth = 0;
-    } else if (frame_size <= st->Fs/50)
-    {
-       tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth);
-       if (frame_size > st->Fs/100)
-          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth);
-       if (analysis_info.valid)
-       {
-          if (st->signal_type == OPUS_AUTO)
-             st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
-          st->detected_bandwidth = analysis_info.opus_bandwidth;
-       }
+       if (st->signal_type == OPUS_AUTO)
+          st->voice_ratio = (int)floor(.5+100*(1-analysis_info->music_prob));
+       st->detected_bandwidth = analysis_info->opus_bandwidth;
     }
 #endif
 
@@ -1161,7 +1160,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
           /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */
           if (to_celt && i==nb_frames-1)
              st->user_forced_mode = MODE_CELT_ONLY;
-          tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth);
+          tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth
+#ifndef FIXED_POINT
+                , analysis_info
+#endif
+                );
           if (tmp_len<0)
           {
              RESTORE_STACK;
@@ -1187,16 +1190,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        RESTORE_STACK;
        return ret;
     }
-#ifndef FIXED_POINT
-    /* Perform analysis for 40-60 ms frames */
-    if (perform_analysis && frame_size > st->Fs/50)
-    {
-       int nb_analysis = frame_size/(st->Fs/100);
-       for (i=0;i<nb_analysis;i++)
-          tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth);
-       st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob));
-    }
-#endif
     curr_bandwidth = st->bandwidth;
 
     /* Chooses the appropriate mode for speech
@@ -1446,11 +1439,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
             {
                 opus_int32 bonus=0;
 #ifndef FIXED_POINT
-                if (st->variable_duration && orig_frame_size != frame_size)
+                if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != st->Fs/50)
                 {
-                   bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size);
-                   if (analysis_info.valid)
-                      bonus = bonus*(1.f+.5*analysis_info.tonality);
+                   bonus = (40*st->stream_channels+40)*(st->Fs/frame_size-50);
+                   if (analysis_info->valid)
+                      bonus = bonus*(1.f+.5*analysis_info->tonality);
                 }
 #endif
                 celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1));
@@ -1587,8 +1580,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
         if (ec_tell(&enc) <= 8*nb_compr_bytes)
         {
 #ifndef FIXED_POINT
-           if (perform_analysis)
-              celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info));
+           celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(analysis_info));
 #endif
            ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc);
            if (ret < 0)
@@ -1688,6 +1680,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size,
    VARDECL(opus_int16, in);
    ALLOC_STACK;
 
+   frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs);
    if(frame_size<0)
    {
       RESTORE_STACK;
@@ -1707,6 +1700,12 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size,
 opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int frame_size,
                 unsigned char *data, opus_int32 out_data_bytes)
 {
+   frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs);
+   if(frame_size<0)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
    return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16);
 }
 
@@ -1715,21 +1714,74 @@ opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int frame_size,
       unsigned char *data, opus_int32 max_data_bytes)
 {
    int i, ret;
+   const CELTMode *celt_mode;
+   int delay_compensation;
+   int lsb_depth;
    VARDECL(float, in);
+   AnalysisInfo analysis_info;
    ALLOC_STACK;
 
+   opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode));
+   if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+      delay_compensation = 0;
+   else
+      delay_compensation = st->delay_compensation;
+
+   lsb_depth = IMIN(16, st->lsb_depth);
+
+   analysis_info.valid = 0;
+   if (st->silk_mode.complexity >= 7 && st->Fs==48000)
+   {
+      frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset,
+            frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_int, &analysis_info);
+   } else {
+      frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs);
+   }
+   if(frame_size<0)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+
    ALLOC(in, frame_size*st->channels, float);
 
    for (i=0;i<frame_size*st->channels;i++)
       in[i] = (1.0f/32768)*pcm[i];
-   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16);
+   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16, &analysis_info);
    RESTORE_STACK;
    return ret;
 }
 opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size,
                       unsigned char *data, opus_int32 out_data_bytes)
 {
-   return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24);
+   const CELTMode *celt_mode;
+   int delay_compensation;
+   int lsb_depth;
+   AnalysisInfo analysis_info;
+
+   opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode));
+   if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+      delay_compensation = 0;
+   else
+      delay_compensation = st->delay_compensation;
+
+   lsb_depth = IMIN(24, st->lsb_depth);
+
+   analysis_info.valid = 0;
+   if (st->silk_mode.complexity >= 7 && st->Fs==48000)
+   {
+      frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset,
+            frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_float, &analysis_info);
+   } else {
+      frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs);
+   }
+   if(frame_size<0)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+
+   return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24, &analysis_info);
 
 }
 #endif
@@ -1998,15 +2050,13 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             *value = st->lsb_depth;
         }
         break;
-        case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST:
+        case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST:
         {
             opus_int32 value = va_arg(ap, opus_int32);
-            if (value<0 || value>1)
-               goto bad_arg;
             st->variable_duration = value;
         }
         break;
-        case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST:
+        case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST:
         {
             opus_int32 *value = va_arg(ap, opus_int32*);
             *value = st->variable_duration;
@@ -2041,6 +2091,15 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
             st->user_forced_mode = value;
         }
         break;
+
+        case CELT_GET_MODE_REQUEST:
+        {
+           const CELTMode ** value = va_arg(ap, const CELTMode**);
+           if (value==0)
+              goto bad_arg;
+           celt_encoder_ctl(celt_enc, CELT_GET_MODE(value));
+        }
+        break;
         default:
             /* fprintf(stderr, "unknown opus_encoder_ctl() request: %d", request);*/
             ret = OPUS_UNIMPLEMENTED;
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index c5fb3669..c6204185 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -36,8 +36,10 @@
 #include <stdarg.h>
 #include "float_cast.h"
 #include "os_support.h"
+#include "analysis.h"
 
 struct OpusMSEncoder {
+   TonalityAnalysisState analysis;
    ChannelLayout layout;
    int variable_duration;
    opus_int32 bitrate_bps;
@@ -105,6 +107,7 @@ int opus_multistream_encoder_init(
    st->layout.nb_coupled_streams = coupled_streams;
 
    st->bitrate_bps = OPUS_AUTO;
+   st->variable_duration = OPUS_FRAMESIZE_ARG;
    for (i=0;i<st->layout.nb_channels;i++)
       st->layout.mapping[i] = mapping[i];
    if (!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout))
@@ -187,6 +190,7 @@ static int opus_multistream_encode_native
     int lsb_depth
 #ifndef FIXED_POINT
     , downmix_func downmix
+    , const void *pcm_analysis
 #endif
 )
 {
@@ -202,10 +206,15 @@ static int opus_multistream_encode_native
    int orig_frame_size;
    int coded_channels;
    opus_int32 channel_rate;
+   opus_int32 complexity;
+   AnalysisInfo analysis_info;
+   const CELTMode *celt_mode;
    ALLOC_STACK;
 
    ptr = (char*)st + align(sizeof(OpusMSEncoder));
    opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs));
+   opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_COMPLEXITY(&complexity));
+   opus_encoder_ctl((OpusEncoder*)ptr, CELT_GET_MODE(&celt_mode));
 
    if (400*frame_size < Fs)
    {
@@ -213,24 +222,24 @@ static int opus_multistream_encode_native
       return OPUS_BAD_ARG;
    }
    orig_frame_size = IMIN(frame_size,Fs/50);
-   if (st->variable_duration)
+#ifndef FIXED_POINT
+   analysis_info.valid = 0;
+   if (complexity >= 7 && Fs==48000)
    {
-      int LM = 3;
-      int channels;
       opus_int32 delay_compensation;
+      int channels;
 
       channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
       opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation));
       delay_compensation -= Fs/400;
-#ifndef FIXED_POINT
-      LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps,
-            0.f, st->subframe_mem, delay_compensation, downmix);
-#endif
-      while ((Fs/400<<LM)>frame_size)
-         LM--;
-      frame_size = (Fs/400<<LM);
-   }
 
+      frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm_analysis,
+            frame_size, st->variable_duration, channels, Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix, &analysis_info);
+   } else
+#endif
+   {
+      frame_size = frame_size_select(frame_size, st->variable_duration, Fs);
+   }
    /* Validate frame_size before using it to allocate stack space.
       This mirrors the checks in opus_encode[_float](). */
    if (400*frame_size != Fs && 200*frame_size != Fs &&
@@ -262,10 +271,10 @@ static int opus_multistream_encode_native
       channel_rate = st->bitrate_bps/coded_channels;
    }
 #ifndef FIXED_POINT
-   if (st->variable_duration && orig_frame_size != frame_size)
+   if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != Fs/50)
    {
       opus_int32 bonus;
-      bonus = 60*(48000/frame_size-48000/orig_frame_size);
+      bonus = 60*(Fs/frame_size-50);
       channel_rate += bonus;
    }
 #endif
@@ -313,7 +322,11 @@ static int opus_multistream_encode_native
       /* Reserve three bytes for the last stream and four for the others */
       curr_max -= IMAX(0,4*(st->layout.nb_streams-s-1)-1);
       curr_max = IMIN(curr_max,MS_FRAME_TMP);
-      len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth);
+      len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth
+#ifndef FIXED_POINT
+            , &analysis_info
+#endif
+            );
       if (len<0)
       {
          RESTORE_STACK;
@@ -412,8 +425,9 @@ int opus_multistream_encode_float
     opus_int32 max_data_bytes
 )
 {
+   int channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
    return opus_multistream_encode_native(st, opus_copy_channel_in_float,
-      pcm, frame_size, data, max_data_bytes, 24, downmix_float);
+      pcm, frame_size, data, max_data_bytes, 24, downmix_float, pcm+channels*st->analysis.analysis_offset);
 }
 
 int opus_multistream_encode(
@@ -424,8 +438,9 @@ int opus_multistream_encode(
     opus_int32 max_data_bytes
 )
 {
+   int channels = st->layout.nb_streams + st->layout.nb_coupled_streams;
    return opus_multistream_encode_native(st, opus_copy_channel_in_short,
-      pcm, frame_size, data, max_data_bytes, 16, downmix_int);
+      pcm, frame_size, data, max_data_bytes, 16, downmix_int, pcm+channels*st->analysis.analysis_offset);
 }
 #endif
 
@@ -562,7 +577,7 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...)
       *value = (OpusEncoder*)ptr;
    }
    break;
-   case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST:
+   case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST:
    {
        opus_int32 value = va_arg(ap, opus_int32);
        if (value<0 || value>1)
@@ -570,7 +585,7 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...)
        st->variable_duration = value;
    }
    break;
-   case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST:
+   case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST:
    {
        opus_int32 *value = va_arg(ap, opus_int32*);
        *value = st->variable_duration;
diff --git a/src/opus_private.h b/src/opus_private.h
index 33a982e5..1da5748b 100644
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -31,6 +31,7 @@
 
 #include "arch.h"
 #include "opus.h"
+#include "celt.h"
 
 struct OpusRepacketizer {
    unsigned char toc;
@@ -82,17 +83,23 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev);
 #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x)
 
 typedef void (*downmix_func)(const void *, float *, int, int, int);
-void downmix_float(const void *_x, float *sub, int subframe, int i, int C);
-void downmix_int(const void *_x, float *sub, int subframe, int i, int C);
+void downmix_float(const void *_x, float *sub, int subframe, int offset, int C);
+void downmix_int(const void *_x, float *sub, int subframe, int offset, int C);
 
 int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs,
                 int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering,
-                void (*downmix)(const void *, float *, int, int, int));
+                downmix_func downmix);
 
 int encode_size(int size, unsigned char *data);
 
+opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);
+
 opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
-      unsigned char *data, opus_int32 out_data_bytes, int lsb_depth);
+      unsigned char *data, opus_int32 out_data_bytes, int lsb_depth
+#ifndef FIXED_POINT
+                , AnalysisInfo *analysis_info
+#endif
+      );
 
 int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len,
       opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited, int *packet_offset);

From 74f36b56dd8ad176fecd3344fd27ed9a99a44221 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Wed, 20 Feb 2013 22:31:49 -0500
Subject: [PATCH 13/15] oops s/IMAX/IMIN/

---
 src/opus_encoder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 3cee88b3..7cff8428 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -773,7 +773,7 @@ opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_
    else if (variable_duration == OPUS_FRAMESIZE_VARIABLE)
       new_size = Fs/50;
    else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS)
-      new_size = IMAX(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS));
+      new_size = IMIN(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS));
    else
       return -1;
    if (new_size>frame_size)

From 742aac10568839e08e5ee9fe3e92ba587c63e374 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Fri, 22 Feb 2013 16:44:56 -0500
Subject: [PATCH 14/15] Adds silence probability to speech/music detector

Avoids biasing the decision when it's all silence/noise.
---
 src/analysis.c  |  54 +++++------------
 src/mlp_data.c  | 153 ++++++++++++++++++++++++++++--------------------
 src/mlp_train.c |  21 ++++---
 3 files changed, 119 insertions(+), 109 deletions(-)

diff --git a/src/analysis.c b/src/analysis.c
index 54005d3a..34ea5107 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -141,7 +141,6 @@ static inline float fast_atan2f(float y, float x) {
 
 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
 {
-#if 1
    int pos;
    int curr_lookahead;
    float psum;
@@ -184,31 +183,6 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
    /*printf("%f %f\n", psum, info_out->music_prob);*/
 
    info_out->music_prob = psum;
-#else
-   /* If data not available, return invalid */
-   if (tonal->read_pos==tonal->write_pos)
-   {
-      info_out->valid=0;
-      return;
-   }
-
-   OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1);
-   tonal->read_subframe += len/480;
-   while (tonal->read_subframe>=4)
-   {
-      tonal->read_subframe -= 4;
-      tonal->read_pos++;
-   }
-   if (tonal->read_pos>=DETECT_SIZE)
-      tonal->read_pos-=DETECT_SIZE;
-   if (tonal->read_pos == tonal->write_pos)
-   {
-      tonal->read_pos = tonal->write_pos-1;
-      if (tonal->read_pos<0)
-         tonal->read_pos=DETECT_SIZE-1;
-      tonal->read_subframe = 3;
-   }
-#endif
 }
 
 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
@@ -234,7 +208,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con
     float slope=0;
     float frame_stationarity;
     float relativeE;
-    float frame_prob;
+    float frame_probs[2];
     float alpha, alphaE, alphaE2;
     float frame_loudness;
     float bandwidth_mask;
@@ -494,32 +468,34 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con
     features[24] = tonal->lowECount;
 
 #ifndef FIXED_POINT
-    mlp_process(&net, features, &frame_prob);
-    frame_prob = .5f*(frame_prob+1);
+    mlp_process(&net, features, frame_probs);
+    frame_probs[0] = .5f*(frame_probs[0]+1);
     /* Curve fitting between the MLP probability and the actual probability */
-    frame_prob = .01f + 1.21f*frame_prob*frame_prob - .23f*(float)pow(frame_prob, 10);
+    frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
+    frame_probs[1] = .5*frame_probs[1]+.5;
+    frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5;
 
-    /*printf("%f\n", frame_prob);*/
+    /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
     {
        float tau, beta;
        float p0, p1;
        float max_certainty;
        /* One transition every 3 minutes */
-       tau = .00005f;
-       beta = .1f;
+       tau = .00005f*frame_probs[1];
+       beta = .05f;
        max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition);
        max_certainty = 0;
        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
-       p0 *= (float)pow(1-frame_prob, beta);
-       p1 *= (float)pow(frame_prob, beta);
+       p0 *= (float)pow(1-frame_probs[0], beta);
+       p1 *= (float)pow(frame_probs[0], beta);
        tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1)));
        info->music_prob = tonal->music_prob;
-       info->music_prob = frame_prob;
+       info->music_prob = frame_probs[0];
 
        float psum=1e-20;
-       float speech0 = (float)pow(1-frame_prob, beta);
-       float music0  = (float)pow(frame_prob, beta);
+       float speech0 = (float)pow(1-frame_probs[0], beta);
+       float music0  = (float)pow(frame_probs[0], beta);
        if (tonal->count==1)
        {
           tonal->pspeech[0]=.5;
@@ -550,7 +526,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con
        for (i=1;i<DETECT_SIZE;i++)
           psum += tonal->pspeech[i];
 
-       /*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/
+       /*printf("%f\n", psum);*/
     }
     if (tonal->last_music != (tonal->music_prob>.5f))
        tonal->last_transition=0;
diff --git a/src/mlp_data.c b/src/mlp_data.c
index 5c13ca40..9085b85f 100644
--- a/src/mlp_data.c
+++ b/src/mlp_data.c
@@ -3,74 +3,103 @@
 
 #include "mlp.h"
 
-/* RMS error was 0.179835, seed was 1322103961 */
+/* RMS error was 0.138320, seed was 1361535663 */
 
-static const float weights[271] = {
+static const float weights[422] = {
 
 /* hidden layer */
-1.55597f, -0.0739792f, -0.0646761f, -0.099531f, -0.0794943f,
-0.0180174f, -0.0391354f, 0.0508224f, -0.0160169f, -0.0773263f,
--0.0300002f, -0.0865361f, 0.124477f, -0.28648f, -0.0860702f,
--0.518949f, -0.0873341f, -0.235393f, -0.907833f, -0.383573f,
-0.535388f, -0.57944f, 0.98116f, 0.8482f, 1.12426f,
--3.23721f, -0.647072f, -0.0265139f, 0.0711052f, -0.00125666f,
--0.0396181f, -0.44282f, -0.510495f, -0.201865f, 0.0134336f,
--0.167205f, -0.155406f, 0.00041678f, -0.00468705f, -0.0233224f,
-0.264279f, -0.301375f, 0.00234895f, 0.0144741f, -0.137535f,
-0.200323f, 0.0192027f, 3.19818f, 2.03495f, 0.705517f,
--4.6025f, -0.11485f, -0.792716f, 0.150714f, 0.10608f,
-0.240633f, 0.0690698f, 0.0695297f, 0.124819f, 0.0501433f,
-0.0460952f, 0.147639f, 0.10327f, 0.158007f, 0.113714f,
-0.0276191f, 0.0680749f, -0.130012f, 0.0796126f, 0.133067f,
-0.51495f, 0.747578f, -0.128742f, 5.98112f, -1.16698f,
--0.276492f, -1.73549f, -3.90234f, 2.01489f, -0.040118f,
--0.113002f, -0.146751f, -0.113569f, 0.0534873f, 0.0989832f,
-0.0872875f, 0.049266f, 0.0367557f, -0.00889148f, -0.0648461f,
--0.00190352f, 0.0143773f, 0.0259364f, -0.0592133f, -0.0672924f,
-0.1399f, -0.0987886f, -0.347402f, 0.101326f, -0.0680876f,
-0.469186f, 0.246922f, 10.4017f, 3.44846f, -0.662725f,
--0.0328208f, -0.0561274f, -0.0167744f, 0.00044282f, -0.0457645f,
--0.0408314f, -0.013113f, -0.0373873f, -0.0474122f, -0.0273745f,
--0.0308505f, 0.000582959f, -0.0421135f, 0.464859f, 0.196842f,
-0.320538f, 0.0435528f, -0.200168f, 0.266475f, -0.0853727f,
-1.20397f, 0.711542f, -1.04397f, -1.47759f, 1.26768f,
-0.446958f, 0.266477f, -0.30802f, 0.28431f, -0.118541f,
-0.00836345f, 0.0689026f, -0.0137996f, -0.0395417f, 0.26982f,
--0.206255f, 0.16066f, 0.114757f, 0.359587f, -0.106503f,
--0.0948534f, 0.175358f, -0.122966f, -0.0056675f, 0.483848f,
--0.134916f, -0.427567f, -0.140172f, -1.0866f, -2.73921f,
-0.549843f, 0.17685f, 0.0010675f, -0.00137386f, 0.0884424f,
--0.0698736f, -0.00174136f, 0.0718775f, -0.0396849f, 0.0448056f,
-0.0577853f, -0.0372353f, 0.134599f, 0.0260656f, 0.140322f,
-0.22704f, -0.020568f, -0.0142424f, -0.21723f, -0.997704f,
--0.884573f, -0.163495f, 2.33617f, 0.224142f, 0.19635f,
--0.957387f, 0.144678f, 1.47035f, -0.00700498f, -0.0472309f,
--0.0137848f, -0.0189145f, 0.00856479f, 0.0316965f, 0.00613373f,
-0.00209807f, 0.00270964f, -0.0490206f, 0.0105712f, -0.0465045f,
--0.0381532f, -0.0985268f, -0.108297f, 0.0146409f, -0.0040718f,
--0.0698572f, -0.380568f, -0.230479f, 3.98917f, 0.457652f,
--1.02355f, -7.4435f, -0.475314f, 1.61743f, 0.0254017f,
--0.00791293f, 0.047217f, 0.0220995f, -0.0304311f, 0.0052168f,
--0.0404054f, -0.0230293f, 0.00169229f, -0.0138178f, 0.0043137f,
--0.0598088f, -0.133601f, 0.0555138f, -0.177358f, -0.159856f,
--0.137281f, 0.108051f, -0.305973f, 0.393775f, 0.0747287f,
-0.783993f, -0.875086f, 1.06862f, 0.340519f, -0.352681f,
--0.0830912f, -0.100017f, 0.0729085f, -0.00829403f, 0.027489f,
--0.0779597f, 0.082286f, -0.164181f, -0.41519f, 0.00282335f,
--0.29573f, 0.125571f, 0.726935f, 0.392137f, 0.491348f,
-0.0723196f, -0.0259758f, -0.0636332f, -0.452384f, -0.000225974f,
--2.34001f, 2.45211f, -0.544628f, 5.62944f, -3.44507f,
+-0.0941125f, -0.302976f, -0.603555f, -0.19393f, -0.185983f,
+-0.601617f, -0.0465317f, -0.114563f, -0.103599f, -0.618938f,
+-0.317859f, -0.169949f, -0.0702885f, 0.148065f, 0.409524f,
+0.548432f, 0.367649f, -0.494393f, 0.764306f, -1.83957f,
+0.170849f, 12.786f, -1.08848f, -1.27284f, -16.2606f,
+24.1773f, -5.57454f, -0.17276f, -0.163388f, -0.224421f,
+-0.0948944f, -0.0728695f, -0.26557f, -0.100283f, -0.0515459f,
+-0.146142f, -0.120674f, -0.180655f, 0.12857f, 0.442138f,
+-0.493735f, 0.167767f, 0.206699f, -0.197567f, 0.417999f,
+1.50364f, -0.773341f, -10.0401f, 0.401872f, 2.97966f,
+15.2165f, -1.88905f, -1.19254f, 0.0285397f, -0.00405139f,
+0.0707565f, 0.00825699f, -0.0927269f, -0.010393f, -0.00428882f,
+-0.00489743f, -0.0709731f, -0.00255992f, 0.0395619f, 0.226424f,
+0.0325231f, 0.162175f, -0.100118f, 0.485789f, 0.12697f,
+0.285937f, 0.0155637f, 0.10546f, 3.05558f, 1.15059f,
+-1.00904f, -1.83088f, 3.31766f, -3.42516f, -0.119135f,
+-0.0405654f, 0.00690068f, 0.0179877f, -0.0382487f, 0.00597941f,
+-0.0183611f, 0.00190395f, -0.144322f, -0.0435671f, 0.000990594f,
+0.221087f, 0.142405f, 0.484066f, 0.404395f, 0.511955f,
+-0.237255f, 0.241742f, 0.35045f, -0.699428f, 10.3993f,
+2.6507f, -2.43459f, -4.18838f, 1.05928f, 1.71067f,
+0.00667811f, -0.0721335f, -0.0397346f, 0.0362704f, -0.11496f,
+-0.0235776f, 0.0082161f, -0.0141741f, -0.0329699f, -0.0354253f,
+0.00277404f, -0.290654f, -1.14767f, -0.319157f, -0.686544f,
+0.36897f, 0.478899f, 0.182579f, -0.411069f, 0.881104f,
+-4.60683f, 1.4697f, 0.335845f, -1.81905f, -30.1699f,
+5.55225f, 0.0019508f, -0.123576f, -0.0727332f, -0.0641597f,
+-0.0534458f, -0.108166f, -0.0937368f, -0.0697883f, -0.0275475f,
+-0.192309f, -0.110074f, 0.285375f, -0.405597f, 0.0926724f,
+-0.287881f, -0.851193f, -0.099493f, -0.233764f, -1.2852f,
+1.13611f, 3.12168f, -0.0699f, -1.86216f, 2.65292f,
+-7.31036f, 2.44776f, -0.00111802f, -0.0632786f, -0.0376296f,
+-0.149851f, 0.142963f, 0.184368f, 0.123433f, 0.0756158f,
+0.117312f, 0.0933395f, 0.0692163f, 0.0842592f, 0.0704683f,
+0.0589963f, 0.0942205f, -0.448862f, 0.0262677f, 0.270352f,
+-0.262317f, 0.172586f, 2.00227f, -0.159216f, 0.038422f,
+10.2073f, 4.15536f, -2.3407f, -0.0550265f, 0.00964792f,
+-0.141336f, 0.0274501f, 0.0343921f, -0.0487428f, 0.0950172f,
+-0.00775017f, -0.0372492f, -0.00548121f, -0.0663695f, 0.0960506f,
+-0.200008f, -0.0412827f, 0.58728f, 0.0515787f, 0.337254f,
+0.855024f, 0.668371f, -0.114904f, -3.62962f, -0.467477f,
+-0.215472f, 2.61537f, 0.406117f, -1.36373f, 0.0425394f,
+0.12208f, 0.0934502f, 0.123055f, 0.0340935f, -0.142466f,
+0.035037f, -0.0490666f, 0.0733208f, 0.0576672f, 0.123984f,
+-0.0517194f, -0.253018f, 0.590565f, 0.145849f, 0.315185f,
+0.221534f, -0.149081f, 0.216161f, -0.349575f, 24.5664f,
+-0.994196f, 0.614289f, -18.7905f, -2.83277f, -0.716801f,
+-0.347201f, 0.479515f, -0.246027f, 0.0758683f, 0.137293f,
+-0.17781f, 0.118751f, -0.00108329f, -0.237334f, 0.355732f,
+-0.12991f, -0.0547627f, -0.318576f, -0.325524f, 0.180494f,
+-0.0625604f, 0.141219f, 0.344064f, 0.37658f, -0.591772f,
+5.8427f, -0.38075f, 0.221894f, -1.41934f, -1.87943e+06f,
+1.34114f, 0.0283355f, -0.0447856f, -0.0211466f, -0.0256927f,
+0.0139618f, 0.0207934f, -0.0107666f, 0.0110969f, 0.0586069f,
+-0.0253545f, -0.0328433f, 0.11872f, -0.216943f, 0.145748f,
+0.119808f, -0.0915211f, -0.120647f, -0.0787719f, -0.143644f,
+-0.595116f, -1.152f, -1.25335f, -1.17092f, 4.34023f,
+-975268.f, -1.37033f, -0.0401123f, 0.210602f, -0.136656f,
+0.135962f, -0.0523293f, 0.0444604f, 0.0143928f, 0.00412666f,
+-0.0193003f, 0.218452f, -0.110204f, -2.02563f, 0.918238f,
+-2.45362f, 1.19542f, -0.061362f, -1.92243f, 0.308111f,
+0.49764f, 0.912356f, 0.209272f, -2.34525f, 2.19326f,
+-6.47121f, 1.69771f, -0.725123f, 0.0118929f, 0.0377944f,
+0.0554003f, 0.0226452f, -0.0704421f, -0.0300309f, 0.0122978f,
+-0.0041782f, -0.0686612f, 0.0313115f, 0.039111f, 0.364111f,
+-0.0945548f, 0.0229876f, -0.17414f, 0.329795f, 0.114714f,
+0.30022f, 0.106997f, 0.132355f, 5.79932f, 0.908058f,
+-0.905324f, -3.3561f, 0.190647f, 0.184211f, -0.673648f,
+0.231807f, -0.0586222f, 0.230752f, -0.438277f, 0.245857f,
+-0.17215f, 0.0876383f, -0.720512f, 0.162515f, 0.0170571f,
+0.101781f, 0.388477f, 1.32931f, 1.08548f, -0.936301f,
+-2.36958f, -6.71988f, -3.44376f, 2.13818f, 14.2318f,
+4.91459f, -3.09052f, -9.69191f, -0.768234f, 1.79604f,
+0.0549653f, 0.163399f, 0.0797025f, 0.0343933f, -0.0555876f,
+-0.00505673f, 0.0187258f, 0.0326628f, 0.0231486f, 0.15573f,
+0.0476223f, -0.254824f, 1.60155f, -0.801221f, 2.55496f,
+0.737629f, -1.36249f, -0.695463f, -2.44301f, -1.73188f,
+3.95279f, 1.89068f, 0.486087f, -11.3343f, 3.9416e+06f,
 
 /* output layer */
--3.13835f, 0.994751f, 0.444901f, 1.59518f, 1.23665f,
-3.37012f, -1.34606f, 1.99131f, 1.33476f, 1.3885f,
-1.12559f, };
+-0.381439, 0.12115, -0.906927, 2.93878, 1.6388,
+0.882811, 0.874344, 1.21726, -0.874545, 0.321706,
+0.785055, 0.946558, -0.575066, -3.46553, 0.884905,
+0.0924047, -9.90712, 0.391338, 0.160103, -2.04954,
+4.1455, 0.0684029, -0.144761, -0.285282, 0.379244,
+-1.1584, -0.0277241, -9.85, -4.82386, 3.71333,
+3.87308, 3.52558, };
 
-static const int topo[3] = {25, 10, 1};
+static const int topo[3] = {25, 15, 2};
 
 const MLP net = {
-	3,
-	topo,
-	weights
+    3,
+    topo,
+    weights
 };
-
diff --git a/src/mlp_train.c b/src/mlp_train.c
index 5fbbff08..2e9568ba 100644
--- a/src/mlp_train.c
+++ b/src/mlp_train.c
@@ -106,6 +106,7 @@ MLPTrain * mlp_init(int *topo, int nbLayers, float *inputs, float *outputs, int
 }
 
 #define MAX_NEURONS 100
+#define MAX_OUT 10
 
 double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamples, double *W0_grad, double *W1_grad, double *error_rate)
 {
@@ -120,7 +121,8 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp
 	double netOut[MAX_NEURONS];
 	double error[MAX_NEURONS];
 
-        *error_rate = 0;
+	for (i=0;i<net->topo[2];i++) /* output count; local outDim seems not yet set here */
+	   error_rate[i] = 0;
 	topo = net->topo;
 	inDim = net->topo[0];
 	hiddenDim = net->topo[1];
@@ -153,7 +155,7 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp
 			netOut[i] = tansig_approx(sum);
 			error[i] = out[i] - netOut[i];
 			rms += error[i]*error[i];
-			*error_rate += fabs(error[i])>1;
+			error_rate[i] += fabs(error[i])>1;
 			/*error[i] = error[i]/(1+fabs(error[i]));*/
 		}
 		/* Back-propagate error */
@@ -194,7 +196,7 @@ struct GradientArg {
 	double *W0_grad;
 	double *W1_grad;
 	double rms;
-	double error_rate;
+	double error_rate[MAX_OUT];
 };
 
 void *gradient_thread_process(void *_arg)
@@ -213,7 +215,7 @@ void *gradient_thread_process(void *_arg)
 		sem_wait(&sem_begin[arg->id]);
 		if (arg->done)
 			break;
-		arg->rms = compute_gradient(arg->net, arg->inputs, arg->outputs, arg->nbSamples, arg->W0_grad, arg->W1_grad, &arg->error_rate);
+		arg->rms = compute_gradient(arg->net, arg->inputs, arg->outputs, arg->nbSamples, arg->W0_grad, arg->W1_grad, arg->error_rate);
 		sem_post(&sem_end[arg->id]);
 	}
 	fprintf(stderr, "done\n");
@@ -295,7 +297,7 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
 	for (e=0;e<nbEpoch;e++)
 	{
 		double rms=0;
-                double error_rate = 0;
+		double error_rate[2] = {0,0};
 		for (i=0;i<NB_THREADS;i++)
 		{
 			sem_post(&sem_begin[i]);
@@ -306,7 +308,8 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
 		{
 			sem_wait(&sem_end[i]);
 			rms += args[i].rms;
-			error_rate += args[i].error_rate;
+			error_rate[0] += args[i].error_rate[0];
+            error_rate[1] += args[i].error_rate[1];
 			for (j=0;j<W0_size;j++)
 				W0_grad[j] += args[i].W0_grad[j];
 			for (j=0;j<W1_size;j++)
@@ -315,8 +318,9 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam
 
 		float mean_rate = 0, min_rate = 1e10;
 		rms = (rms/(outDim*nbSamples));
-		error_rate = (error_rate/(outDim*nbSamples));
-		fprintf (stderr, "%f (%f %f) ", error_rate, rms, best_rms);
+		error_rate[0] = (error_rate[0]/(nbSamples));
+        error_rate[1] = (error_rate[1]/(nbSamples));
+		fprintf (stderr, "%f %f (%f %f) ", error_rate[0], error_rate[1], rms, best_rms);
 		if (rms < best_rms)
 		{
 			best_rms = rms;
@@ -445,6 +449,7 @@ int main(int argc, char **argv)
 	outputs = malloc(nbOutputs*nbSamples*sizeof(*outputs));
 	
 	seed = time(NULL);
+    /*seed = 1361480659;*/
 	fprintf (stderr, "Seed is %u\n", seed);
 	srand(seed);
 	build_tansig_table();

From 73142b100adebe4321d3919ab657f510b7cfe40d Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 28 Feb 2013 15:30:51 -0500
Subject: [PATCH 15/15] Makes the speech/music probability estimation more
 conservative

This is done using an adaptive beta and an estimate of the speech
and music detection confidence
---
 src/analysis.c | 39 +++++++++++++++++++++++++++++++++------
 src/analysis.h |  4 ++++
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/src/analysis.c b/src/analysis.c
index 34ea5107..14b2246c 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -180,6 +180,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
       psum += tonal->pmusic[i];
    for (;i<DETECT_SIZE;i++)
       psum += tonal->pspeech[i];
+   psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
    /*printf("%f %f\n", psum, info_out->music_prob);*/
 
    info_out->music_prob = psum;
@@ -479,19 +480,22 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con
     {
        float tau, beta;
        float p0, p1;
-       float max_certainty;
        /* One transition every 3 minutes */
        tau = .00005f*frame_probs[1];
        beta = .05f;
-       max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition);
-       max_certainty = 0;
+       if (1) {
+          /* Adapt beta based on how "unexpected" the new prob is */
+          float p, q;
+          p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
+          q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
+          beta = .01+.05*ABS16(p-q)/(p*(1-q)+q*(1-p));
+       }
        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
        p0 *= (float)pow(1-frame_probs[0], beta);
        p1 *= (float)pow(frame_probs[0], beta);
-       tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1)));
+       tonal->music_prob = p1/(p0+p1);
        info->music_prob = tonal->music_prob;
-       info->music_prob = frame_probs[0];
 
        float psum=1e-20;
        float speech0 = (float)pow(1-frame_probs[0], beta);
@@ -526,7 +530,30 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, con
        for (i=1;i<DETECT_SIZE;i++)
           psum += tonal->pspeech[i];
 
-       /*printf("%f\n", psum);*/
+       /* Estimate our confidence in the speech/music decisions */
+       if (frame_probs[1]>.75)
+       {
+          if (tonal->music_prob>.9)
+          {
+             float adapt;
+             adapt = 1.f/(++tonal->music_confidence_count);
+             tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
+             tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
+          }
+          if (tonal->music_prob<.1)
+          {
+             float adapt;
+             adapt = 1.f/(++tonal->speech_confidence_count);
+             tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
+             tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
+          }
+       } else {
+          if (tonal->music_confidence_count==0)
+             tonal->music_confidence = .9;
+          if (tonal->speech_confidence_count==0)
+             tonal->speech_confidence = .1;
+       }
+       psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum));
     }
     if (tonal->last_music != (tonal->music_prob>.5f))
        tonal->last_transition=0;
diff --git a/src/analysis.h b/src/analysis.h
index 37a8bf40..7b17118c 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -65,6 +65,10 @@ typedef struct {
    int analysis_offset;
    float pspeech[DETECT_SIZE];
    float pmusic[DETECT_SIZE];
+   float speech_confidence;
+   float music_confidence;
+   int speech_confidence_count;
+   int music_confidence_count;
    int write_pos;
    int read_pos;
    int read_subframe;