Adds support for delayed decision

Variable duration option renamed to OPUS_SET_EXPERT_FRAME_DURATION, with new API. Also moves up the analysis to avoid having to do int->float conversion on large buffers.
2025-06-04 01:27:42 +00:00 · 2013-02-20 04:08:04 -05:00 · 2013-02-20 04:08:04 -05:00 · 51f4a32ec2
commit 51f4a32ec2
parent 10a34a5dd6
7 changed files with 395 additions and 122 deletions
--- a/src/analysis.c
+++ b/src/analysis.c
@ -139,10 +139,81 @@ static inline float fast_atan2f(float y, float x) {
   }
 }

-void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth)
+void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
+{
+#if 1
+   int pos;
+   int curr_lookahead;
+   float psum;
+   int i;
+
+   pos = tonal->read_pos;
+   curr_lookahead = tonal->write_pos-tonal->read_pos;
+   if (curr_lookahead<0)
+      curr_lookahead += DETECT_SIZE;
+
+   if (len > 480 && pos != tonal->write_pos)
+   {
+      pos++;
+      if (pos==DETECT_SIZE)
+         pos=0;
+   }
+   if (pos == tonal->write_pos)
+      pos--;
+   if (pos<0)
+      pos = DETECT_SIZE-1;
+   OPUS_COPY(info_out, &tonal->info[pos], 1);
+   tonal->read_subframe += len/120;
+   while (tonal->read_subframe>=4)
+   {
+      tonal->read_subframe -= 4;
+      tonal->read_pos++;
+   }
+   if (tonal->read_pos>=DETECT_SIZE)
+      tonal->read_pos-=DETECT_SIZE;
+
+   /* Compensate for the delay in the features themselves.
+      FIXME: Need a better estimate the 10 I just made up */
+   curr_lookahead = IMAX(curr_lookahead-10, 0);
+
+   psum=0;
+   for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
+      psum += tonal->pmusic[i];
+   for (;i<DETECT_SIZE;i++)
+      psum += tonal->pspeech[i];
+   /*printf("%f %f\n", psum, info_out->music_prob);*/
+
+   info_out->music_prob = psum;
+#else
+   /* If data not available, return invalid */
+   if (tonal->read_pos==tonal->write_pos)
+   {
+      info_out->valid=0;
+      return;
+   }
+
+   OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1);
+   tonal->read_subframe += len/480;
+   while (tonal->read_subframe>=4)
+   {
+      tonal->read_subframe -= 4;
+      tonal->read_pos++;
+   }
+   if (tonal->read_pos>=DETECT_SIZE)
+      tonal->read_pos-=DETECT_SIZE;
+   if (tonal->read_pos == tonal->write_pos)
+   {
+      tonal->read_pos = tonal->write_pos-1;
+      if (tonal->read_pos<0)
+         tonal->read_pos=DETECT_SIZE-1;
+      tonal->read_subframe = 3;
+   }
+#endif
+}
+
+void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix)
 {
    int i, b;
-    const CELTMode *mode;
    const kiss_fft_state *kfft;
    kiss_fft_cpx in[480], out[480];
    int N = 480, N2=240;
@ -171,8 +242,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
    float maxE = 0;
    float noise_floor;
    int remaining;
-
-    celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode));
+    AnalysisInfo *info;

    tonal->last_transition++;
    alpha = 1.f/IMIN(20, 1+tonal->count);
@ -181,23 +251,19 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc

    if (tonal->count<4)
       tonal->music_prob = .5;
-    kfft = mode->mdct.kfft[0];
+    kfft = celt_mode->mdct.kfft[0];
    if (tonal->count==0)
       tonal->mem_fill = 240;
-    if (C==1)
-    {
-       for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
-          tonal->inmem[i+tonal->mem_fill] = x[i];
-    } else {
-       for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++)
-          tonal->inmem[i+tonal->mem_fill] = x[2*i]+x[2*i+1];
-    }
+    downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C);
    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
    {
       tonal->mem_fill += len;
       /* Don't have enough to update the analysis */
       return;
    }
+    info = &tonal->info[tonal->write_pos++];
+    if (tonal->write_pos>=DETECT_SIZE)
+       tonal->write_pos-=DETECT_SIZE;

    for (i=0;i<N2;i++)
    {
@ -209,15 +275,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
    }
    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
-    if (C==1)
-    {
-       for (i=0;i<remaining;i++)
-          tonal->inmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i];
-    } else {
-       for (i=0;i<remaining;i++)
-          tonal->inmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)]
-                              + x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1];
-    }
+    downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C);
    tonal->mem_fill = 240 + remaining;
    opus_fft(kfft, in, out);

@ -450,13 +508,49 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
       tau = .00005f;
       beta = .1f;
       max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition);
+       max_certainty = 0;
       p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
       p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
       p0 *= (float)pow(1-frame_prob, beta);
       p1 *= (float)pow(frame_prob, beta);
       tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1)));
       info->music_prob = tonal->music_prob;
-       /*printf("%f %f\n", frame_prob, info->music_prob);*/
+       info->music_prob = frame_prob;
+
+       float psum=1e-20;
+       float speech0 = (float)pow(1-frame_prob, beta);
+       float music0  = (float)pow(frame_prob, beta);
+       if (tonal->count==1)
+       {
+          tonal->pspeech[0]=.5;
+          tonal->pmusic [0]=.5;
+       }
+       float s0, m0;
+       s0 = tonal->pspeech[0] + tonal->pspeech[1];
+       m0 = tonal->pmusic [0] + tonal->pmusic [1];
+       tonal->pspeech[0] = s0*(1-tau)*speech0;
+       tonal->pmusic [0] = m0*(1-tau)*music0;
+       for (i=1;i<DETECT_SIZE-1;i++)
+       {
+          tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
+          tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
+       }
+       tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
+       tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
+
+       for (i=0;i<DETECT_SIZE;i++)
+          psum += tonal->pspeech[i] + tonal->pmusic[i];
+       psum = 1.f/psum;
+       for (i=0;i<DETECT_SIZE;i++)
+       {
+          tonal->pspeech[i] *= psum;
+          tonal->pmusic [i] *= psum;
+       }
+       psum = tonal->pmusic[0];
+       for (i=1;i<DETECT_SIZE;i++)
+          psum += tonal->pspeech[i];
+
+       /*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/
    }
    if (tonal->last_music != (tonal->music_prob>.5f))
       tonal->last_transition=0;
@ -484,4 +578,48 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc
    /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
    info->noisiness = frame_noisiness;
    info->valid = 1;
+    if (info_out!=NULL)
+       OPUS_COPY(info_out, info, 1);
+}
+
+int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm,
+                        const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
+                        int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
+{
+   int offset;
+   int pcm_len;
+
+   /* Avoid overflow/wrap-around of the analysis buffer */
+   frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size);
+
+   pcm_len = frame_size - analysis->analysis_offset;
+   offset = 0;
+   do {
+      tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix);
+      offset += 480;
+      pcm_len -= 480;
+   } while (pcm_len>0);
+   analysis->analysis_offset = frame_size;
+
+   if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200)
+   {
+      int LM = 3;
+      LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps,
+            analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix);
+      while ((Fs/400<<LM)>frame_size)
+         LM--;
+      frame_size = (Fs/400<<LM);
+   } else {
+      frame_size = frame_size_select(frame_size, variable_duration, Fs);
+   }
+   if (frame_size<0)
+      return -1;
+   analysis->analysis_offset -= frame_size;
+
+   /* Only perform analysis up to 20-ms frames. Longer ones will be split if
+      they're in CELT-only mode. */
+   analysis_info->valid = 0;
+   tonality_get_info(analysis, analysis_info, frame_size);
+
+   return frame_size;
 }