optimisation: another bunch of simplifications to the "simple case" of the

alg_quant() search.
2008-04-15 17:31:23 +10:00 · 2008-04-15 17:31:23 +10:00 · ed317c94c3
commit ed317c94c3
parent 6ea8baed2d
2 changed files with 38 additions and 29 deletions
--- a/libcelt/cwrs.c
+++ b/libcelt/cwrs.c
@ -42,6 +42,7 @@
 #include "config.h"
 #endif

+#include "os_support.h"
 #include <stdlib.h>
 #include <string.h>
 #include "cwrs.h"
--- a/libcelt/vq.c
+++ b/libcelt/vq.c
@ -137,61 +137,67 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *
   while (pulsesLeft > 0)
   {
      int pulsesAtOnce=1;
-      int sign;
-      celt_word32_t Rxy, Ryy, Ryp;
-      celt_word32_t g;
-      celt_word32_t best_num;
-      celt_word16_t best_den;
      int best_id;
-      
+      celt_word16_t magnitude;
+#ifdef FIXED_POINT
+      int rshift;
+#endif
      /* Decide on how many pulses to find at once */
      pulsesAtOnce = (pulsesLeft*N_1)>>9; /* pulsesLeft/N */
      if (pulsesAtOnce<1)
         pulsesAtOnce = 1;
+#ifdef FIXED_POINT
+      rshift = yshift+1+celt_ilog2(K-pulsesLeft+pulsesAtOnce);
+#endif
+      magnitude = SHL16(pulsesAtOnce, yshift);

-      /* This should ensure that anything we can process will have a better score */
-      best_num = -SHR32(VERY_LARGE32,4);
-      best_den = 0;
      best_id = 0;
+      /* The squared magnitude term gets added anyway, so we might as well 
+         add it outside the loop */
+      yy = ADD32(yy, MULT16_16(magnitude,magnitude));
      /* Choose between fast and accurate strategy depending on where we are in the search */
      if (pulsesLeft>1)
      {
-         /* OPT: This loop is very CPU-intensive */
+         /* This should ensure that anything we can process will have a better score */
+         celt_word32_t best_num = -VERY_LARGE16;
+         celt_word16_t best_den = 0;
         j=0;
         do {
-            celt_word32_t num;
-            celt_word16_t den;
+            celt_word16_t Rxy, Ryy;
            /* Select sign based on X[j] alone */
-            sign = signx[j];
-            s = SHL16(sign*pulsesAtOnce, yshift);
+            s = signx[j]*magnitude;
            /* Temporary sums of the new pulse(s) */
-            Rxy = xy + MULT16_16(s,X[j]);
-            Ryy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s);
+            Rxy = SHR32(xy + MULT16_16(s,X[j]),rshift);
+            /* We're multiplying y[j] by two so we don't have to do it here */
+            Ryy = SHR32(yy + MULT16_16(s,y[j]),rshift);
            
-            /* Approximate score: we maximise Rxy/sqrt(Ryy) */
-            num = MULT16_16(ROUND16(Rxy,14),ABS16(ROUND16(Rxy,14)));
-            den = ROUND16(Ryy,14);
+            /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that 
+               Rxy is positive because the sign is pre-computed) */
+            Rxy = MULT16_16_Q15(Rxy,Rxy);
            /* The idea is to check for num/den >= best_num/best_den, but that way
               we can do it without any division */
-            /* OPT: Make sure to use a conditional move here */
-            if (MULT16_32_Q15(best_den, num) > MULT16_32_Q15(den, best_num))
+            /* OPT: Make sure to use conditional moves here */
+            if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))
            {
-               best_den = den;
-               best_num = num;
+               best_den = Ryy;
+               best_num = Rxy;
               best_id = j;
            }
         } while (++j<N); /* Promises we loop at least once */
      } else {
+         celt_word32_t g;
+         celt_word32_t best_num = -VERY_LARGE32;
         for (j=0;j<N;j++)
         {
+            celt_word32_t Rxy, Ryy, Ryp;
            celt_word32_t num;
            /* Select sign based on X[j] alone */
-            sign = signx[j];
-            s = SHL16(sign*pulsesAtOnce, yshift);
+            s = signx[j]*magnitude;
            /* Temporary sums of the new pulse(s) */
            Rxy = xy + MULT16_16(s,X[j]);
-            Ryy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s);
-            Ryp = yp + MULT16_16(s, P[j]);
+            /* We're multiplying y[j] by two so we don't have to do it here */
+            Ryy = yy + MULT16_16(s,y[j]);
+            Ryp = yp + MULT16_16(s,P[j]);

            /* Compute the gain such that ||p + g*y|| = 1 */
            g = MULT16_32_Q15(
@ -218,11 +224,13 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *

      /* Updating the sums of the new pulse(s) */
      xy = xy + MULT16_16(s,X[j]);
-      yy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s);
+      /* We're multiplying y[j] by two so we don't have to do it here */
+      yy = yy + MULT16_16(s,y[j]);
      yp = yp + MULT16_16(s, P[j]);

      /* Only now that we've made the final choice, update y/iy */
-      y[j] += s;
+      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
+      y[j] += 2*s;
      iy[j] += is;
      pulsesLeft -= pulsesAtOnce;
   }