Split cwrsi() by pulses vs. dimensions.

This lets us cut out a bunch of work in the large _n, small _k case where most of the dimensions won't have any pulses. It also gets rid of all remaining usage of CELT_PVQ_U() in cwrsi(), leaving just a single test instead of lots of mins and maxes, and makes a bunch of the jump threading more obvious. This is a 1.6% decoder speedup on a 96 kbps comp48-stereo encode on a Cortex A8.
2013-06-14 23:57:19 -07:00 · 2013-06-14 23:57:19 -07:00 · ce15e65319
commit ce15e65319
parent 63f744d583
1 changed files with 35 additions and 20 deletions
--- a/celt/cwrs.c
+++ b/celt/cwrs.c
@@ -467,34 +467,49 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
  celt_assert(_k>0);
  celt_assert(_n>1);
  while(_n>2){
    opus_uint32 q;
    /*Lots of pulses case:*/
    if(_k>=_n){
      const opus_uint32 *row;
      row=CELT_PVQ_U_ROW[_n];
      /*Are the pulses in this dimension negative?*/
-    p=CELT_PVQ_U(_n,_k+1);
+      p=row[_k+1];
      s=-(_i>=p);
      _i-=p&s;
      /*Count how many pulses were placed in this dimension.*/
      k0=_k;
    p=CELT_PVQ_U(_n,_k);
    if(_k>_n){
      const opus_uint32 *row;
      opus_uint32        q;
      row=CELT_PVQ_U_ROW[_n];
      q=row[_n];
      if(q>_i){
        celt_assert(p>q);
        /*Setting p=q is unnecessary, but it helps the optimizer prove p>_i,
           allowing it to jump straight past the initial test in the second
           loop below.
          Once it's removed that first comparison, a smart compiler should be
           able to figure out that the result of this assignment isn't used and
           optimize it away anyway.*/
        p=q;
        _k=_n;
        do p=CELT_PVQ_U_ROW[--_k][_n];
        while(p>_i);
      }
-      else for(;p>_i;p=row[_k])_k--;
+      else for(p=row[_k];p>_i;p=row[_k])_k--;
    }
    for(;p>_i;p=CELT_PVQ_U_ROW[_k][_n])_k--;
      _i-=p;
      *_y++=(k0-_k+s)^s;
    }
    /*Lots of dimensions case:*/
    else{
      /*Are there any pulses in this dimension at all?*/
      p=CELT_PVQ_U_ROW[_k][_n];
      q=CELT_PVQ_U_ROW[_k+1][_n];
      if(p<=_i&&_i<q){
        _i-=p;
        *_y++=0;
      }
      else{
        /*Are the pulses in this dimension negative?*/
        s=-(_i>=q);
        _i-=q&s;
        /*Count how many pulses were placed in this dimension.*/
        k0=_k;
        do p=CELT_PVQ_U_ROW[--_k][_n];
        while(p>_i);
        _i-=p;
        *_y++=(k0-_k+s)^s;
      }
    }
    _n--;
  }
  /*_n==2*/