Add rtcd for silk_inner_product_FLP()

2023-11-21 02:56:04 -05:00 · 2023-11-21 02:56:04 -05:00 · 239d223d84
commit 239d223d84
parent b93e4a149c
13 changed files with 65 additions and 28 deletions
--- a/silk/float/SigProc_FLP.h
+++ b/silk/float/SigProc_FLP.h
@ -74,7 +74,8 @@ void silk_autocorrelation_FLP(
    silk_float          *results,           /* O    result (length correlationCount)                            */
    const silk_float    *inputData,         /* I    input data to correlate                                     */
    opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 );

 opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced, 1 unvoiced                      */
@ -106,7 +107,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
    const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
    const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
    const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 );

 /* multiply a vector by a constant */
@ -132,7 +134,7 @@ double silk_inner_product_FLP_c(
 );

 #ifndef OVERRIDE_inner_product_FLP
-#define silk_inner_product_FLP(data1, data2, dataSize) silk_inner_product_FLP_c(data1, data2, dataSize)
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize))
 #endif


--- a/silk/float/autocorrelation_FLP.c
+++ b/silk/float/autocorrelation_FLP.c
@ -37,7 +37,8 @@ void silk_autocorrelation_FLP(
    silk_float          *results,           /* O    result (length correlationCount)                            */
    const silk_float    *inputData,         /* I    input data to correlate                                     */
    opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 )
 {
    opus_int i;
@ -47,6 +48,6 @@ void silk_autocorrelation_FLP(
    }

    for( i = 0; i < correlationCount; i++ ) {
-        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i );
+        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch );
    }
 }
--- a/silk/float/burg_modified_FLP.c
+++ b/silk/float/burg_modified_FLP.c
@ -42,7 +42,8 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
    const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
    const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
    const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 )
 {
    opus_int         k, n, s, reached_max_gain;
@ -60,7 +61,7 @@ silk_float silk_burg_modified_FLP(          /* O    returns residual energy
    for( s = 0; s < nb_subfr; s++ ) {
        x_ptr = x + s * subfr_length;
        for( n = 1; n < D + 1; n++ ) {
-            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
+            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch );
        }
    }
    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );
--- a/silk/float/corrMatrix_FLP.c
+++ b/silk/float/corrMatrix_FLP.c
@ -41,7 +41,8 @@ void silk_corrVector_FLP(
    const silk_float                *t,                                 /* I    Target vector [L]                           */
    const opus_int                  L,                                  /* I    Length of vecors                            */
    const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 )
 {
    opus_int lag;
@ -50,7 +51,7 @@ void silk_corrVector_FLP(
    ptr1 = &x[ Order - 1 ];                     /* Points to first sample of column 0 of X: X[:,0] */
    for( lag = 0; lag < Order; lag++ ) {
        /* Calculate X[:,lag]'*t */
-        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L );
+        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch );
        ptr1--;                                 /* Next column of X */
    }
 }
@ -60,7 +61,8 @@ void silk_corrMatrix_FLP(
    const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
    const opus_int                  L,                                  /* I    Length of vectors                           */
    const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 )
 {
    opus_int j, lag;
@ -79,7 +81,7 @@ void silk_corrMatrix_FLP(
    ptr2 = &x[ Order - 2 ];                     /* First sample of column 1 of X */
    for( lag = 1; lag < Order; lag++ ) {
        /* Calculate X[:,0]'*X[:,lag] */
-        energy = silk_inner_product_FLP( ptr1, ptr2, L );
+        energy = silk_inner_product_FLP( ptr1, ptr2, L, arch );
        matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy;
        matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy;
        /* Calculate X[:,j]'*X[:,j + lag] */
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@ -38,7 +38,8 @@ void silk_find_LPC_FLP(
    silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
    opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
    const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Inverse of max prediction gain              */
+    const silk_float                minInvGain,                         /* I    Inverse of max prediction gain              */
+    int                             arch
 )
 {
    opus_int    k, subfr_length;
@ -56,12 +57,12 @@ void silk_find_LPC_FLP(
    psEncC->indices.NLSFInterpCoef_Q2 = 4;

    /* Burg AR analysis for the full frame */
-    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );
+    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch );

    if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {
        /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than        */
        /* adding it to the residual energy of the first 10 ms in each iteration of the search below    */
-        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder );
+        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch );

        /* Convert to NLSFs */
        silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder );
--- a/silk/float/find_LTP_FLP.c
+++ b/silk/float/find_LTP_FLP.c
@ -38,7 +38,8 @@ void silk_find_LTP_FLP(
    const silk_float                r_ptr[],                            /* I    LPC residual                                */
    const opus_int                  lag[ MAX_NB_SUBFR ],                /* I    LTP lags                                    */
    const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 )
 {
    opus_int   k;
@ -50,8 +51,8 @@ void silk_find_LTP_FLP(
    XX_ptr = XX;
    for( k = 0; k < nb_subfr; k++ ) {
        lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 );
-        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr );
-        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr );
+        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch );
+        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch );
        xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER );
        temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f );
        silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER );
--- a/silk/float/find_pitch_lags_FLP.c
+++ b/silk/float/find_pitch_lags_FLP.c
@ -82,7 +82,7 @@ void silk_find_pitch_lags_FLP(
    silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );

    /* Calculate autocorrelation sequence */
-    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );
+    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );

    /* Add white noise, as a fraction of the energy */
    auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1;
--- a/silk/float/find_pred_coefs_FLP.c
+++ b/silk/float/find_pred_coefs_FLP.c
@ -63,7 +63,7 @@ void silk_find_pred_coefs_FLP(
        celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );

        /* LTP analysis */
-        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr );
+        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch );

        /* Quantize LTP gain parameters */
        silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
@ -102,7 +102,7 @@ void silk_find_pred_coefs_FLP(
    }

    /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
-    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain );
+    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch );

    /* Quantize LSFs */
    silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 );
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@ -138,7 +138,8 @@ void silk_find_LPC_FLP(
    silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
    opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
    const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Prediction gain from LTP (dB)               */
+    const silk_float                minInvGain,                         /* I    Prediction gain from LTP (dB)               */
+    int                             arch
 );

 /* LTP analysis */
@ -148,7 +149,8 @@ void silk_find_LTP_FLP(
    const silk_float                r_ptr[],                            /* I    LPC residual                                */
    const opus_int                  lag[  MAX_NB_SUBFR ],               /* I    LTP lags                                    */
    const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 );

 void silk_LTP_analysis_filter_FLP(
@ -221,7 +223,8 @@ void silk_corrMatrix_FLP(
    const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
    const opus_int                  L,                                  /* I    Length of vectors                           */
    const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 );

 /* Calculates correlation vector X'*t */
@ -230,7 +233,8 @@ void silk_corrVector_FLP(
    const silk_float                *t,                                 /* I    Target vector [L]                           */
    const opus_int                  L,                                  /* I    Length of vecors                            */
    const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 );

 /* Apply sine window to signal vector.  */
--- a/silk/float/noise_shape_analysis_FLP.c
+++ b/silk/float/noise_shape_analysis_FLP.c
@ -255,7 +255,7 @@ void silk_noise_shape_analysis_FLP(
                psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
        } else {
            /* Calculate regular auto correlation */
-            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );
+            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch );
        }

        /* Add white noise, as a fraction of energy */
--- a/silk/float/pitch_analysis_core_FLP.c
+++ b/silk/float/pitch_analysis_core_FLP.c
@ -291,7 +291,7 @@ opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced,
        for( j = 0; j < length_d_comp; j++ ) {
            d = d_comp[ j ];
            basis_ptr = target_ptr - d;
-            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz );
+            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch );
            if( cross_corr > 0.0f ) {
                energy = silk_energy_FLP( basis_ptr, sf_length_8kHz );
                C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) );
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@ -278,11 +278,18 @@ double silk_inner_product_FLP_avx2(
 #if defined (OPUS_X86_PRESUME_AVX2)

 #define OVERRIDE_inner_product_FLP
-#define silk_inner_product_FLP(data1, data2, dataSize) silk_inner_product_FLP_avx2(data1, data2, dataSize)
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_avx2(data1, data2, dataSize))

 #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)

-/*#define OVERRIDE_inner_product_FLP*/
+#define OVERRIDE_inner_product_FLP
+extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+);
+
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,(*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize))

 #endif

--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@ -32,6 +32,7 @@
 #include "celt/x86/x86cpu.h"
 #include "structs.h"
 #include "SigProc_FIX.h"
+#include "SigProc_FLP.h"
 #include "pitch.h"
 #include "main.h"

@ -156,4 +157,21 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
 };

 #endif
+
+#ifndef FIXED_POINT
+
+double (*const SILK_INNER_PRODUCT_FLP_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+) = {
+  silk_inner_product_FLP_c,                  /* non-sse */
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c, /* sse4.1 */
+  MAY_HAVE_AVX2( silk_inner_product_FLP )  /* avx */
+};
+
+#endif
+
 #endif