Update and re-enable SILK SSE4.1 optimisations

Francis Quiers 2020-05-18 17:15:05 +01:00 committed by Timothy B. Terriberry
parent 37aba6e9b3
commit c6f9857771
No known key found for this signature in database
GPG key ID: 5A1149C19C699E4F
17 changed files with 645 additions and 634 deletions

View file

@ -173,9 +173,9 @@ void silk_NSQ_c
RESTORE_STACK;
}
/***********************************/
/******************************/
/* silk_noise_shape_quantizer */
/***********************************/
/******************************/
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE

View file

@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
const opus_int len /* I vector lengths */
);
opus_int64 silk_inner_prod16_aligned_64_c(
opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
@ -613,8 +613,8 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
#define silk_inner_prod16(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
#endif
#include "Inlines.h"

View file

@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
*rate_dist_Q8 = silk_int32_MAX;
*res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
/* In things go really bad, at least *ind is set to something safe. */
/* If things go really bad, at least *ind is set to something safe. */
*ind = 0;
for( k = 0; k < L; k++ ) {
opus_int32 penalty;
@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
if( sum1_Q15 >= 0 ) {
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
/* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
/* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
*rate_dist_Q8 = bits_tot_Q8;

View file

@ -68,7 +68,7 @@ void silk_burg_modified_c(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
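
The new setup derives the down-shift directly from the 64-bit energy: with lz leading zeros, C0_64 occupies 64 - lz significant bits, and shifting right by 32 + 1 + N_BITS_HEAD_ROOM - lz leaves 31 - N_BITS_HEAD_ROOM bits, i.e. a signed 32-bit value with the requested headroom. A small sketch of that arithmetic, with clz64 as a portable stand-in for silk_CLZ64:

#include <stdint.h>
#include <stdio.h>

#define N_BITS_HEAD_ROOM 3              /* value after this commit; was 2 */

static int clz64(uint64_t v)            /* portable silk_CLZ64() stand-in */
{
    int n = 0;
    if (v == 0) return 64;
    while (!(v >> 63)) { v <<= 1; n++; }
    return n;
}

int main(void)
{
    int64_t C0_64 = (int64_t)1 << 39;   /* example energy: 40 significant bits */
    int lz = clz64((uint64_t)C0_64);    /* 24 */
    int rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;   /* 12 */

    /* (64 - lz) - rshifts == 31 - N_BITS_HEAD_ROOM bits remain, so the
       shifted energy fits a signed 32-bit word with headroom to spare. */
    printf("rshifts=%d C0=%d\n", rshifts,
           (int)(C0_64 >> (rshifts > 0 ? rshifts : 0)));
    return 0;
}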
@ -87,7 +87,7 @@ void silk_burg_modified_c(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@ -150,7 +150,7 @@ void silk_burg_modified_c(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
/* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
/* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
but they cancel each other and the real result seems to always fit in a 32-bit
signed integer. This was determined experimentally, not theoretically (unfortunately). */
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
@ -253,7 +253,7 @@ void silk_burg_modified_c(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {

View file

@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
#endif
}
opus_int64 silk_inner_prod16_aligned_64_c(
opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -42,7 +42,7 @@
#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
#define QA 25
#define N_BITS_HEAD_ROOM 2
#define N_BITS_HEAD_ROOM 3
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)
@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
int arch /* I Run-time architecture */
)
{
opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
opus_int k, n, s, lz, rshifts, reached_max_gain;
opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
const opus_int16 *x_ptr;
opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
opus_int64 C0_64;
__m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
__m128i CONST1 = _mm_set1_epi32(1);
@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
if( rshifts > MAX_RSHIFTS ) {
C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
silk_assert( C0 > 0 );
rshifts = MAX_RSHIFTS;
C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
if (rshifts > 0) {
C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
} else {
lz = silk_CLZ32( C0 ) - 1;
rshifts_extra = N_BITS_HEAD_ROOM - lz;
if( rshifts_extra > 0 ) {
rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
C0 = silk_RSHIFT32( C0, rshifts_extra );
} else {
rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
C0 = silk_LSHIFT32( C0, -rshifts_extra );
}
rshifts += rshifts_extra;
C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
}
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
if( rshifts > 0 ) {
@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
/* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
but they cancel each other and the real result seems to always fit in a 32-bit
signed integer. This was determined experimentally, not theoretically (unfortunately). */
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
}
tmp1 = -tmp1; /* Q17 */
@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
*res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
*res_nrg_Q = -rshifts;
}
#ifdef OPUS_CHECK_ASM
{
opus_int32 res_nrg_c = 0;
opus_int res_nrg_Q_c = 0;
opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
silk_burg_modified_c(
&res_nrg_c,
&res_nrg_Q_c,
A_Q16_c,
x,
minInvGain_Q30,
subfr_length,
nb_subfr,
D,
0
);
silk_assert( *res_nrg == res_nrg_c );
silk_assert( *res_nrg_Q == res_nrg_Q_c );
silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
}
#endif
}
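
The OPUS_CHECK_ASM block above is the verification pattern this commit adds throughout: run the vectorised function, rerun the scalar reference on copies of the same inputs, and assert bit-exact agreement. A self-contained sketch of the idea follows; widget_c/widget_fast are hypothetical, while the real code pairs e.g. silk_burg_modified_sse4_1() with silk_burg_modified_c().

#include <assert.h>
#include <string.h>
#define OPUS_CHECK_ASM                  /* enable the cross-check here */

static int widget_c(const short *in, int len, int *out)     /* reference */
{
    int i, s = 0;
    for (i = 0; i < len; i++) { out[i] = 2 * in[i]; s += out[i]; }
    return s;
}

static int widget_fast(const short *in, int len, int *out)
{
    return widget_c(in, len, out);      /* stand-in for a vector kernel */
}

static int widget(const short *in, int len, int *out)
{
    int ret = widget_fast(in, len, out);
#ifdef OPUS_CHECK_ASM
    {
        int out_c[16] = { 0 };
        int ret_c = widget_c(in, len, out_c);       /* reference rerun */
        assert(ret == ret_c);                       /* scalar result */
        assert(!memcmp(out, out_c, len * sizeof(*out))); /* full buffer */
    }
#endif
    return ret;
}

int main(void)
{
    short in[4] = { 1, -2, 3, -4 };
    int out[16];
    return widget(in, 4, out) == -4 ? 0 : 1;
}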

View file

@ -1,160 +0,0 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
void silk_warped_LPC_analysis_filter_FIX_sse4_1(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
const opus_int16 input[], /* I Input signal [length] */
const opus_int16 lambda_Q16, /* I Warping factor */
const opus_int length, /* I Length of input signal */
const opus_int order /* I Filter order (even) */
)
{
opus_int n, i;
opus_int32 acc_Q11, tmp1, tmp2;
/* Order must be even */
celt_assert( ( order & 1 ) == 0 );
if (order == 10)
{
if (0 == lambda_Q16)
{
__m128i coef_Q13_3210, coef_Q13_7654;
__m128i coef_Q13_0123, coef_Q13_4567;
__m128i state_0123, state_4567;
__m128i xmm_product1, xmm_product2;
__m128i xmm_tempa, xmm_tempb;
register opus_int32 sum;
register opus_int32 state_8, state_9, state_a;
register opus_int64 coef_Q13_8, coef_Q13_9;
celt_assert( length > 0 );
coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
state_8 = state[ 8 ];
state_9 = state[ 9 ];
state_a = 0;
for( n = 0; n < length; n++ )
{
xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
sum += _mm_cvtsi128_si32( xmm_tempa);
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
/* move right */
state_a = state_9;
state_9 = state_8;
state_8 = _mm_cvtsi128_si32( state_4567 );
state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
}
_mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
_mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
state[ 8 ] = state_8;
state[ 9 ] = state_9;
state[ 10 ] = state_a;
return;
}
}
for( n = 0; n < length; n++ ) {
/* Output of lowpass section */
tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
state[ 1 ] = tmp2;
acc_Q11 = silk_RSHIFT( order, 1 );
acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
/* Loop over allpass sections */
for( i = 2; i < order; i += 2 ) {
/* Output of allpass section */
tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
state[ i ] = tmp1;
acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
/* Output of allpass section */
tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
state[ i + 1 ] = tmp2;
acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
}
state[ order ] = tmp1;
acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
}
}
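
The scalar loop at the end of the deleted file chains first-order allpass sections with warping factor lambda: each section outputs state[i] + lambda * (state[i+1] - previous_output) and stores the previous output as its new state, and with lambda == 0 every section collapses to a plain unit delay, which is the case the removed SSE4.1 path special-cased. A floating-point restatement of one section update, for intuition only (the fixed-point code does this with silk_SMLAWB on Q13/Q16 values):

#include <stdio.h>

/* One warped-allpass section step; returns the section output. */
static float allpass_step(float *state_i, float state_next, float prev_out,
                          float lambda)
{
    float out = *state_i + lambda * (state_next - prev_out);
    *state_i = prev_out;        /* previous section output becomes state */
    return out;
}

int main(void)
{
    float s0 = 1.0f;
    /* 1.0 + 0.1 * (0.5 - 0.25) = 1.025 */
    printf("%f\n", allpass_step(&s0, 0.5f, 0.25f, 0.1f));
    return 0;
}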

View file

@ -37,39 +37,36 @@
#include "SigProc_FIX.h"
#include "pitch.h"
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
)
{
opus_int i, dataSize8;
opus_int i, dataSize4;
opus_int64 sum;
__m128i xmm_tempa;
__m128i inVec1_76543210, acc1;
__m128i inVec2_76543210, acc2;
__m128i xmm_prod_20, xmm_prod_31;
__m128i inVec1_3210, acc1;
__m128i inVec2_3210, acc2;
sum = 0;
dataSize8 = len & ~7;
dataSize4 = len & ~3;
acc1 = _mm_setzero_si128();
acc2 = _mm_setzero_si128();
for( i = 0; i < dataSize8; i += 8 ) {
inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
for( i = 0; i < dataSize4; i += 4 ) {
inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
/* only when all 4 operands are -32768 (0x8000), this results in wrap around */
inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
/* equal shift right 8 bytes */
inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
acc1 = _mm_add_epi64( acc1, xmm_tempa );
acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
}
acc1 = _mm_add_epi64( acc1, acc2 );
@ -84,5 +81,12 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
}
#ifdef OPUS_CHECK_ASM
{
opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
silk_assert( sum == sum_c );
}
#endif
return sum;
}
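
The key change above replaces _mm_madd_epi16, which sums two adjacent 16x16-bit products into one signed 32-bit lane, with sign-extended 32x32-to-64-bit multiplies (_mm_mul_epi32) feeding 64-bit accumulators. The worst case that motivates it, shown in plain C (two's-complement targets assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* One madd lane in the worst case: (-32768)^2 + (-32768)^2.
       Each product is 2^30 and fits, but their 32-bit sum is 2^31,
       which wraps to INT32_MIN in a signed lane. */
    int32_t p = (int32_t)-32768 * -32768;                 /* 2^30 */
    int32_t madd_lane = (int32_t)((uint32_t)p + (uint32_t)p);

    /* Widening first, as the rewritten kernel does, keeps it exact. */
    int64_t wide = (int64_t)p + (int64_t)p;               /* 2^31 */

    printf("madd lane: %d, 64-bit acc: %lld\n",
           madd_lane, (long long)wide);
    return 0;
}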

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -46,6 +46,7 @@ typedef struct {
opus_int32 Shape_Q14[ DECISION_DELAY ];
opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
opus_int32 LF_AR_Q14;
opus_int32 Diff_Q14;
opus_int32 Seed;
opus_int32 SeedInit;
opus_int32 RD_Q10;
@ -56,6 +57,7 @@ typedef struct {
opus_int32 RD_Q10;
opus_int32 xq_Q14;
opus_int32 LF_AR_Q14;
opus_int32 Diff_Q14;
opus_int32 sLTP_shp_Q14;
opus_int32 LPC_exc_Q14;
} NSQ_sample_struct;
@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
const opus_int32 x_Q3[], /* I Input in Q3 */
const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -115,11 +117,11 @@ void silk_NSQ_del_dec_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
VARDECL( opus_int32, delayedGain_Q10 );
VARDECL( NSQ_del_dec_struct, psDelDec );
NSQ_del_dec_struct *psDD;
#ifdef OPUS_CHECK_ASM
silk_nsq_state NSQ_c;
SideInfoIndices psIndices_c;
opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
const opus_int8 *const pulses_a = pulses;
#endif
SAVE_STACK;
#ifdef OPUS_CHECK_ASM
( void )pulses_a;
silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
silk_NSQ_del_dec_c(
psEncC,
&NSQ_c,
&psIndices_c,
x16,
pulses_c,
PredCoef_Q12,
LTPCoef_Q14,
AR_Q13,
HarmShapeGain_Q14,
Tilt_Q14,
LF_shp_Q14,
Gains_Q16,
pitchL,
Lambda_Q10,
LTP_scale_Q14
);
#endif
/* Set unvoiced lag to the previous one, overwrite later for voiced */
lag = NSQ->lagPrev;
@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
psDD->SeedInit = psDD->Seed;
psDD->RD_Q10 = 0;
psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
psDD->Diff_Q14 = NSQ->sDiff_shp_Q14;
psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
LSF_interpolation_flag = 1;
}
ALLOC( sLTP_Q15,
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
}
}
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
x_Q3 += psEncC->subfr_length;
x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
for( i = 0; i < decisionDelay; i++ ) {
last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
/* Update states */
NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
NSQ->sDiff_shp_Q14 = psDD->Diff_Q14;
NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
/* Save quantized speech signal */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
#ifdef OPUS_CHECK_ASM
silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
#endif
RESTORE_STACK;
}
@ -345,6 +387,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
VARDECL( NSQ_sample_pair, psSampleState );
NSQ_del_dec_struct *psDD;
NSQ_sample_struct *psSS;
@ -356,6 +399,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
celt_assert( nStatesDelayedDecision > 0 );
ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
int rdo_offset = (Lambda_Q10 >> 1) - 512;
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@ -407,7 +452,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Long-term shaping */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
shp_lag_ptr++;
@ -478,7 +523,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
/* setp 4 */
/* step 4 */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
@ -511,9 +556,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
/* Noise shape feedback */
silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
/* Output of lowpass section */
tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ 0 ] = tmp2;
@ -543,9 +588,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Input minus prediction plus noise feedback */
/* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
@ -559,6 +604,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
if (Lambda_Q10 > 2048) {
/* For aggressive RDO, the bias becomes more than one pulse. */
if (q1_Q10 > rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
} else if (q1_Q10 < -rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
} else if (q1_Q10 < 0) {
q1_Q0 = -1;
} else {
q1_Q0 = 0;
}
}
if( q1_Q0 > 0 ) {
q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
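
The added branch is a dead-zone quantizer for aggressive rate-distortion trade-offs: once Lambda_Q10 exceeds 2048 the rate penalty is worth more than one pulse, so the residual is pulled rdo_offset (Q10) toward zero before the integer level is taken, and anything inside the dead zone collapses to 0 or -1. A scalar restatement with a worked example (arithmetic right shift assumed, as silk_RSHIFT provides):

#include <stdio.h>

static int quant_level_Q0(int q1_Q10, int Lambda_Q10)
{
    int q1_Q0 = q1_Q10 >> 10;
    if (Lambda_Q10 > 2048) {
        int rdo_offset = (Lambda_Q10 >> 1) - 512;   /* bias in Q10 */
        if (q1_Q10 > rdo_offset)
            q1_Q0 = (q1_Q10 - rdo_offset) >> 10;
        else if (q1_Q10 < -rdo_offset)
            q1_Q0 = (q1_Q10 + rdo_offset) >> 10;
        else
            q1_Q0 = q1_Q10 < 0 ? -1 : 0;            /* dead zone */
    }
    return q1_Q0;
}

int main(void)
{
    /* Lambda_Q10 = 3072 -> rdo_offset = 1024, i.e. one full pulse. */
    printf("%d %d %d\n",
           quant_level_Q0(2600, 3072),      /* (2600-1024)>>10 = 1 */
           quant_level_Q0(900, 3072),       /* in dead zone -> 0 */
           quant_level_Q0(-900, 3072));     /* in dead zone -> -1 */
    return 0;
}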
@ -612,8 +669,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 0 ].xq_Q14 = xq_Q14;
@ -626,14 +684,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
exc_Q14 = -exc_Q14;
}
/* Add predictions */
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 1 ].xq_Q14 = xq_Q14;
@ -705,6 +763,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psDD = &psDelDec[ k ];
psSS = &psSampleState[ k ][ 0 ];
psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
psDD->Diff_Q14 = psSS->Diff_Q14;
psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
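
The Diff_Q14 field threaded through these hunks holds the quantizer's error signal rather than the raw synthesis output: both operands are brought to Q14 (x is Q10, so x << 4) before subtracting, and this error is what now drives the warped noise-shaping state sAR2_Q14. A toy sketch of the bookkeeping; the struct is hypothetical, the real ones being NSQ_del_dec_struct and NSQ_sample_struct:

#include <stdio.h>

struct toy_state { int Diff_Q14; int LF_AR_Q14; };

static void update_state(struct toy_state *st, int xq_Q14, int x_Q10,
                         int n_AR_Q14)
{
    st->Diff_Q14  = xq_Q14 - (x_Q10 << 4);   /* quantization error, Q14 */
    st->LF_AR_Q14 = st->Diff_Q14 - n_AR_Q14; /* error minus AR shaping */
}

int main(void)
{
    struct toy_state st;
    /* xq ~ 1.0061 in Q14, x = 1.0 in Q10 -> error = 100, then -40. */
    update_state(&st, 16384 + 100, 1024, 40);
    printf("Diff_Q14=%d LF_AR_Q14=%d\n", st.Diff_Q14, st.LF_AR_Q14);
    return 0;
}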
@ -728,7 +787,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
const opus_int32 x_Q3[], /* I Input in Q3 */
const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -742,51 +801,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
)
{
opus_int i, k, lag;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
NSQ_del_dec_struct *psDD;
__m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
__m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
silk_assert( inv_gain_Q31 != 0 );
/* Calculate gain adjustment factor */
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
} else {
gain_adj_Q16 = (opus_int32)1 << 16;
}
/* Scale input */
inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
/* prepare inv_gain_Q23 in packed 4 32-bits */
xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
/* prepare inv_gain_Q26 in packed 4 32-bits */
xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
/* equal shift right 4 bytes*/
xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@ -800,7 +849,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
/* Adjust for changing gain */
if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
/* Scale long-term shaping state */
{
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@ -841,6 +892,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
/* Scale scalar states */
psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@ -855,5 +907,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
}
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}
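
A recurring change in this file is the input moving from Q3 opus_int32 (x_Q3) to plain Q0 opus_int16 (x16), so the inverse gain is rounded to Q26 instead of Q23: silk_SMULWW drops 16 fractional bits, and Q0 * Q26 >> 16 lands in Q10 exactly as Q3 * Q23 >> 16 did. A worked example of that Q-format bookkeeping:

#include <stdint.h>
#include <stdio.h>

/* silk_SMULWW(a, b): (a * b) >> 16 with a 64-bit intermediate. */
static int32_t smulww(int32_t a, int32_t b)
{
    return (int32_t)(((int64_t)a * b) >> 16);
}

int main(void)
{
    int32_t inv_gain_Q31 = 1 << 29;            /* inverse gain of 0.25 */
    int32_t inv_gain_Q26 = inv_gain_Q31 >> 5;  /* silk_RSHIFT_ROUND(., 5) */
    int16_t x16 = 2000;                        /* Q0 input sample */
    int32_t x_sc_Q10 = smulww(x16, inv_gain_Q26);
    printf("%d\n", x_sc_Q10);   /* 512000 = (2000 * 0.25) in Q10 */
    return 0;
}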

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -39,7 +39,7 @@
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
const opus_int32 x_Q3[], /* I input in Q3 */
const opus_int16 x16[], /* I input */
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -65,6 +65,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
@ -74,11 +75,11 @@ void silk_NSQ_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
opus_int32 tmp1;
opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
#ifdef OPUS_CHECK_ASM
silk_nsq_state NSQ_c;
SideInfoIndices psIndices_c;
opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
const opus_int8 *const pulses_a = pulses;
#endif
SAVE_STACK;
#ifdef OPUS_CHECK_ASM
( void )pulses_a;
silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
silk_NSQ_c(
psEncC,
&NSQ_c,
&psIndices_c,
x16,
pulses_c,
PredCoef_Q12,
LTPCoef_Q14,
AR_Q13,
HarmShapeGain_Q14,
Tilt_Q14,
LF_shp_Q14,
Gains_Q16,
pitchL,
Lambda_Q10,
LTP_scale_Q14
);
#endif
NSQ->rand_seed = psIndices->Seed;
/* Set unvoiced lag to the previous one, overwrite later for voiced */
@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
LSF_interpolation_flag = 1;
}
ALLOC( sLTP_Q15,
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
/* Set up pointers to start of sub frame */
@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
}
}
silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
{
silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
offset_Q10, psEncC->subfr_length, &(table[32]) );
}
else
@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
}
x_Q3 += psEncC->subfr_length;
x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
/* Save quantized speech and noise shaping signals */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
#ifdef OPUS_CHECK_ASM
silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
#endif
RESTORE_STACK;
}
/***********************************/
/************************************/
/* silk_noise_shape_quantizer_10_16 */
/***********************************/
/************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int i;
opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
__m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
__m128i AR_shp_Q13_76543210;
int rdo_offset = (Lambda_Q10 >> 1) - 512;
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
xq_Q14 = psLPC_Q14[ 0 ];
sDiff_shp_Q14 = NSQ->sDiff_shp_Q14;
LTP_pred_Q13 = 0;
/* load a_Q12 */
@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 );
/* high part, use pmaddwd, results in 4 32-bit */
xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
silk_assert( lag > 0 || signalType != TYPE_VOICED );
celt_assert( lag > 0 || signalType != TYPE_VOICED );
/* Combine prediction and noise shaping signals */
tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
shp_lag_ptr++;
@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
if (Lambda_Q10 > 2048) {
/* For aggressive RDO, the bias becomes more than one pulse. */
if (q1_Q10 > rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
} else if (q1_Q10 < -rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
} else if (q1_Q10 < 0) {
q1_Q0 = -1;
} else {
q1_Q0 = 0;
}
}
q1_Q10 = table[q1_Q0][0];
q2_Q10 = table[q1_Q0][1];
@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Update states */
psLPC_Q14++;
*psLPC_Q14 = xq_Q14;
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@ -602,7 +659,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
const opus_int32 x_Q3[], /* I input in Q3 */
const opus_int16 x16[], /* I input */
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -614,50 +671,40 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
)
{
opus_int i, lag;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
__m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
__m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
silk_assert( inv_gain_Q31 != 0 );
/* Calculate gain adjustment factor */
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
} else {
gain_adj_Q16 = (opus_int32)1 << 16;
}
/* Scale input */
inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
/* prepare inv_gain_Q23 in packed 4 32-bits */
xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
/* prepare inv_gain_Q26 in packed 4 32-bits */
xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
/* equal shift right 4 bytes*/
xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@ -671,7 +718,9 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
/* Adjust for changing gain */
if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
/* Scale long-term shaping state */
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}

View file

@ -67,7 +67,7 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
#endif
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
@ -76,18 +76,18 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
#if defined(OPUS_X86_PRESUME_SSE4_1)
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
#define silk_inner_prod16(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
#else
extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len);
# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
# define silk_inner_prod16(inVec1, inVec2, len, arch) \
((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
#endif
#endif
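
When OPUS_X86_PRESUME_SSE4_1 is not defined, the extern table above picks the kernel at run time by detected CPU level. A self-contained sketch of that dispatch with toy names and arch numbering (the real table lives in the SILK x86 map file, and OPUS_ARCHMASK covers more levels):

#include <stdint.h>
#include <stdio.h>

#define MY_ARCHMASK 3

static int64_t ip16_c(const int16_t *a, const int16_t *b, int len)
{
    int64_t s = 0;
    int i;
    for (i = 0; i < len; i++) s += (int32_t)a[i] * b[i];
    return s;
}

static int64_t ip16_sse4_1(const int16_t *a, const int16_t *b, int len)
{
    return ip16_c(a, b, len);   /* pretend this were the SSE4.1 kernel */
}

static int64_t (*const IP16_IMPL[MY_ARCHMASK + 1])(
        const int16_t *, const int16_t *, int) = {
    ip16_c, ip16_c, ip16_c,     /* pre-SSE4.1 levels use the C kernel */
    ip16_sse4_1                 /* SSE4.1-capable CPUs get the fast one */
};

#define my_inner_prod16(a, b, len, arch) \
    ((*IP16_IMPL[(arch) & MY_ARCHMASK])(a, b, len))

int main(void)
{
    int16_t x[3] = { 3, -4, 5 };
    printf("%lld\n", (long long)my_inner_prod16(x, x, 3, 3)); /* 50 */
    return 0;
}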

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
SAVE_STACK;
#ifdef OPUS_CHECK_ASM
silk_encoder_state psEncC_c;
opus_int ret_c;
silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
#endif
/* Safety checks */
silk_assert( VAD_N_BANDS == 4 );
celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
}
if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
}
/* Power scaling */
if( speech_nrg <= 0 ) {
SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
} else if( speech_nrg < 32768 ) {
if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
} else {
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
}
} else if( speech_nrg < 16384 ) {
speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
/* square-root */
speech_nrg = silk_SQRT_APPROX( speech_nrg );
@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
}
#ifdef OPUS_CHECK_ASM
silk_assert( ret == ret_c );
silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
#endif
RESTORE_STACK;
return( ret );
}
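
The reworked power scaling halves speech_nrg once for 20 ms frames, presumably normalising the summed band energies to a 10 ms reference, so the branch below it needs only a single threshold (16384) and a single shift (16) before the approximate square root. A sketch of the flow, with an exact integer square root standing in for silk_SQRT_APPROX:

#include <stdint.h>
#include <stdio.h>

static int32_t isqrt32(int32_t v)       /* exact, unlike silk_SQRT_APPROX */
{
    int32_t r = 0;
    while ((int64_t)(r + 1) * (r + 1) <= v) r++;
    return r;
}

int main(void)
{
    int32_t speech_nrg = 9000;          /* example summed band energy */
    int frame_ms = 20;

    if (frame_ms == 20)
        speech_nrg >>= 1;               /* 4500: scale 20 ms to 10 ms */

    if (speech_nrg > 0 && speech_nrg < 16384) {
        speech_nrg <<= 16;              /* 4500 * 2^16 still fits int32 */
        printf("sqrt ~ %d\n", isqrt32(speech_nrg)); /* ~ sqrt(4500) * 256 */
    }
    return 0;
}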

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -38,105 +38,136 @@
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
)
{
opus_int k, gain_tmp_Q7;
const opus_int8 *cb_row_Q7;
opus_int16 diff_Q14[ 5 ];
opus_int32 sum1_Q14, sum2_Q16;
opus_int32 neg_xX_Q24[ 5 ];
opus_int32 sum1_Q15, sum2_Q24;
opus_int32 bits_res_Q8, bits_tot_Q8;
__m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
/* Negate and convert to new Q domain */
neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
__m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
/* Loop over codebook */
*rate_dist_Q14 = silk_int32_MAX;
*rate_dist_Q8 = silk_int32_MAX;
*res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
/* If things go really bad, at least *ind is set to something safe. */
*ind = 0;
for( k = 0; k < L; k++ ) {
opus_int32 penalty;
gain_tmp_Q7 = cb_gain_Q7[k];
diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
/* Weighted rate */
sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
/* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
/* Penalty for too large gain */
sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
silk_assert( sum1_Q14 >= 0 );
/* first row of XX_Q17 */
v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] );
/* first row of W_Q18 */
C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
/* second row of XX_Q17 */
sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] );
C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
/* third row of XX_Q17 */
sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] );
C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
/* fourth row of XX_Q17 */
sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] );
C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
/* second row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
/* third row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
/* fourth row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
/* last row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
silk_assert( sum1_Q14 >= 0 );
/* last row of XX_Q17 */
sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] );
/* find best */
if( sum1_Q14 < *rate_dist_Q14 ) {
*rate_dist_Q14 = sum1_Q14;
if( sum1_Q15 >= 0 ) {
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
/* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
*rate_dist_Q8 = bits_tot_Q8;
*res_nrg_Q15 = sum1_Q15 + penalty;
*ind = (opus_int8)k;
*gain_Q7 = gain_tmp_Q7;
}
}
/* Go to next cbk vector */
cb_row_Q7 += LTP_ORDER;
}
#ifdef OPUS_CHECK_ASM
{
opus_int8 ind_c = 0;
opus_int32 res_nrg_Q15_c = 0;
opus_int32 rate_dist_Q8_c = 0;
opus_int gain_Q7_c = 0;
silk_VQ_WMat_EC_c(
&ind_c,
&res_nrg_Q15_c,
&rate_dist_Q8_c,
&gain_Q7_c,
XX_Q17,
xX_Q17,
cb_Q7,
cb_gain_Q7,
cl_Q5,
subfr_len,
max_gain_Q7,
L
);
silk_assert( *ind == ind_c );
silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
silk_assert( *gain_Q7 == gain_Q7_c );
}
#endif
}
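For reference, below is a minimal plain-C sketch of the rate computation used in the search above (illustrative only, not part of the commit; the helper name is hypothetical). It assumes silk_lin2log() approximates 128 * log2(x), i.e. a Q7 log2. Under the high-rate assumption (6 dB ==> 1 bit/sample), bits/sample = 0.5 * log2( E / 2^15 ), so the Q7 log2 difference doubles as a Q8 per-sample bit count, and shifting cl_Q5 left by 3-1 instead of 3 halves the codelength term.

/* Hypothetical helper mirroring the bits_res_Q8 / bits_tot_Q8 lines above. */
static opus_int32 rate_bits_Q8_sketch( opus_int32 res_nrg_Q15, opus_int subfr_len, opus_uint8 cl_Q5 )
{
    opus_int32 bits_res_Q8, bits_tot_Q8;
    /* Residual-energy term: Q7 log2 of the energy, referenced to unit
       energy (2^15 in Q15), read directly as a Q8 per-sample bit count. */
    bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( res_nrg_Q15 ) - (15 << 7) );
    /* Codelength term: Q5 -> Q8 would be << 3; << (3-1) halves its weight. */
    bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5, 3-1 );
    return bits_tot_Q8;
}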

View file

@@ -34,66 +34,65 @@
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
# define OVERRIDE_silk_VQ_WMat_EC
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
);
#if defined OPUS_X86_PRESUME_SSE4_1
#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L, arch) \
((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L, arch) \
((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L))
#else
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
);
# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L, arch) \
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L, arch) \
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L))
#endif
#endif
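As a usage sketch (illustrative, not part of the commit; the call site and variable declarations are assumed), a caller obtains its architecture index once from the existing opus_select_arch() RTCD helper and passes it through; with RTCD enabled the macro above resolves through the per-arch table, so that once the SSE4.1 version is re-enabled, entries 0-2 run silk_VQ_WMat_EC_c and entries 3-4 may run silk_VQ_WMat_EC_sse4_1.

/* Hypothetical call site, assuming the in/out variables are declared as in
   the prototype above. */
int arch = opus_select_arch();  /* runtime CPU detection */
silk_VQ_WMat_EC( &ind, &res_nrg_Q15, &rate_dist_Q8, &gain_Q7,
                 XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5,
                 subfr_len, max_gain_Q7, L, arch );
/* With RTCD this expands to
   (*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])( ... );  */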
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
# define OVERRIDE_silk_NSQ
void silk_NSQ_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -116,11 +115,11 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -143,11 +142,11 @@ void silk_NSQ_del_dec_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -159,9 +158,9 @@ void silk_NSQ_del_dec_sse4_1(
#if defined OPUS_X86_PRESUME_SSE4_1
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#else
@@ -170,11 +169,11 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -184,13 +183,12 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#endif
#endif
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */

View file

@@ -41,16 +41,16 @@
#include "fixed/main_FIX.h"
opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
) = {
silk_inner_prod16_aligned_64_c, /* non-sse */
silk_inner_prod16_aligned_64_c,
silk_inner_prod16_aligned_64_c,
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */
silk_inner_prod16_c, /* non-sse */
silk_inner_prod16_c,
silk_inner_prod16_c,
MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */
};
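For clarity, the renamed table is indexed like the other RTCD tables in this file; a sketch of the dispatch (illustrative, not part of the commit; the macro itself lives in the matching x86 header) is:

/* Hypothetical expansion of silk_inner_prod16( inVec1, inVec2, len, arch )
   when RTCD is enabled -- same pattern as SILK_VQ_WMAT_EC_IMPL above. */
opus_int64 sum = (*SILK_INNER_PROD16_IMPL[ (arch) & OPUS_ARCHMASK ])( inVec1, inVec2, len );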
#endif
@@ -66,16 +66,15 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */
};
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */
};
#endif
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
) = {
silk_VQ_WMat_EC_c, /* non-sse */
silk_VQ_WMat_EC_c,
@@ -112,18 +110,16 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */
};
#endif
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
};
#endif
#if defined(FIXED_POINT)