Update and re-enable SILK SSE4.1 optimisations

Francis Quiers 2020-05-18 17:15:05 +01:00 committed by Timothy B. Terriberry
parent 37aba6e9b3
commit c6f9857771
No known key found for this signature in database
GPG key ID: 5A1149C19C699E4F
17 changed files with 645 additions and 634 deletions

View file

@ -173,9 +173,9 @@ void silk_NSQ_c
RESTORE_STACK;
}
/***********************************/
/******************************/
/* silk_noise_shape_quantizer */
/***********************************/
/******************************/
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE

View file

@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
const opus_int len /* I vector lengths */
);
opus_int64 silk_inner_prod16_aligned_64_c(
opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
@ -613,8 +613,8 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
#define silk_inner_prod16(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
#endif
#include "Inlines.h"

View file

@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
*rate_dist_Q8 = silk_int32_MAX;
*res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
/* In things go really bad, at least *ind is set to something safe. */
/* If things go really bad, at least *ind is set to something safe. */
*ind = 0;
for( k = 0; k < L; k++ ) {
opus_int32 penalty;
@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
if( sum1_Q15 >= 0 ) {
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
/* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
/* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
*rate_dist_Q8 = bits_tot_Q8;

View file

@ -68,7 +68,7 @@ void silk_burg_modified_c(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
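
The new setup derives the down-shift directly from the 64-bit energy: with lz leading zeros, C0_64 occupies 64 - lz significant bits, and shifting right by 32 + 1 + N_BITS_HEAD_ROOM - lz leaves 31 - N_BITS_HEAD_ROOM bits, i.e. a signed 32-bit value with the requested headroom. A small sketch of that arithmetic, with clz64 as a portable stand-in for silk_CLZ64:

#include <stdint.h>
#include <stdio.h>

#define N_BITS_HEAD_ROOM 3              /* value after this commit; was 2 */

static int clz64(uint64_t v)            /* portable silk_CLZ64() stand-in */
{
    int n = 0;
    if (v == 0) return 64;
    while (!(v >> 63)) { v <<= 1; n++; }
    return n;
}

int main(void)
{
    int64_t C0_64 = (int64_t)1 << 39;   /* example energy: 40 significant bits */
    int lz = clz64((uint64_t)C0_64);    /* 24 */
    int rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;   /* 12 */

    /* (64 - lz) - rshifts == 31 - N_BITS_HEAD_ROOM bits remain, so the
       shifted energy fits a signed 32-bit word with headroom to spare. */
    printf("rshifts=%d C0=%d\n", rshifts,
           (int)(C0_64 >> (rshifts > 0 ? rshifts : 0)));
    return 0;
}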
@ -87,7 +87,7 @@ void silk_burg_modified_c(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@ -150,7 +150,7 @@ void silk_burg_modified_c(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
/* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
/* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
but they cancel each other and the real result seems to always fit in a 32-bit
signed integer. This was determined experimentally, not theoretically (unfortunately). */
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
@ -253,7 +253,7 @@ void silk_burg_modified_c(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {

View file

@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
#endif
}
opus_int64 silk_inner_prod16_aligned_64_c(
opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -42,7 +42,7 @@
#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
#define QA 25
#define N_BITS_HEAD_ROOM 2
#define N_BITS_HEAD_ROOM 3
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)
@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
int arch /* I Run-time architecture */
)
{
opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
opus_int k, n, s, lz, rshifts, reached_max_gain;
opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
const opus_int16 *x_ptr;
opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
opus_int64 C0_64;
__m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
__m128i CONST1 = _mm_set1_epi32(1);
@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
if( rshifts > MAX_RSHIFTS ) {
C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
silk_assert( C0 > 0 );
rshifts = MAX_RSHIFTS;
C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
if (rshifts > 0) {
C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
} else {
lz = silk_CLZ32( C0 ) - 1;
rshifts_extra = N_BITS_HEAD_ROOM - lz;
if( rshifts_extra > 0 ) {
rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
C0 = silk_RSHIFT32( C0, rshifts_extra );
} else {
rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
C0 = silk_LSHIFT32( C0, -rshifts_extra );
}
rshifts += rshifts_extra;
C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
}
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
if( rshifts > 0 ) {
@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
/* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
but they cancel each other and the real result seems to always fit in a 32-bit
signed integer. This was determined experimentally, not theoretically (unfortunately). */
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
}
tmp1 = -tmp1; /* Q17 */
@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
*res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
*res_nrg_Q = -rshifts;
}
#ifdef OPUS_CHECK_ASM
{
opus_int32 res_nrg_c = 0;
opus_int res_nrg_Q_c = 0;
opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
silk_burg_modified_c(
&res_nrg_c,
&res_nrg_Q_c,
A_Q16_c,
x,
minInvGain_Q30,
subfr_length,
nb_subfr,
D,
0
);
silk_assert( *res_nrg == res_nrg_c );
silk_assert( *res_nrg_Q == res_nrg_Q_c );
silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
}
#endif
}
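
The OPUS_CHECK_ASM block above is the verification pattern this commit adds throughout: run the vectorised function, rerun the scalar reference on copies of the same inputs, and assert bit-exact agreement. A self-contained sketch of the idea follows; widget_c/widget_fast are hypothetical, while the real code pairs e.g. silk_burg_modified_sse4_1() with silk_burg_modified_c().

#include <assert.h>
#include <string.h>
#define OPUS_CHECK_ASM                  /* enable the cross-check here */

static int widget_c(const short *in, int len, int *out)     /* reference */
{
    int i, s = 0;
    for (i = 0; i < len; i++) { out[i] = 2 * in[i]; s += out[i]; }
    return s;
}

static int widget_fast(const short *in, int len, int *out)
{
    return widget_c(in, len, out);      /* stand-in for a vector kernel */
}

static int widget(const short *in, int len, int *out)
{
    int ret = widget_fast(in, len, out);
#ifdef OPUS_CHECK_ASM
    {
        int out_c[16] = { 0 };
        int ret_c = widget_c(in, len, out_c);       /* reference rerun */
        assert(ret == ret_c);                       /* scalar result */
        assert(!memcmp(out, out_c, len * sizeof(*out))); /* full buffer */
    }
#endif
    return ret;
}

int main(void)
{
    short in[4] = { 1, -2, 3, -4 };
    int out[16];
    return widget(in, 4, out) == -4 ? 0 : 1;
}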

View file

@ -1,160 +0,0 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
void silk_warped_LPC_analysis_filter_FIX_sse4_1(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
const opus_int16 input[], /* I Input signal [length] */
const opus_int16 lambda_Q16, /* I Warping factor */
const opus_int length, /* I Length of input signal */
const opus_int order /* I Filter order (even) */
)
{
opus_int n, i;
opus_int32 acc_Q11, tmp1, tmp2;
/* Order must be even */
celt_assert( ( order & 1 ) == 0 );
if (order == 10)
{
if (0 == lambda_Q16)
{
__m128i coef_Q13_3210, coef_Q13_7654;
__m128i coef_Q13_0123, coef_Q13_4567;
__m128i state_0123, state_4567;
__m128i xmm_product1, xmm_product2;
__m128i xmm_tempa, xmm_tempb;
register opus_int32 sum;
register opus_int32 state_8, state_9, state_a;
register opus_int64 coef_Q13_8, coef_Q13_9;
celt_assert( length > 0 );
coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
state_8 = state[ 8 ];
state_9 = state[ 9 ];
state_a = 0;
for( n = 0; n < length; n++ )
{
xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
sum += _mm_cvtsi128_si32( xmm_tempa);
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
/* move right */
state_a = state_9;
state_9 = state_8;
state_8 = _mm_cvtsi128_si32( state_4567 );
state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
}
_mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
_mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
state[ 8 ] = state_8;
state[ 9 ] = state_9;
state[ 10 ] = state_a;
return;
}
}
for( n = 0; n < length; n++ ) {
/* Output of lowpass section */
tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
state[ 1 ] = tmp2;
acc_Q11 = silk_RSHIFT( order, 1 );
acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
/* Loop over allpass sections */
for( i = 2; i < order; i += 2 ) {
/* Output of allpass section */
tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
state[ i ] = tmp1;
acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
/* Output of allpass section */
tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
state[ i + 1 ] = tmp2;
acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
}
state[ order ] = tmp1;
acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
}
}
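
The scalar loop at the end of the deleted file chains first-order allpass sections with warping factor lambda: each section outputs state[i] + lambda * (state[i+1] - previous_output) and stores the previous output as its new state, and with lambda == 0 every section collapses to a plain unit delay, which is the case the removed SSE4.1 path special-cased. A floating-point restatement of one section update, for intuition only (the fixed-point code does this with silk_SMLAWB on Q13/Q16 values):

#include <stdio.h>

/* One warped-allpass section step; returns the section output. */
static float allpass_step(float *state_i, float state_next, float prev_out,
                          float lambda)
{
    float out = *state_i + lambda * (state_next - prev_out);
    *state_i = prev_out;        /* previous section output becomes state */
    return out;
}

int main(void)
{
    float s0 = 1.0f;
    /* 1.0 + 0.1 * (0.5 - 0.25) = 1.025 */
    printf("%f\n", allpass_step(&s0, 0.5f, 0.25f, 0.1f));
    return 0;
}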

View file

@ -37,39 +37,36 @@
#include "SigProc_FIX.h"
#include "pitch.h"
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
)
{
opus_int i, dataSize8;
opus_int i, dataSize4;
opus_int64 sum;
__m128i xmm_tempa;
__m128i inVec1_76543210, acc1;
__m128i inVec2_76543210, acc2;
__m128i xmm_prod_20, xmm_prod_31;
__m128i inVec1_3210, acc1;
__m128i inVec2_3210, acc2;
sum = 0;
dataSize8 = len & ~7;
dataSize4 = len & ~3;
acc1 = _mm_setzero_si128();
acc2 = _mm_setzero_si128();
for( i = 0; i < dataSize8; i += 8 ) {
inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
for( i = 0; i < dataSize4; i += 4 ) {
inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
/* only when all 4 operands are -32768 (0x8000), this results in wrap around */
inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
/* equal shift right 8 bytes */
inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
acc1 = _mm_add_epi64( acc1, xmm_tempa );
acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
}
acc1 = _mm_add_epi64( acc1, acc2 );
@ -84,5 +81,12 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
}
#ifdef OPUS_CHECK_ASM
{
opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
silk_assert( sum == sum_c );
}
#endif
return sum;
}
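
The key change above replaces _mm_madd_epi16, which sums two adjacent 16x16-bit products into one signed 32-bit lane, with sign-extended 32x32-to-64-bit multiplies (_mm_mul_epi32) feeding 64-bit accumulators. The worst case that motivates it, shown in plain C (two's-complement targets assumed):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* One madd lane in the worst case: (-32768)^2 + (-32768)^2.
       Each product is 2^30 and fits, but their 32-bit sum is 2^31,
       which wraps to INT32_MIN in a signed lane. */
    int32_t p = (int32_t)-32768 * -32768;                 /* 2^30 */
    int32_t madd_lane = (int32_t)((uint32_t)p + (uint32_t)p);

    /* Widening first, as the rewritten kernel does, keeps it exact. */
    int64_t wide = (int64_t)p + (int64_t)p;               /* 2^31 */

    printf("madd lane: %d, 64-bit acc: %lld\n",
           madd_lane, (long long)wide);
    return 0;
}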

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -46,6 +46,7 @@ typedef struct {
opus_int32 Shape_Q14[ DECISION_DELAY ];
opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
opus_int32 LF_AR_Q14;
opus_int32 Diff_Q14;
opus_int32 Seed;
opus_int32 SeedInit;
opus_int32 RD_Q10;
@ -56,6 +57,7 @@ typedef struct {
opus_int32 RD_Q10;
opus_int32 xq_Q14;
opus_int32 LF_AR_Q14;
opus_int32 Diff_Q14;
opus_int32 sLTP_shp_Q14;
opus_int32 LPC_exc_Q14;
} NSQ_sample_struct;
@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
const opus_int32 x_Q3[], /* I Input in Q3 */
const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -115,11 +117,11 @@ void silk_NSQ_del_dec_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
VARDECL( opus_int32, delayedGain_Q10 );
VARDECL( NSQ_del_dec_struct, psDelDec );
NSQ_del_dec_struct *psDD;
#ifdef OPUS_CHECK_ASM
silk_nsq_state NSQ_c;
SideInfoIndices psIndices_c;
opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
const opus_int8 *const pulses_a = pulses;
#endif
SAVE_STACK;
#ifdef OPUS_CHECK_ASM
( void )pulses_a;
silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
silk_NSQ_del_dec_c(
psEncC,
&NSQ_c,
&psIndices_c,
x16,
pulses_c,
PredCoef_Q12,
LTPCoef_Q14,
AR_Q13,
HarmShapeGain_Q14,
Tilt_Q14,
LF_shp_Q14,
Gains_Q16,
pitchL,
Lambda_Q10,
LTP_scale_Q14
);
#endif
/* Set unvoiced lag to the previous one, overwrite later for voiced */
lag = NSQ->lagPrev;
@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
psDD->SeedInit = psDD->Seed;
psDD->RD_Q10 = 0;
psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
psDD->Diff_Q14 = NSQ->sDiff_shp_Q14;
psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
LSF_interpolation_flag = 1;
}
ALLOC( sLTP_Q15,
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
}
}
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
x_Q3 += psEncC->subfr_length;
x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
for( i = 0; i < decisionDelay; i++ ) {
last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
/* Update states */
NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
NSQ->sDiff_shp_Q14 = psDD->Diff_Q14;
NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
/* Save quantized speech signal */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
#ifdef OPUS_CHECK_ASM
silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
#endif
RESTORE_STACK;
}
@ -345,6 +387,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
VARDECL( NSQ_sample_pair, psSampleState );
NSQ_del_dec_struct *psDD;
NSQ_sample_struct *psSS;
@ -356,6 +399,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
celt_assert( nStatesDelayedDecision > 0 );
ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
int rdo_offset = (Lambda_Q10 >> 1) - 512;
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@ -407,7 +452,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Long-term shaping */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
shp_lag_ptr++;
@ -478,7 +523,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
/* setp 4 */
/* step 4 */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
@ -511,9 +556,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
/* Noise shape feedback */
silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
/* Output of lowpass section */
tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ 0 ] = tmp2;
@ -543,9 +588,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Input minus prediction plus noise feedback */
/* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
@ -559,6 +604,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
if (Lambda_Q10 > 2048) {
/* For aggressive RDO, the bias becomes more than one pulse. */
if (q1_Q10 > rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
} else if (q1_Q10 < -rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
} else if (q1_Q10 < 0) {
q1_Q0 = -1;
} else {
q1_Q0 = 0;
}
}
if( q1_Q0 > 0 ) {
q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
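
The added branch is a dead-zone quantizer for aggressive rate-distortion trade-offs: once Lambda_Q10 exceeds 2048 the rate penalty is worth more than one pulse, so the residual is pulled rdo_offset (Q10) toward zero before the integer level is taken, and anything inside the dead zone collapses to 0 or -1. A scalar restatement with a worked example (arithmetic right shift assumed, as silk_RSHIFT provides):

#include <stdio.h>

static int quant_level_Q0(int q1_Q10, int Lambda_Q10)
{
    int q1_Q0 = q1_Q10 >> 10;
    if (Lambda_Q10 > 2048) {
        int rdo_offset = (Lambda_Q10 >> 1) - 512;   /* bias in Q10 */
        if (q1_Q10 > rdo_offset)
            q1_Q0 = (q1_Q10 - rdo_offset) >> 10;
        else if (q1_Q10 < -rdo_offset)
            q1_Q0 = (q1_Q10 + rdo_offset) >> 10;
        else
            q1_Q0 = q1_Q10 < 0 ? -1 : 0;            /* dead zone */
    }
    return q1_Q0;
}

int main(void)
{
    /* Lambda_Q10 = 3072 -> rdo_offset = 1024, i.e. one full pulse. */
    printf("%d %d %d\n",
           quant_level_Q0(2600, 3072),      /* (2600-1024)>>10 = 1 */
           quant_level_Q0(900, 3072),       /* in dead zone -> 0 */
           quant_level_Q0(-900, 3072));     /* in dead zone -> -1 */
    return 0;
}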
@ -612,8 +669,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 0 ].xq_Q14 = xq_Q14;
@ -626,14 +684,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
exc_Q14 = -exc_Q14;
}
/* Add predictions */
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 1 ].xq_Q14 = xq_Q14;
@ -705,6 +763,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psDD = &psDelDec[ k ];
psSS = &psSampleState[ k ][ 0 ];
psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
psDD->Diff_Q14 = psSS->Diff_Q14;
psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
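
The Diff_Q14 field threaded through these hunks holds the quantizer's error signal rather than the raw synthesis output: both operands are brought to Q14 (x is Q10, so x << 4) before subtracting, and this error is what now drives the warped noise-shaping state sAR2_Q14. A toy sketch of the bookkeeping; the struct is hypothetical, the real ones being NSQ_del_dec_struct and NSQ_sample_struct:

#include <stdio.h>

struct toy_state { int Diff_Q14; int LF_AR_Q14; };

static void update_state(struct toy_state *st, int xq_Q14, int x_Q10,
                         int n_AR_Q14)
{
    st->Diff_Q14  = xq_Q14 - (x_Q10 << 4);   /* quantization error, Q14 */
    st->LF_AR_Q14 = st->Diff_Q14 - n_AR_Q14; /* error minus AR shaping */
}

int main(void)
{
    struct toy_state st;
    /* xq ~ 1.0061 in Q14, x = 1.0 in Q10 -> error = 100, then -40. */
    update_state(&st, 16384 + 100, 1024, 40);
    printf("Diff_Q14=%d LF_AR_Q14=%d\n", st.Diff_Q14, st.LF_AR_Q14);
    return 0;
}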
@ -728,7 +787,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
const opus_int32 x_Q3[], /* I Input in Q3 */
const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -742,51 +801,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
)
{
opus_int i, k, lag;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
NSQ_del_dec_struct *psDD;
__m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
__m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
silk_assert( inv_gain_Q31 != 0 );
/* Calculate gain adjustment factor */
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
} else {
gain_adj_Q16 = (opus_int32)1 << 16;
}
/* Scale input */
inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
/* prepare inv_gain_Q23 in packed 4 32-bits */
xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
/* prepare inv_gain_Q26 in packed 4 32-bits */
xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
/* equal shift right 4 bytes*/
xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@ -800,7 +849,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
/* Adjust for changing gain */
if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
/* Scale long-term shaping state */
{
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@ -841,6 +892,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
/* Scale scalar states */
psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@ -855,5 +907,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
}
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}
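
A recurring change in this file is the input moving from Q3 opus_int32 (x_Q3) to plain Q0 opus_int16 (x16), so the inverse gain is rounded to Q26 instead of Q23: silk_SMULWW drops 16 fractional bits, and Q0 * Q26 >> 16 lands in Q10 exactly as Q3 * Q23 >> 16 did. A worked example of that Q-format bookkeeping:

#include <stdint.h>
#include <stdio.h>

/* silk_SMULWW(a, b): (a * b) >> 16 with a 64-bit intermediate. */
static int32_t smulww(int32_t a, int32_t b)
{
    return (int32_t)(((int64_t)a * b) >> 16);
}

int main(void)
{
    int32_t inv_gain_Q31 = 1 << 29;            /* inverse gain of 0.25 */
    int32_t inv_gain_Q26 = inv_gain_Q31 >> 5;  /* silk_RSHIFT_ROUND(., 5) */
    int16_t x16 = 2000;                        /* Q0 input sample */
    int32_t x_sc_Q10 = smulww(x16, inv_gain_Q26);
    printf("%d\n", x_sc_Q10);   /* 512000 = (2000 * 0.25) in Q10 */
    return 0;
}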

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -39,7 +39,7 @@
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
const opus_int32 x_Q3[], /* I input in Q3 */
const opus_int16 x16[], /* I input */
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -65,6 +65,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
@ -74,11 +75,11 @@ void silk_NSQ_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
opus_int32 tmp1;
opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
#ifdef OPUS_CHECK_ASM
silk_nsq_state NSQ_c;
SideInfoIndices psIndices_c;
opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
const opus_int8 *const pulses_a = pulses;
#endif
SAVE_STACK;
#ifdef OPUS_CHECK_ASM
( void )pulses_a;
silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
silk_NSQ_c(
psEncC,
&NSQ_c,
&psIndices_c,
x16,
pulses_c,
PredCoef_Q12,
LTPCoef_Q14,
AR_Q13,
HarmShapeGain_Q14,
Tilt_Q14,
LF_shp_Q14,
Gains_Q16,
pitchL,
Lambda_Q10,
LTP_scale_Q14
);
#endif
NSQ->rand_seed = psIndices->Seed;
/* Set unvoiced lag to the previous one, overwrite later for voiced */
@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
LSF_interpolation_flag = 1;
}
ALLOC( sLTP_Q15,
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
/* Set up pointers to start of sub frame */
@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
}
}
silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
{
silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
offset_Q10, psEncC->subfr_length, &(table[32]) );
}
else
@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
}
x_Q3 += psEncC->subfr_length;
x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
/* Save quantized speech and noise shaping signals */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
#ifdef OPUS_CHECK_ASM
silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
#endif
RESTORE_STACK;
}
/***********************************/
/************************************/
/* silk_noise_shape_quantizer_10_16 */
/***********************************/
/************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int i;
opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
__m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
__m128i AR_shp_Q13_76543210;
int rdo_offset = (Lambda_Q10 >> 1) - 512;
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
xq_Q14 = psLPC_Q14[ 0 ];
sDiff_shp_Q14 = NSQ->sDiff_shp_Q14;
LTP_pred_Q13 = 0;
/* load a_Q12 */
@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 );
/* high part, use pmaddwd, results in 4 32-bit */
xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
silk_assert( lag > 0 || signalType != TYPE_VOICED );
celt_assert( lag > 0 || signalType != TYPE_VOICED );
/* Combine prediction and noise shaping signals */
tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
shp_lag_ptr++;
@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
if (Lambda_Q10 > 2048) {
/* For aggressive RDO, the bias becomes more than one pulse. */
if (q1_Q10 > rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
} else if (q1_Q10 < -rdo_offset) {
q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
} else if (q1_Q10 < 0) {
q1_Q0 = -1;
} else {
q1_Q0 = 0;
}
}
q1_Q10 = table[q1_Q0][0];
q2_Q10 = table[q1_Q0][1];
@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Update states */
psLPC_Q14++;
*psLPC_Q14 = xq_Q14;
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@ -602,7 +659,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
const opus_int32 x_Q3[], /* I input in Q3 */
const opus_int16 x16[], /* I input */
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@ -614,50 +671,40 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
)
{
opus_int i, lag;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
__m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
__m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
silk_assert( inv_gain_Q31 != 0 );
/* Calculate gain adjustment factor */
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
} else {
gain_adj_Q16 = (opus_int32)1 << 16;
}
/* Scale input */
inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
/* prepare inv_gain_Q23 in packed 4 32-bits */
xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
/* prepare inv_gain_Q26 in packed 4 32-bits */
xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
/* equal shift right 4 bytes*/
xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@ -671,7 +718,9 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
/* Adjust for changing gain */
if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
/* Scale long-term shaping state */
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
}
/* Save inverse gain */
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}

View file

@ -67,7 +67,7 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
#endif
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
@ -76,18 +76,18 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
#if defined(OPUS_X86_PRESUME_SSE4_1)
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
#define silk_inner_prod16(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
#else
extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len);
# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
# define silk_inner_prod16(inVec1, inVec2, len, arch) \
((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
#endif
#endif
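
When OPUS_X86_PRESUME_SSE4_1 is not defined, the extern table above picks the kernel at run time by detected CPU level. A self-contained sketch of that dispatch with toy names and arch numbering (the real table lives in the SILK x86 map file, and OPUS_ARCHMASK covers more levels):

#include <stdint.h>
#include <stdio.h>

#define MY_ARCHMASK 3

static int64_t ip16_c(const int16_t *a, const int16_t *b, int len)
{
    int64_t s = 0;
    int i;
    for (i = 0; i < len; i++) s += (int32_t)a[i] * b[i];
    return s;
}

static int64_t ip16_sse4_1(const int16_t *a, const int16_t *b, int len)
{
    return ip16_c(a, b, len);   /* pretend this were the SSE4.1 kernel */
}

static int64_t (*const IP16_IMPL[MY_ARCHMASK + 1])(
        const int16_t *, const int16_t *, int) = {
    ip16_c, ip16_c, ip16_c,     /* pre-SSE4.1 levels use the C kernel */
    ip16_sse4_1                 /* SSE4.1-capable CPUs get the fast one */
};

#define my_inner_prod16(a, b, len, arch) \
    ((*IP16_IMPL[(arch) & MY_ARCHMASK])(a, b, len))

int main(void)
{
    int16_t x[3] = { 3, -4, 5 };
    printf("%lld\n", (long long)my_inner_prod16(x, x, 3, 3)); /* 50 */
    return 0;
}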

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
SAVE_STACK;
#ifdef OPUS_CHECK_ASM
silk_encoder_state psEncC_c;
opus_int ret_c;
silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
#endif
/* Safety checks */
silk_assert( VAD_N_BANDS == 4 );
celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
}
if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
}
/* Power scaling */
if( speech_nrg <= 0 ) {
SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
} else if( speech_nrg < 32768 ) {
if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
} else {
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
}
} else if( speech_nrg < 16384 ) {
speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
/* square-root */
speech_nrg = silk_SQRT_APPROX( speech_nrg );
@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
}
#ifdef OPUS_CHECK_ASM
silk_assert( ret == ret_c );
silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
#endif
RESTORE_STACK;
return( ret );
}
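
The reworked power scaling halves speech_nrg once for 20 ms frames, presumably normalising the summed band energies to a 10 ms reference, so the branch below it needs only a single threshold (16384) and a single shift (16) before the approximate square root. A sketch of the flow, with an exact integer square root standing in for silk_SQRT_APPROX:

#include <stdint.h>
#include <stdio.h>

static int32_t isqrt32(int32_t v)       /* exact, unlike silk_SQRT_APPROX */
{
    int32_t r = 0;
    while ((int64_t)(r + 1) * (r + 1) <= v) r++;
    return r;
}

int main(void)
{
    int32_t speech_nrg = 9000;          /* example summed band energy */
    int frame_ms = 20;

    if (frame_ms == 20)
        speech_nrg >>= 1;               /* 4500: scale 20 ms to 10 ms */

    if (speech_nrg > 0 && speech_nrg < 16384) {
        speech_nrg <<= 16;              /* 4500 * 2^16 still fits int32 */
        printf("sqrt ~ %d\n", isqrt32(speech_nrg)); /* ~ sqrt(4500) * 256 */
    }
    return 0;
}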

View file

@ -1,5 +1,5 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
/* Copyright (c) 2014-2020, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -38,105 +38,136 @@
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
)
{
opus_int k, gain_tmp_Q7;
const opus_int8 *cb_row_Q7;
opus_int16 diff_Q14[ 5 ];
opus_int32 sum1_Q14, sum2_Q16;
opus_int32 neg_xX_Q24[ 5 ];
opus_int32 sum1_Q15, sum2_Q24;
opus_int32 bits_res_Q8, bits_tot_Q8;
__m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
/* Negate and convert to new Q domain */
neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
__m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
/* Loop over codebook */
*rate_dist_Q14 = silk_int32_MAX;
*rate_dist_Q8 = silk_int32_MAX;
*res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
/* If things go really bad, at least *ind is set to something safe. */
*ind = 0;
for( k = 0; k < L; k++ ) {
opus_int32 penalty;
gain_tmp_Q7 = cb_gain_Q7[k];
diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
/* Weighted rate */
sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
/* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
/* Penalty for too large gain */
sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
silk_assert( sum1_Q14 >= 0 );
/* first row of XX_Q17 */
v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] );
/* first row of W_Q18 */
C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
/* second row of XX_Q17 */
sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] );
C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
/* third row of XX_Q17 */
sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] );
C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
/* fourth row of XX_Q17 */
sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] );
C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
/* second row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
/* third row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
/* fourth row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
/* last row of W_Q18 */
sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
silk_assert( sum1_Q14 >= 0 );
/* last row of XX_Q17 */
sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] );
/* find best */
if( sum1_Q14 < *rate_dist_Q14 ) {
*rate_dist_Q14 = sum1_Q14;
if( sum1_Q15 >= 0 ) {
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
/* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
*rate_dist_Q8 = bits_tot_Q8;
*res_nrg_Q15 = sum1_Q15 + penalty;
*ind = (opus_int8)k;
*gain_Q7 = gain_tmp_Q7;
}
}
/* Go to next cbk vector */
cb_row_Q7 += LTP_ORDER;
}
#ifdef OPUS_CHECK_ASM
{
opus_int8 ind_c = 0;
opus_int32 res_nrg_Q15_c = 0;
opus_int32 rate_dist_Q8_c = 0;
opus_int gain_Q7_c = 0;
silk_VQ_WMat_EC_c(
&ind_c,
&res_nrg_Q15_c,
&rate_dist_Q8_c,
&gain_Q7_c,
XX_Q17,
xX_Q17,
cb_Q7,
cb_gain_Q7,
cl_Q5,
subfr_len,
max_gain_Q7,
L
);
silk_assert( *ind == ind_c );
silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
silk_assert( *gain_Q7 == gain_Q7_c );
}
#endif
}
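For reference, below is a minimal plain-C sketch of the rate computation used in the search above (illustrative only, not part of the commit; the helper name is hypothetical). It assumes silk_lin2log() approximates 128 * log2(x), i.e. a Q7 log2. Under the high-rate assumption (6 dB ==> 1 bit/sample), bits/sample = 0.5 * log2( E / 2^15 ), so the Q7 log2 difference doubles as a Q8 per-sample bit count, and shifting cl_Q5 left by 3-1 instead of 3 halves the codelength term.

/* Hypothetical helper mirroring the bits_res_Q8 / bits_tot_Q8 lines above. */
static opus_int32 rate_bits_Q8_sketch( opus_int32 res_nrg_Q15, opus_int subfr_len, opus_uint8 cl_Q5 )
{
    opus_int32 bits_res_Q8, bits_tot_Q8;
    /* Residual-energy term: Q7 log2 of the energy, referenced to unit
       energy (2^15 in Q15), read directly as a Q8 per-sample bit count. */
    bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( res_nrg_Q15 ) - (15 << 7) );
    /* Codelength term: Q5 -> Q8 would be << 3; << (3-1) halves its weight. */
    bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5, 3-1 );
    return bits_tot_Q8;
}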

View file

@@ -34,66 +34,65 @@
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
# define OVERRIDE_silk_VQ_WMat_EC
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
);
#if defined OPUS_X86_PRESUME_SSE4_1
#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L, arch) \
((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L, arch) \
((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L))
#else
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
);
# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L, arch) \
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L, arch) \
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
subfr_len, max_gain_Q7, L))
#endif
#endif
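As a usage sketch (illustrative, not part of the commit; the call site and variable declarations are assumed), a caller obtains its architecture index once from the existing opus_select_arch() RTCD helper and passes it through; with RTCD enabled the macro above resolves through the per-arch table, so that once the SSE4.1 version is re-enabled, entries 0-2 run silk_VQ_WMat_EC_c and entries 3-4 may run silk_VQ_WMat_EC_sse4_1.

/* Hypothetical call site, assuming the in/out variables are declared as in
   the prototype above. */
int arch = opus_select_arch();  /* runtime CPU detection */
silk_VQ_WMat_EC( &ind, &res_nrg_Q15, &rate_dist_Q8, &gain_Q7,
                 XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5,
                 subfr_len, max_gain_Q7, L, arch );
/* With RTCD this expands to
   (*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])( ... );  */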
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
# define OVERRIDE_silk_NSQ
void silk_NSQ_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -116,11 +115,11 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -143,11 +142,11 @@ void silk_NSQ_del_dec_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -159,9 +158,9 @@ void silk_NSQ_del_dec_sse4_1(
#if defined OPUS_X86_PRESUME_SSE4_1
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#else
@@ -170,11 +169,11 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -184,13 +183,12 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#endif
#endif
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */

View file

@@ -41,16 +41,16 @@
#include "fixed/main_FIX.h"
opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
) = {
silk_inner_prod16_aligned_64_c, /* non-sse */
silk_inner_prod16_aligned_64_c,
silk_inner_prod16_aligned_64_c,
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */
silk_inner_prod16_c, /* non-sse */
silk_inner_prod16_c,
silk_inner_prod16_c,
MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */
};
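For clarity, the renamed table is indexed like the other RTCD tables in this file; a sketch of the dispatch (illustrative, not part of the commit; the macro itself lives in the matching x86 header) is:

/* Hypothetical expansion of silk_inner_prod16( inVec1, inVec2, len, arch )
   when RTCD is enabled -- same pattern as SILK_VQ_WMAT_EC_IMPL above. */
opus_int64 sum = (*SILK_INNER_PROD16_IMPL[ (arch) & OPUS_ARCHMASK ])( inVec1, inVec2, len );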
#endif
@@ -66,16 +66,15 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */
};
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */
};
#endif
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int32 *res_nrg_Q15, /* O best residual energy */
opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int32 *XX_Q17, /* I correlation matrix */
const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
const opus_int L /* I number of vectors in codebook */
) = {
silk_VQ_WMat_EC_c, /* non-sse */
silk_VQ_WMat_EC_c,
@@ -112,18 +110,16 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */
};
#endif
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
@@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
};
#endif
#if defined(FIXED_POINT)