mirror of
https://github.com/xiph/opus.git
synced 2025-05-29 14:49:14 +00:00
Update and re-enable SILK SSE4.1 optimisations
This commit is contained in:
parent
37aba6e9b3
commit
c6f9857771
17 changed files with 645 additions and 634 deletions
|
@ -173,9 +173,9 @@ void silk_NSQ_c
|
|||
RESTORE_STACK;
|
||||
}
|
||||
|
||||
/***********************************/
|
||||
/******************************/
|
||||
/* silk_noise_shape_quantizer */
|
||||
/***********************************/
|
||||
/******************************/
|
||||
|
||||
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
|
||||
static OPUS_INLINE
|
||||
|
|
|
@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
|
|||
const opus_int len /* I vector lengths */
|
||||
);
|
||||
|
||||
opus_int64 silk_inner_prod16_aligned_64_c(
|
||||
opus_int64 silk_inner_prod16_c(
|
||||
const opus_int16 *inVec1, /* I input vector 1 */
|
||||
const opus_int16 *inVec2, /* I input vector 2 */
|
||||
const opus_int len /* I vector lengths */
|
||||
|
@ -613,8 +613,8 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
|
|||
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
|
||||
((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
|
||||
|
||||
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
|
||||
((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
|
||||
#define silk_inner_prod16(inVec1, inVec2, len, arch) \
|
||||
((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
|
||||
#endif
|
||||
|
||||
#include "Inlines.h"
|
||||
|
|
|
@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
|
|||
*rate_dist_Q8 = silk_int32_MAX;
|
||||
*res_nrg_Q15 = silk_int32_MAX;
|
||||
cb_row_Q7 = cb_Q7;
|
||||
/* In things go really bad, at least *ind is set to something safe. */
|
||||
/* If things go really bad, at least *ind is set to something safe. */
|
||||
*ind = 0;
|
||||
for( k = 0; k < L; k++ ) {
|
||||
opus_int32 penalty;
|
||||
|
@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
|
|||
if( sum1_Q15 >= 0 ) {
|
||||
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
|
||||
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
|
||||
/* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
|
||||
/* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
|
||||
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
|
||||
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
|
||||
*rate_dist_Q8 = bits_tot_Q8;
|
||||
|
|
|
@ -68,7 +68,7 @@ void silk_burg_modified_c(
|
|||
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
|
||||
|
||||
/* Compute autocorrelations, added over subframes */
|
||||
C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
|
||||
C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
|
||||
lz = silk_CLZ64(C0_64);
|
||||
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
|
||||
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
|
||||
|
@ -87,7 +87,7 @@ void silk_burg_modified_c(
|
|||
x_ptr = x + s * subfr_length;
|
||||
for( n = 1; n < D + 1; n++ ) {
|
||||
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
|
||||
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
|
||||
silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -150,7 +150,7 @@ void silk_burg_modified_c(
|
|||
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
|
||||
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
|
||||
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
|
||||
/* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
|
||||
/* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
|
||||
but they cancel each other and the real result seems to always fit in a 32-bit
|
||||
signed integer. This was determined experimentally, not theoretically (unfortunately). */
|
||||
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
|
||||
|
@ -253,7 +253,7 @@ void silk_burg_modified_c(
|
|||
if( rshifts > 0 ) {
|
||||
for( s = 0; s < nb_subfr; s++ ) {
|
||||
x_ptr = x + s * subfr_length;
|
||||
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
|
||||
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
|
||||
}
|
||||
} else {
|
||||
for( s = 0; s < nb_subfr; s++ ) {
|
||||
|
|
|
@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
|
|||
#endif
|
||||
}
|
||||
|
||||
opus_int64 silk_inner_prod16_aligned_64_c(
|
||||
opus_int64 silk_inner_prod16_c(
|
||||
const opus_int16 *inVec1, /* I input vector 1 */
|
||||
const opus_int16 *inVec2, /* I input vector 2 */
|
||||
const opus_int len /* I vector lengths */
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/* Copyright (c) 2014, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang
|
||||
/* Copyright (c) 2014-2020, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -42,7 +42,7 @@
|
|||
#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
|
||||
|
||||
#define QA 25
|
||||
#define N_BITS_HEAD_ROOM 2
|
||||
#define N_BITS_HEAD_ROOM 3
|
||||
#define MIN_RSHIFTS -16
|
||||
#define MAX_RSHIFTS (32 - QA)
|
||||
|
||||
|
@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
|
|||
int arch /* I Run-time architecture */
|
||||
)
|
||||
{
|
||||
opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
|
||||
opus_int k, n, s, lz, rshifts, reached_max_gain;
|
||||
opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
|
||||
const opus_int16 *x_ptr;
|
||||
opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
|
||||
|
@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
|
|||
opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
|
||||
opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
|
||||
opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
|
||||
opus_int64 C0_64;
|
||||
|
||||
__m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
|
||||
__m128i CONST1 = _mm_set1_epi32(1);
|
||||
|
@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
|
|||
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
|
||||
|
||||
/* Compute autocorrelations, added over subframes */
|
||||
silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
|
||||
if( rshifts > MAX_RSHIFTS ) {
|
||||
C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
|
||||
silk_assert( C0 > 0 );
|
||||
rshifts = MAX_RSHIFTS;
|
||||
C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
|
||||
lz = silk_CLZ64(C0_64);
|
||||
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
|
||||
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
|
||||
if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
|
||||
|
||||
if (rshifts > 0) {
|
||||
C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
|
||||
} else {
|
||||
lz = silk_CLZ32( C0 ) - 1;
|
||||
rshifts_extra = N_BITS_HEAD_ROOM - lz;
|
||||
if( rshifts_extra > 0 ) {
|
||||
rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
|
||||
C0 = silk_RSHIFT32( C0, rshifts_extra );
|
||||
} else {
|
||||
rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
|
||||
C0 = silk_LSHIFT32( C0, -rshifts_extra );
|
||||
}
|
||||
rshifts += rshifts_extra;
|
||||
C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
|
||||
}
|
||||
|
||||
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
|
||||
silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
|
||||
if( rshifts > 0 ) {
|
||||
|
@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
|
|||
x_ptr = x + s * subfr_length;
|
||||
for( n = 1; n < D + 1; n++ ) {
|
||||
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
|
||||
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
|
||||
silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
|
|||
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
|
||||
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
|
||||
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
|
||||
tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
|
||||
tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
|
||||
/* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
|
||||
but they cancel each other and the real result seems to always fit in a 32-bit
|
||||
signed integer. This was determined experimentally, not theoretically (unfortunately). */
|
||||
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
|
||||
tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
|
||||
}
|
||||
|
||||
tmp1 = -tmp1; /* Q17 */
|
||||
|
@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
|
|||
if( rshifts > 0 ) {
|
||||
for( s = 0; s < nb_subfr; s++ ) {
|
||||
x_ptr = x + s * subfr_length;
|
||||
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
|
||||
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
|
||||
}
|
||||
} else {
|
||||
for( s = 0; s < nb_subfr; s++ ) {
|
||||
|
@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
|
|||
*res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
|
||||
*res_nrg_Q = -rshifts;
|
||||
}
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
{
|
||||
opus_int32 res_nrg_c = 0;
|
||||
opus_int res_nrg_Q_c = 0;
|
||||
opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
|
||||
|
||||
silk_burg_modified_c(
|
||||
&res_nrg_c,
|
||||
&res_nrg_Q_c,
|
||||
A_Q16_c,
|
||||
x,
|
||||
minInvGain_Q30,
|
||||
subfr_length,
|
||||
nb_subfr,
|
||||
D,
|
||||
0
|
||||
);
|
||||
|
||||
silk_assert( *res_nrg == res_nrg_c );
|
||||
silk_assert( *res_nrg_Q == res_nrg_Q_c );
|
||||
silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -1,160 +0,0 @@
|
|||
/* Copyright (c) 2014, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include <smmintrin.h>
|
||||
#include "main.h"
|
||||
#include "celt/x86/x86cpu.h"
|
||||
|
||||
void silk_warped_LPC_analysis_filter_FIX_sse4_1(
|
||||
opus_int32 state[], /* I/O State [order + 1] */
|
||||
opus_int32 res_Q2[], /* O Residual signal [length] */
|
||||
const opus_int16 coef_Q13[], /* I Coefficients [order] */
|
||||
const opus_int16 input[], /* I Input signal [length] */
|
||||
const opus_int16 lambda_Q16, /* I Warping factor */
|
||||
const opus_int length, /* I Length of input signal */
|
||||
const opus_int order /* I Filter order (even) */
|
||||
)
|
||||
{
|
||||
opus_int n, i;
|
||||
opus_int32 acc_Q11, tmp1, tmp2;
|
||||
|
||||
/* Order must be even */
|
||||
celt_assert( ( order & 1 ) == 0 );
|
||||
|
||||
if (order == 10)
|
||||
{
|
||||
if (0 == lambda_Q16)
|
||||
{
|
||||
__m128i coef_Q13_3210, coef_Q13_7654;
|
||||
__m128i coef_Q13_0123, coef_Q13_4567;
|
||||
__m128i state_0123, state_4567;
|
||||
__m128i xmm_product1, xmm_product2;
|
||||
__m128i xmm_tempa, xmm_tempb;
|
||||
|
||||
register opus_int32 sum;
|
||||
register opus_int32 state_8, state_9, state_a;
|
||||
register opus_int64 coef_Q13_8, coef_Q13_9;
|
||||
|
||||
celt_assert( length > 0 );
|
||||
|
||||
coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
|
||||
coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
|
||||
|
||||
coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
|
||||
coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
|
||||
coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
|
||||
|
||||
state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
|
||||
state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
|
||||
|
||||
state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
|
||||
state_8 = state[ 8 ];
|
||||
state_9 = state[ 9 ];
|
||||
state_a = 0;
|
||||
|
||||
for( n = 0; n < length; n++ )
|
||||
{
|
||||
xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
|
||||
xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
|
||||
|
||||
xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
|
||||
|
||||
xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
|
||||
xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
|
||||
|
||||
xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
|
||||
xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
|
||||
|
||||
xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
|
||||
xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
|
||||
|
||||
xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
|
||||
xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
|
||||
xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
|
||||
|
||||
sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
|
||||
sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
|
||||
|
||||
xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
|
||||
sum += _mm_cvtsi128_si32( xmm_tempa);
|
||||
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
|
||||
|
||||
/* move right */
|
||||
state_a = state_9;
|
||||
state_9 = state_8;
|
||||
state_8 = _mm_cvtsi128_si32( state_4567 );
|
||||
state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
|
||||
|
||||
state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
|
||||
}
|
||||
|
||||
_mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
|
||||
_mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
|
||||
state[ 8 ] = state_8;
|
||||
state[ 9 ] = state_9;
|
||||
state[ 10 ] = state_a;
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for( n = 0; n < length; n++ ) {
|
||||
/* Output of lowpass section */
|
||||
tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
|
||||
state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
|
||||
/* Output of allpass section */
|
||||
tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
|
||||
state[ 1 ] = tmp2;
|
||||
acc_Q11 = silk_RSHIFT( order, 1 );
|
||||
acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
|
||||
/* Loop over allpass sections */
|
||||
for( i = 2; i < order; i += 2 ) {
|
||||
/* Output of allpass section */
|
||||
tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
|
||||
state[ i ] = tmp1;
|
||||
acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
|
||||
/* Output of allpass section */
|
||||
tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
|
||||
state[ i + 1 ] = tmp2;
|
||||
acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
|
||||
}
|
||||
state[ order ] = tmp1;
|
||||
acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
|
||||
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
|
||||
}
|
||||
}
|
|
@ -37,39 +37,36 @@
|
|||
#include "SigProc_FIX.h"
|
||||
#include "pitch.h"
|
||||
|
||||
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
|
||||
opus_int64 silk_inner_prod16_sse4_1(
|
||||
const opus_int16 *inVec1, /* I input vector 1 */
|
||||
const opus_int16 *inVec2, /* I input vector 2 */
|
||||
const opus_int len /* I vector lengths */
|
||||
)
|
||||
{
|
||||
opus_int i, dataSize8;
|
||||
opus_int i, dataSize4;
|
||||
opus_int64 sum;
|
||||
|
||||
__m128i xmm_tempa;
|
||||
__m128i inVec1_76543210, acc1;
|
||||
__m128i inVec2_76543210, acc2;
|
||||
__m128i xmm_prod_20, xmm_prod_31;
|
||||
__m128i inVec1_3210, acc1;
|
||||
__m128i inVec2_3210, acc2;
|
||||
|
||||
sum = 0;
|
||||
dataSize8 = len & ~7;
|
||||
dataSize4 = len & ~3;
|
||||
|
||||
acc1 = _mm_setzero_si128();
|
||||
acc2 = _mm_setzero_si128();
|
||||
|
||||
for( i = 0; i < dataSize8; i += 8 ) {
|
||||
inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
|
||||
inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
|
||||
for( i = 0; i < dataSize4; i += 4 ) {
|
||||
inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
|
||||
inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
|
||||
xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
|
||||
|
||||
/* only when all 4 operands are -32768 (0x8000), this results in wrap around */
|
||||
inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
|
||||
inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
|
||||
|
||||
xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
|
||||
/* equal shift right 8 bytes */
|
||||
inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
|
||||
inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
|
||||
|
||||
acc1 = _mm_add_epi64( acc1, xmm_tempa );
|
||||
acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
|
||||
acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
|
||||
acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
|
||||
}
|
||||
|
||||
acc1 = _mm_add_epi64( acc1, acc2 );
|
||||
|
@ -84,5 +81,12 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
|
|||
sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
|
||||
}
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
{
|
||||
opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
|
||||
silk_assert( sum == sum_c );
|
||||
}
|
||||
#endif
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/* Copyright (c) 2014, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang
|
||||
/* Copyright (c) 2014-2020, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -46,6 +46,7 @@ typedef struct {
|
|||
opus_int32 Shape_Q14[ DECISION_DELAY ];
|
||||
opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
|
||||
opus_int32 LF_AR_Q14;
|
||||
opus_int32 Diff_Q14;
|
||||
opus_int32 Seed;
|
||||
opus_int32 SeedInit;
|
||||
opus_int32 RD_Q10;
|
||||
|
@ -56,6 +57,7 @@ typedef struct {
|
|||
opus_int32 RD_Q10;
|
||||
opus_int32 xq_Q14;
|
||||
opus_int32 LF_AR_Q14;
|
||||
opus_int32 Diff_Q14;
|
||||
opus_int32 sLTP_shp_Q14;
|
||||
opus_int32 LPC_exc_Q14;
|
||||
} NSQ_sample_struct;
|
||||
|
@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
|
||||
const opus_int32 x_Q3[], /* I Input in Q3 */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
|
||||
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
|
||||
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
|
||||
|
@ -115,11 +117,11 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
VARDECL( opus_int32, delayedGain_Q10 );
|
||||
VARDECL( NSQ_del_dec_struct, psDelDec );
|
||||
NSQ_del_dec_struct *psDD;
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
silk_nsq_state NSQ_c;
|
||||
SideInfoIndices psIndices_c;
|
||||
opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
|
||||
const opus_int8 *const pulses_a = pulses;
|
||||
#endif
|
||||
SAVE_STACK;
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
( void )pulses_a;
|
||||
silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
|
||||
silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
|
||||
silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
|
||||
silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
|
||||
silk_NSQ_del_dec_c(
|
||||
psEncC,
|
||||
&NSQ_c,
|
||||
&psIndices_c,
|
||||
x16,
|
||||
pulses_c,
|
||||
PredCoef_Q12,
|
||||
LTPCoef_Q14,
|
||||
AR_Q13,
|
||||
HarmShapeGain_Q14,
|
||||
Tilt_Q14,
|
||||
LF_shp_Q14,
|
||||
Gains_Q16,
|
||||
pitchL,
|
||||
Lambda_Q10,
|
||||
LTP_scale_Q14
|
||||
);
|
||||
#endif
|
||||
|
||||
/* Set unvoiced lag to the previous one, overwrite later for voiced */
|
||||
lag = NSQ->lagPrev;
|
||||
|
||||
|
@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
psDD->SeedInit = psDD->Seed;
|
||||
psDD->RD_Q10 = 0;
|
||||
psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
|
||||
psDD->Diff_Q14 = NSQ->sDiff_shp_Q14;
|
||||
psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
|
||||
silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
|
||||
silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
|
||||
|
@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
LSF_interpolation_flag = 1;
|
||||
}
|
||||
|
||||
ALLOC( sLTP_Q15,
|
||||
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
|
||||
ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
|
||||
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
|
||||
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
|
||||
ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
|
||||
|
@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
for( k = 0; k < psEncC->nb_subfr; k++ ) {
|
||||
A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
|
||||
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
|
||||
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
|
||||
AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
|
||||
|
||||
/* Noise shape parameters */
|
||||
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
|
||||
|
@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
}
|
||||
}
|
||||
|
||||
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
|
||||
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
|
||||
psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
|
||||
|
||||
silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
|
||||
|
@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
|
||||
psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
|
||||
|
||||
x_Q3 += psEncC->subfr_length;
|
||||
x16 += psEncC->subfr_length;
|
||||
pulses += psEncC->subfr_length;
|
||||
pxq += psEncC->subfr_length;
|
||||
}
|
||||
|
@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
for( i = 0; i < decisionDelay; i++ ) {
|
||||
last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
|
||||
if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
|
||||
|
||||
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
|
||||
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
|
||||
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
|
||||
|
@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
|
||||
/* Update states */
|
||||
NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
|
||||
NSQ->sDiff_shp_Q14 = psDD->Diff_Q14;
|
||||
NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
|
||||
|
||||
/* Save quantized speech signal */
|
||||
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
|
||||
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
|
||||
silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
|
||||
silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
|
||||
#endif
|
||||
|
||||
RESTORE_STACK;
|
||||
}
|
||||
|
||||
|
@ -345,6 +387,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
|
||||
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
|
||||
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
|
||||
|
||||
VARDECL( NSQ_sample_pair, psSampleState );
|
||||
NSQ_del_dec_struct *psDD;
|
||||
NSQ_sample_struct *psSS;
|
||||
|
@ -356,6 +399,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
celt_assert( nStatesDelayedDecision > 0 );
|
||||
ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
|
||||
|
||||
int rdo_offset = (Lambda_Q10 >> 1) - 512;
|
||||
|
||||
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
|
||||
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
|
||||
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
|
||||
|
@ -407,7 +452,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
/* Long-term shaping */
|
||||
if( lag > 0 ) {
|
||||
/* Symmetric, packed FIR coefficients */
|
||||
n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
|
||||
n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
|
||||
n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
|
||||
n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
|
||||
shp_lag_ptr++;
|
||||
|
@ -478,7 +523,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
|
||||
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
|
||||
|
||||
/* setp 4 */
|
||||
/* step 4 */
|
||||
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
|
||||
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
|
||||
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
|
||||
|
@ -511,9 +556,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
|
||||
|
||||
/* Noise shape feedback */
|
||||
silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
|
||||
celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
|
||||
/* Output of lowpass section */
|
||||
tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
|
||||
tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
|
||||
/* Output of allpass section */
|
||||
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
|
||||
psDD->sAR2_Q14[ 0 ] = tmp2;
|
||||
|
@ -543,9 +588,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
|
||||
/* Input minus prediction plus noise feedback */
|
||||
/* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
|
||||
tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
|
||||
tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
|
||||
tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
|
||||
tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
|
||||
tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */
|
||||
tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
|
||||
|
||||
r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
|
||||
|
@ -559,6 +604,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
/* Find two quantization level candidates and measure their rate-distortion */
|
||||
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
|
||||
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
|
||||
if (Lambda_Q10 > 2048) {
|
||||
/* For aggressive RDO, the bias becomes more than one pulse. */
|
||||
if (q1_Q10 > rdo_offset) {
|
||||
q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
|
||||
} else if (q1_Q10 < -rdo_offset) {
|
||||
q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
|
||||
} else if (q1_Q10 < 0) {
|
||||
q1_Q0 = -1;
|
||||
} else {
|
||||
q1_Q0 = 0;
|
||||
}
|
||||
}
|
||||
if( q1_Q0 > 0 ) {
|
||||
q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
|
||||
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
|
||||
|
@ -612,8 +669,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
|
||||
|
||||
/* Update states */
|
||||
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
|
||||
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
|
||||
psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
|
||||
sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
|
||||
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
|
||||
psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
|
||||
psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
|
||||
psSS[ 0 ].xq_Q14 = xq_Q14;
|
||||
|
@ -626,14 +684,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
exc_Q14 = -exc_Q14;
|
||||
}
|
||||
|
||||
|
||||
/* Add predictions */
|
||||
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
|
||||
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
|
||||
|
||||
/* Update states */
|
||||
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
|
||||
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
|
||||
psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
|
||||
sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
|
||||
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
|
||||
psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
|
||||
psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
|
||||
psSS[ 1 ].xq_Q14 = xq_Q14;
|
||||
|
@ -705,6 +763,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
|
|||
psDD = &psDelDec[ k ];
|
||||
psSS = &psSampleState[ k ][ 0 ];
|
||||
psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
|
||||
psDD->Diff_Q14 = psSS->Diff_Q14;
|
||||
psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
|
||||
psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
|
||||
psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
|
||||
|
@ -728,7 +787,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
|
||||
const opus_int32 x_Q3[], /* I Input in Q3 */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
|
||||
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
|
||||
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
|
||||
|
@ -742,51 +801,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
|
|||
)
|
||||
{
|
||||
opus_int i, k, lag;
|
||||
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
|
||||
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
|
||||
NSQ_del_dec_struct *psDD;
|
||||
__m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
|
||||
__m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
|
||||
|
||||
lag = pitchL[ subfr ];
|
||||
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
|
||||
|
||||
silk_assert( inv_gain_Q31 != 0 );
|
||||
|
||||
/* Calculate gain adjustment factor */
|
||||
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
|
||||
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
|
||||
} else {
|
||||
gain_adj_Q16 = (opus_int32)1 << 16;
|
||||
}
|
||||
|
||||
/* Scale input */
|
||||
inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
|
||||
inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
|
||||
|
||||
/* prepare inv_gain_Q23 in packed 4 32-bits */
|
||||
xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
|
||||
/* broadcast inv_gain_Q26 into all four packed 32-bit lanes */
|
||||
xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
|
||||
|
||||
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
|
||||
xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
|
||||
xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
|
||||
|
||||
/* rotate right by one 32-bit lane (equivalent to a 4-byte right shift for the lanes used below) */
|
||||
xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
|
||||
xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
|
||||
xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
|
||||
xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
|
||||
xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
|
||||
|
||||
xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
|
||||
xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
|
||||
xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
|
||||
xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
|
||||
|
||||
xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
|
||||
xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
|
||||
|
||||
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
|
||||
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
|
||||
}
|
||||
|
||||
for( ; i < psEncC->subfr_length; i++ ) {
|
||||
x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
|
||||
x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
|
||||
}
|
||||
|
||||
/* Save inverse gain */
|
||||
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
|
||||
|
||||
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
|
||||
if( NSQ->rewhite_flag ) {
|
||||
if( subfr == 0 ) {
|
||||
|
@ -800,7 +849,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
|
|||
}
|
||||
|
||||
/* Adjust for changing gain */
|
||||
if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
|
||||
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
|
||||
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
|
||||
|
||||
/* Scale long-term shaping state */
|
||||
{
|
||||
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
|
||||
|
@ -841,6 +892,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
|
|||
|
||||
/* Scale scalar states */
|
||||
psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
|
||||
psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
|
||||
|
||||
/* Scale short-term prediction and shaping states */
|
||||
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
|
||||
|
@ -855,5 +907,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Save the current gain so the next call can detect a gain change */
|
||||
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/* Copyright (c) 2014, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang
|
||||
/* Copyright (c) 2014-2020, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -39,7 +39,7 @@
|
|||
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
|
||||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
const opus_int32 x_Q3[], /* I input in Q3 */
|
||||
const opus_int16 x16[], /* I input */
|
||||
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
|
||||
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
|
||||
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
|
||||
|
@ -65,6 +65,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
opus_int Tilt_Q14, /* I Spectral tilt */
|
||||
opus_int32 LF_shp_Q14, /* I */
|
||||
opus_int32 Gain_Q16, /* I */
|
||||
opus_int Lambda_Q10, /* I */
|
||||
opus_int offset_Q10, /* I */
|
||||
opus_int length, /* I Input length */
|
||||
opus_int32 table[][4] /* I */
|
||||
|
@ -74,11 +75,11 @@ void silk_NSQ_sse4_1(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
|
|||
opus_int32 tmp1;
|
||||
opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
silk_nsq_state NSQ_c;
|
||||
SideInfoIndices psIndices_c;
|
||||
opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
|
||||
const opus_int8 *const pulses_a = pulses;
|
||||
#endif
|
||||
|
||||
SAVE_STACK;
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
( void )pulses_a;
|
||||
silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
|
||||
silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
|
||||
silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
|
||||
silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
|
||||
|
||||
silk_NSQ_c(
|
||||
psEncC,
|
||||
&NSQ_c,
|
||||
&psIndices_c,
|
||||
x16,
|
||||
pulses_c,
|
||||
PredCoef_Q12,
|
||||
LTPCoef_Q14,
|
||||
AR_Q13,
|
||||
HarmShapeGain_Q14,
|
||||
Tilt_Q14,
|
||||
LF_shp_Q14,
|
||||
Gains_Q16,
|
||||
pitchL,
|
||||
Lambda_Q10,
|
||||
LTP_scale_Q14
|
||||
);
|
||||
#endif
|
||||
|
||||
NSQ->rand_seed = psIndices->Seed;
|
||||
|
||||
/* Set unvoiced lag to the previous one, overwrite later for voiced */
|
||||
|
@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
|
|||
LSF_interpolation_flag = 1;
|
||||
}
|
||||
|
||||
ALLOC( sLTP_Q15,
|
||||
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
|
||||
ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
|
||||
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
|
||||
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
|
||||
/* Set up pointers to start of sub frame */
|
||||
|
@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
|
|||
for( k = 0; k < psEncC->nb_subfr; k++ ) {
|
||||
A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
|
||||
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
|
||||
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
|
||||
AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
|
||||
|
||||
/* Noise shape parameters */
|
||||
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
|
||||
|
@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
|
|||
}
|
||||
}
|
||||
|
||||
silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
|
||||
silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
|
||||
|
||||
if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
|
||||
{
|
||||
silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
|
||||
AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
|
||||
AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
|
||||
offset_Q10, psEncC->subfr_length, &(table[32]) );
|
||||
}
|
||||
else
|
||||
|
@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
|
|||
offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
|
||||
}
|
||||
|
||||
x_Q3 += psEncC->subfr_length;
|
||||
x16 += psEncC->subfr_length;
|
||||
pulses += psEncC->subfr_length;
|
||||
pxq += psEncC->subfr_length;
|
||||
}
|
||||
|
@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
|
|||
/* Save quantized speech and noise shaping signals */
|
||||
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
|
||||
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
|
||||
silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
|
||||
silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
|
||||
#endif
|
||||
|
||||
RESTORE_STACK;
|
||||
}
|
||||
|
||||
/***********************************/
|
||||
/************************************/
|
||||
/* silk_noise_shape_quantizer_10_16 */
|
||||
/***********************************/
|
||||
/************************************/
|
||||
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
opus_int signalType, /* I Signal type */
|
||||
|
@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
opus_int Tilt_Q14, /* I Spectral tilt */
|
||||
opus_int32 LF_shp_Q14, /* I */
|
||||
opus_int32 Gain_Q16, /* I */
|
||||
opus_int Lambda_Q10, /* I */
|
||||
opus_int offset_Q10, /* I */
|
||||
opus_int length, /* I Input length */
|
||||
opus_int32 table[][4] /* I */
|
||||
|
@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
opus_int i;
|
||||
opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
|
||||
opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
|
||||
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
|
||||
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
|
||||
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
|
||||
opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
|
||||
|
||||
|
@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
__m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
|
||||
__m128i AR_shp_Q13_76543210;
|
||||
|
||||
int rdo_offset = (Lambda_Q10 >> 1) - 512;
|
||||
|
||||
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
|
||||
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
|
||||
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
|
||||
|
@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
|
||||
sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
|
||||
xq_Q14 = psLPC_Q14[ 0 ];
|
||||
sDiff_shp_Q14 = NSQ->sDiff_shp_Q14;
|
||||
LTP_pred_Q13 = 0;
|
||||
|
||||
/* load a_Q12 */
|
||||
|
@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
|
||||
sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
|
||||
|
||||
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
|
||||
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
|
||||
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
|
||||
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 );
|
||||
|
||||
/* high part, use pmaddwd, results in 4 32-bit */
|
||||
xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
|
||||
|
@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
|
||||
n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
|
||||
|
||||
silk_assert( lag > 0 || signalType != TYPE_VOICED );
|
||||
celt_assert( lag > 0 || signalType != TYPE_VOICED );
|
||||
|
||||
/* Combine prediction and noise shaping signals */
|
||||
tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
|
||||
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
|
||||
if( lag > 0 ) {
|
||||
/* Symmetric, packed FIR coefficients */
|
||||
n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
|
||||
n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
|
||||
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
|
||||
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
|
||||
shp_lag_ptr++;
|
||||
|
@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
/* Find two quantization level candidates and measure their rate-distortion */
|
||||
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
|
||||
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
|
||||
if (Lambda_Q10 > 2048) {
|
||||
/* For aggressive RDO, the bias becomes more than one pulse. */
|
||||
if (q1_Q10 > rdo_offset) {
|
||||
q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
|
||||
} else if (q1_Q10 < -rdo_offset) {
|
||||
q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
|
||||
} else if (q1_Q10 < 0) {
|
||||
q1_Q0 = -1;
|
||||
} else {
|
||||
q1_Q0 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
q1_Q10 = table[q1_Q0][0];
|
||||
q2_Q10 = table[q1_Q0][1];
|
||||
|
@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
/* Update states */
|
||||
psLPC_Q14++;
|
||||
*psLPC_Q14 = xq_Q14;
|
||||
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
|
||||
NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
|
||||
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
|
||||
|
||||
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
|
||||
sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
|
||||
|
@ -602,7 +659,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
|
|||
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
|
||||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
const opus_int32 x_Q3[], /* I input in Q3 */
|
||||
const opus_int16 x16[], /* I input */
|
||||
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
|
||||
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
|
||||
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
|
||||
|
@ -614,50 +671,40 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
|
|||
)
|
||||
{
|
||||
opus_int i, lag;
|
||||
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
|
||||
__m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
|
||||
opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
|
||||
__m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
|
||||
|
||||
lag = pitchL[ subfr ];
|
||||
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
|
||||
silk_assert( inv_gain_Q31 != 0 );
|
||||
|
||||
/* Calculate gain adjustment factor */
|
||||
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
|
||||
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
|
||||
} else {
|
||||
gain_adj_Q16 = (opus_int32)1 << 16;
|
||||
}
|
||||
|
||||
/* Scale input */
|
||||
inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
|
||||
inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
|
||||
|
||||
/* prepare inv_gain_Q23 in packed 4 32-bits */
|
||||
xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
|
||||
/* broadcast inv_gain_Q26 into all four packed 32-bit lanes */
|
||||
xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
|
||||
|
||||
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
|
||||
xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
|
||||
xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
|
||||
|
||||
/* rotate right by one 32-bit lane (equivalent to a 4-byte right shift for the lanes used below) */
|
||||
xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
|
||||
xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
|
||||
xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
|
||||
xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
|
||||
xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
|
||||
|
||||
xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
|
||||
xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
|
||||
xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
|
||||
xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
|
||||
|
||||
xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
|
||||
xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
|
||||
|
||||
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
|
||||
_mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
|
||||
}
|
||||
|
||||
for( ; i < psEncC->subfr_length; i++ ) {
|
||||
x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
|
||||
x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
|
||||
}
|
||||
|
||||
/* Save inverse gain */
|
||||
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
|
||||
|
||||
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
|
||||
if( NSQ->rewhite_flag ) {
|
||||
if( subfr == 0 ) {
|
||||
|
@ -671,7 +718,9 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
|
|||
}
|
||||
|
||||
/* Adjust for changing gain */
|
||||
if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
|
||||
if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
|
||||
gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
|
||||
|
||||
/* Scale long-term shaping state */
|
||||
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
|
||||
|
||||
|
@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
|
|||
}
|
||||
|
||||
NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
|
||||
NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
|
||||
|
||||
/* Scale short-term prediction and shaping states */
|
||||
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
|
||||
|
@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
|
|||
for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
|
||||
NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
|
||||
}
|
||||
|
||||
/* Save the current gain so the next call can detect a gain change */
|
||||
NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,7 +67,7 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
|
|||
|
||||
#endif
|
||||
|
||||
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
|
||||
opus_int64 silk_inner_prod16_sse4_1(
|
||||
const opus_int16 *inVec1,
|
||||
const opus_int16 *inVec2,
|
||||
const opus_int len
|
||||
|
@ -76,18 +76,18 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
|
|||
|
||||
#if defined(OPUS_X86_PRESUME_SSE4_1)
|
||||
|
||||
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
|
||||
((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
|
||||
#define silk_inner_prod16(inVec1, inVec2, len, arch) \
|
||||
((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
|
||||
|
||||
#else
|
||||
|
||||
extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
|
||||
extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
|
||||
const opus_int16 *inVec1,
|
||||
const opus_int16 *inVec2,
|
||||
const opus_int len);
|
||||
|
||||
# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
|
||||
((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
|
||||
# define silk_inner_prod16(inVec1, inVec2, len, arch) \
|
||||
((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/* Copyright (c) 2014, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang
|
||||
/* Copyright (c) 2014-2020, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
|
|||
|
||||
SAVE_STACK;
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
silk_encoder_state psEncC_c;
|
||||
opus_int ret_c;
|
||||
|
||||
silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
|
||||
ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
|
||||
#endif
|
||||
|
||||
/* Safety checks */
|
||||
silk_assert( VAD_N_BANDS == 4 );
|
||||
celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
|
||||
|
@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
|
|||
speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
|
||||
}
|
||||
|
||||
if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
|
||||
speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
|
||||
}
|
||||
/* Power scaling */
|
||||
if( speech_nrg <= 0 ) {
|
||||
SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
|
||||
} else if( speech_nrg < 32768 ) {
|
||||
if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
|
||||
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
|
||||
} else {
|
||||
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
|
||||
}
|
||||
} else if( speech_nrg < 16384 ) {
|
||||
speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
|
||||
|
||||
/* square-root */
|
||||
speech_nrg = silk_SQRT_APPROX( speech_nrg );
|
||||
|
@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
|
|||
psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
|
||||
}
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
silk_assert( ret == ret_c );
|
||||
silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
|
||||
#endif
|
||||
|
||||
RESTORE_STACK;
|
||||
return( ret );
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/* Copyright (c) 2014, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang
|
||||
/* Copyright (c) 2014-2020, Cisco Systems, INC
|
||||
Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -38,105 +38,136 @@
|
|||
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
|
||||
void silk_VQ_WMat_EC_sse4_1(
|
||||
opus_int8 *ind, /* O index of best codebook vector */
|
||||
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
|
||||
opus_int32 *res_nrg_Q15, /* O best residual energy */
|
||||
opus_int32 *rate_dist_Q8, /* O best total bitrate */
|
||||
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
|
||||
const opus_int16 *in_Q14, /* I input vector to be quantized */
|
||||
const opus_int32 *W_Q18, /* I weighting matrix */
|
||||
const opus_int32 *XX_Q17, /* I correlation matrix */
|
||||
const opus_int32 *xX_Q17, /* I correlation vector */
|
||||
const opus_int8 *cb_Q7, /* I codebook */
|
||||
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
|
||||
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
|
||||
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
|
||||
const opus_int subfr_len, /* I number of samples per subframe */
|
||||
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
|
||||
opus_int L /* I number of vectors in codebook */
|
||||
const opus_int L /* I number of vectors in codebook */
|
||||
)
|
||||
{
|
||||
opus_int k, gain_tmp_Q7;
|
||||
const opus_int8 *cb_row_Q7;
|
||||
opus_int16 diff_Q14[ 5 ];
|
||||
opus_int32 sum1_Q14, sum2_Q16;
|
||||
opus_int32 neg_xX_Q24[ 5 ];
|
||||
opus_int32 sum1_Q15, sum2_Q24;
|
||||
opus_int32 bits_res_Q8, bits_tot_Q8;
|
||||
__m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
|
||||
|
||||
/* Negate and convert to new Q domain */
|
||||
neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
|
||||
neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
|
||||
neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
|
||||
neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
|
||||
neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
|
||||
|
||||
v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
|
||||
v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
|
||||
__m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
|
||||
/* Loop over codebook */
|
||||
*rate_dist_Q14 = silk_int32_MAX;
|
||||
*rate_dist_Q8 = silk_int32_MAX;
|
||||
*res_nrg_Q15 = silk_int32_MAX;
|
||||
cb_row_Q7 = cb_Q7;
|
||||
/* If things go really bad, at least *ind is set to something safe. */
|
||||
*ind = 0;
|
||||
for( k = 0; k < L; k++ ) {
|
||||
opus_int32 penalty;
|
||||
gain_tmp_Q7 = cb_gain_Q7[k];
|
||||
|
||||
diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
|
||||
|
||||
C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
|
||||
C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
|
||||
C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
|
||||
C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
|
||||
|
||||
diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
|
||||
diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
|
||||
diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
|
||||
diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
|
||||
|
||||
/* Weighted rate */
|
||||
sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
|
||||
/* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
|
||||
sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
|
||||
|
||||
/* Penalty for too large gain */
|
||||
sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
|
||||
penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
|
||||
|
||||
silk_assert( sum1_Q14 >= 0 );
|
||||
/* first row of XX_Q17 */
|
||||
v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
|
||||
v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
|
||||
v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
|
||||
v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
|
||||
v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
|
||||
v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
|
||||
v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
|
||||
sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
|
||||
sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
|
||||
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] );
|
||||
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] );
|
||||
|
||||
/* first row of W_Q18 */
|
||||
C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
|
||||
C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
|
||||
C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
|
||||
/* second row of XX_Q17 */
|
||||
sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] );
|
||||
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] );
|
||||
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] );
|
||||
|
||||
C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
|
||||
C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
|
||||
/* third row of XX_Q17 */
|
||||
sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
|
||||
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
|
||||
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] );
|
||||
|
||||
C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
|
||||
C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
|
||||
/* fourth row of XX_Q17 */
|
||||
sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
|
||||
sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
|
||||
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] );
|
||||
|
||||
C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
|
||||
C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
|
||||
|
||||
C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
|
||||
sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
|
||||
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
|
||||
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
|
||||
|
||||
/* second row of W_Q18 */
|
||||
sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
|
||||
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
|
||||
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
|
||||
|
||||
/* third row of W_Q18 */
|
||||
sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
|
||||
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
|
||||
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
|
||||
|
||||
/* fourth row of W_Q18 */
|
||||
sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
|
||||
sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
|
||||
sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
|
||||
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
|
||||
|
||||
/* last row of W_Q18 */
|
||||
sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
|
||||
sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
|
||||
|
||||
silk_assert( sum1_Q14 >= 0 );
|
||||
/* last row of XX_Q17 */
|
||||
sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
|
||||
sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
|
||||
sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] );
|
||||
|
||||
/* find best */
|
||||
if( sum1_Q14 < *rate_dist_Q14 ) {
|
||||
*rate_dist_Q14 = sum1_Q14;
|
||||
if( sum1_Q15 >= 0 ) {
|
||||
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
|
||||
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
|
||||
/* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
|
||||
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
|
||||
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
|
||||
*rate_dist_Q8 = bits_tot_Q8;
|
||||
*res_nrg_Q15 = sum1_Q15 + penalty;
|
||||
*ind = (opus_int8)k;
|
||||
*gain_Q7 = gain_tmp_Q7;
|
||||
}
|
||||
}
|
||||
|
||||
/* Go to next cbk vector */
|
||||
cb_row_Q7 += LTP_ORDER;
|
||||
}
|
||||
|
||||
#ifdef OPUS_CHECK_ASM
|
||||
{
|
||||
opus_int8 ind_c = 0;
|
||||
opus_int32 res_nrg_Q15_c = 0;
|
||||
opus_int32 rate_dist_Q8_c = 0;
|
||||
opus_int gain_Q7_c = 0;
|
||||
|
||||
silk_VQ_WMat_EC_c(
|
||||
&ind_c,
|
||||
&res_nrg_Q15_c,
|
||||
&rate_dist_Q8_c,
|
||||
&gain_Q7_c,
|
||||
XX_Q17,
|
||||
xX_Q17,
|
||||
cb_Q7,
|
||||
cb_gain_Q7,
|
||||
cl_Q5,
|
||||
subfr_len,
|
||||
max_gain_Q7,
|
||||
L
|
||||
);
|
||||
|
||||
silk_assert( *ind == ind_c );
|
||||
silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
|
||||
silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
|
||||
silk_assert( *gain_Q7 == gain_Q7_c );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -34,66 +34,65 @@
|
|||
|
||||
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
|
||||
|
||||
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
|
||||
# define OVERRIDE_silk_VQ_WMat_EC
|
||||
|
||||
void silk_VQ_WMat_EC_sse4_1(
|
||||
opus_int8 *ind, /* O index of best codebook vector */
|
||||
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
|
||||
opus_int32 *res_nrg_Q15, /* O best residual energy */
|
||||
opus_int32 *rate_dist_Q8, /* O best total bitrate */
|
||||
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
|
||||
const opus_int16 *in_Q14, /* I input vector to be quantized */
|
||||
const opus_int32 *W_Q18, /* I weighting matrix */
|
||||
const opus_int32 *XX_Q17, /* I correlation matrix */
|
||||
const opus_int32 *xX_Q17, /* I correlation vector */
|
||||
const opus_int8 *cb_Q7, /* I codebook */
|
||||
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
|
||||
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
|
||||
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
|
||||
const opus_int subfr_len, /* I number of samples per subframe */
|
||||
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
|
||||
opus_int L /* I number of vectors in codebook */
|
||||
const opus_int L /* I number of vectors in codebook */
|
||||
);
|
||||
|
||||
#if defined OPUS_X86_PRESUME_SSE4_1
|
||||
|
||||
#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
mu_Q9, max_gain_Q7, L, arch) \
|
||||
((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
mu_Q9, max_gain_Q7, L))
|
||||
#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
subfr_len, max_gain_Q7, L, arch) \
|
||||
((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
subfr_len, max_gain_Q7, L))
|
||||
|
||||
#else
|
||||
|
||||
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
|
||||
opus_int8 *ind, /* O index of best codebook vector */
|
||||
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
|
||||
opus_int32 *res_nrg_Q15, /* O best residual energy */
|
||||
opus_int32 *rate_dist_Q8, /* O best total bitrate */
|
||||
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
|
||||
const opus_int16 *in_Q14, /* I input vector to be quantized */
|
||||
const opus_int32 *W_Q18, /* I weighting matrix */
|
||||
const opus_int32 *XX_Q17, /* I correlation matrix */
|
||||
const opus_int32 *xX_Q17, /* I correlation vector */
|
||||
const opus_int8 *cb_Q7, /* I codebook */
|
||||
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
|
||||
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
|
||||
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
|
||||
const opus_int subfr_len, /* I number of samples per subframe */
|
||||
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
|
||||
opus_int L /* I number of vectors in codebook */
|
||||
const opus_int L /* I number of vectors in codebook */
|
||||
);
|
||||
|
||||
# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
mu_Q9, max_gain_Q7, L, arch) \
|
||||
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
mu_Q9, max_gain_Q7, L))
|
||||
# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
subfr_len, max_gain_Q7, L, arch) \
|
||||
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
|
||||
subfr_len, max_gain_Q7, L))
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
|
||||
# define OVERRIDE_silk_NSQ
|
||||
|
||||
void silk_NSQ_sse4_1(
|
||||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -116,11 +115,11 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -143,11 +142,11 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -159,9 +158,9 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
|
||||
#if defined OPUS_X86_PRESUME_SSE4_1
|
||||
|
||||
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
|
||||
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
|
||||
((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
|
||||
((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
|
||||
|
||||
#else
|
||||
|
@ -170,11 +169,11 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
|
|||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -184,13 +183,12 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
|
|||
const opus_int LTP_scale_Q14 /* I LTP state scaling */
|
||||
);
|
||||
|
||||
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
|
||||
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
|
||||
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
|
||||
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
void silk_noise_shape_quantizer(
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
|
|
|
@ -41,16 +41,16 @@
|
|||
|
||||
#include "fixed/main_FIX.h"
|
||||
|
||||
opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
||||
opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
||||
const opus_int16 *inVec1,
|
||||
const opus_int16 *inVec2,
|
||||
const opus_int len
|
||||
) = {
|
||||
silk_inner_prod16_aligned_64_c, /* non-sse */
|
||||
silk_inner_prod16_aligned_64_c,
|
||||
silk_inner_prod16_aligned_64_c,
|
||||
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
|
||||
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */
|
||||
silk_inner_prod16_c, /* non-sse */
|
||||
silk_inner_prod16_c,
|
||||
silk_inner_prod16_c,
|
||||
MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
|
||||
MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -66,16 +66,15 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
|||
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */
|
||||
};
|
||||
|
||||
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
|
||||
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
||||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
|||
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
|
||||
MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */
|
||||
};
|
||||
#endif
|
||||
|
||||
#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
|
||||
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
||||
opus_int8 *ind, /* O index of best codebook vector */
|
||||
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
|
||||
opus_int32 *res_nrg_Q15, /* O best residual energy */
|
||||
opus_int32 *rate_dist_Q8, /* O best total bitrate */
|
||||
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
|
||||
const opus_int16 *in_Q14, /* I input vector to be quantized */
|
||||
const opus_int32 *W_Q18, /* I weighting matrix */
|
||||
const opus_int32 *XX_Q17, /* I correlation matrix */
|
||||
const opus_int32 *xX_Q17, /* I correlation vector */
|
||||
const opus_int8 *cb_Q7, /* I codebook */
|
||||
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
|
||||
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
|
||||
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
|
||||
const opus_int subfr_len, /* I number of samples per subframe */
|
||||
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
|
||||
opus_int L /* I number of vectors in codebook */
|
||||
const opus_int L /* I number of vectors in codebook */
|
||||
) = {
|
||||
silk_VQ_WMat_EC_c, /* non-sse */
|
||||
silk_VQ_WMat_EC_c,
|
||||
|
@ -112,18 +110,16 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
|||
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
|
||||
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */
|
||||
};
|
||||
#endif
|
||||
|
||||
#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
|
||||
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
||||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int32 x_Q3[], /* I Prefiltered input signal */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
|
||||
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
|
||||
|
@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
|||
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
|
||||
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
|
||||
};
|
||||
#endif
|
||||
|
||||
#if defined(FIXED_POINT)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue