Cisco optimization for x86 & fixed point

1. Only for fixed point on x86 platform (32bit and 64bit, uses SIMD
   intrinsics up to SSE4.2)
2. Use "configure --enable-fixed-point --enable-intrinsics" to enable the
   optimization; it is disabled by default.
3. Official test cases are verified and passed.

Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>
This commit is contained in:
xiangmingzhu 2014-04-30 15:48:07 +08:00 committed by Jean-Marc Valin
parent 80460334b7
commit c95c9a048f
74 changed files with 4404 additions and 182 deletions

View file

@ -18,6 +18,9 @@ include opus_sources.mk
if FIXED_POINT
SILK_SOURCES += $(SILK_SOURCES_FIXED)
if HAVE_SSE4_1
SILK_SOURCES += $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1)
endif
else
SILK_SOURCES += $(SILK_SOURCES_FLOAT)
endif
@ -27,6 +30,14 @@ else
OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
endif
if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1)
else
if HAVE_SSE2
CELT_SOURCES += $(CELT_SOURCES_SSE)
endif
endif
if CPU_ARM
CELT_SOURCES += $(CELT_SOURCES_ARM)
SILK_SOURCES += $(SILK_SOURCES_ARM)
@ -229,3 +240,13 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl
# For autoconf-modified sources (e.g., armopts.s)
%-gnu.S: %.s
$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o
if HAVE_SSE4_1
$(SSE_OBJ): CFLAGS += -msse4.1
else
if HAVE_SSE2
$(SSE_OBJ): CFLAGS += -msse2
endif
endif

View file

@ -164,7 +164,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band
for (i=0;i<end;i++)
{
opus_val32 sum;
sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
bandE[i+c*m->nbEBands] = celt_sqrt(sum);
/*printf ("%f ", bandE[i+c*m->nbEBands]);*/
}
@ -266,7 +266,7 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
/* This prevents energy collapse for transients with multiple short MDCTs */
void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed)
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, int arch)
{
int c, i, j, k;
for (i=start;i<end;i++)
@ -355,7 +355,7 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas
}
/* We just added some energy, so we need to renormalise */
if (renormalize)
renormalise_vector(X, N0<<LM, Q15ONE);
renormalise_vector(X, N0<<LM, Q15ONE, arch);
} while (++c<C);
}
}
@ -656,6 +656,7 @@ struct band_ctx {
opus_int32 remaining_bits;
const celt_ener *bandE;
opus_uint32 seed;
int arch;
};
struct split_ctx {
@ -707,7 +708,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
side and mid. With just that parameter, we can re-scale both
mid and side because we know that 1) they have unit norm and
2) they are orthogonal. */
itheta = stereo_itheta(X, Y, stereo, N);
itheta = stereo_itheta(X, Y, stereo, N, ctx->arch);
}
tell = ec_tell_frac(ec);
if (qn!=1)
@ -1055,7 +1056,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
}
cm = fill;
}
renormalise_vector(X, N, gain);
renormalise_vector(X, N, gain, ctx->arch);
}
}
}
@ -1360,9 +1361,11 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
opus_int32 balance, ec_ctx *ec, int LM, int codedBands,
opus_uint32 *seed, int arch)
{
int i;
opus_int32 remaining_bits;
@ -1404,6 +1407,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
ctx.m = m;
ctx.seed = *seed;
ctx.spread = spread;
ctx.arch = arch;
for (i=start;i<end;i++)
{
opus_int32 tell;

View file

@ -98,15 +98,20 @@ void haar1(celt_norm *X, int N0, int stride);
* @param LM log2() of the number of 2.5 subframes in the frame
* @param codedBands Last band to receive bits + 1
* @param seed Random generator seed
* @param arch Run-time architecture (see opus_select_arch())
*/
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
int arch);
void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed);
void anti_collapse(const CELTMode *m, celt_norm *X_,
unsigned char *collapse_masks, int LM, int C, int size, int start,
int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed,
int arch);
opus_uint32 celt_lcg_rand(opus_uint32 seed);

View file

@ -499,7 +499,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
seed = celt_lcg_rand(seed);
X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
}
renormalise_vector(X+boffs, blen, Q15ONE);
renormalise_vector(X+boffs, blen, Q15ONE, st->arch);
}
}
st->rng = seed;
@ -583,7 +583,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
}
/* Compute the excitation for exc_length samples before the loss. */
celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
}
/* Check if the waveform is decaying, and if so how fast.
@ -650,7 +650,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
the signal domain. */
celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
lpc_mem);
lpc_mem, st->arch);
}
/* Check if the synthesis energy is higher than expected, which can
@ -982,7 +982,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, st->arch);
if (anti_collapse_rsv > 0)
{
@ -994,7 +994,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (anti_collapse_on)
anti_collapse(mode, X, collapse_masks, LM, C, N,
start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
if (silence)
{

View file

@ -751,7 +751,7 @@ static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM,
static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
const opus_val16 *bandLogE, int end, int LM, int C, int N0,
AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
int intensity, opus_val16 surround_trim)
int intensity, opus_val16 surround_trim, int arch)
{
int i;
opus_val32 diff=0;
@ -767,7 +767,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
for (i=0;i<8;i++)
{
opus_val32 partial;
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
(m->eBands[i+1]-m->eBands[i])<<LM, arch);
sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
}
sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
@ -776,7 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
for (i=8;i<intensity;i++)
{
opus_val32 partial;
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
(m->eBands[i+1]-m->eBands[i])<<LM, arch);
minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
}
minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
@ -1097,7 +1099,7 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem,
pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
N, &pitch_index, st->prefilter_period, st->prefilter_gain);
N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch);
if (pitch_index > COMBFILTER_MAXPERIOD-2)
pitch_index = COMBFILTER_MAXPERIOD-2;
gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
@ -1887,7 +1889,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
alloc_trim = 5;
else
alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate,
st->intensity, surround_trim, st->arch);
ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
tell = ec_tell_frac(enc);
}
@ -2022,8 +2025,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
/* Residual quantisation */
ALLOC(collapse_masks, C*nbEBands, unsigned char);
quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
bandE, pulses, shortBlocks, st->spread_decision,
dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv,
balance, enc, LM, codedBands, &st->rng, st->arch);
if (anti_collapse_rsv > 0)
{

View file

@ -88,12 +88,15 @@ int p
#endif
}
void celt_fir(const opus_val16 *_x,
void celt_fir_c(
const opus_val16 *_x,
const opus_val16 *num,
opus_val16 *_y,
int N,
int ord,
opus_val16 *mem)
opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
@ -124,7 +127,7 @@ void celt_fir(const opus_val16 *_x,
for (i=0;i<N-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
xcorr_kernel(rnum, x+i, sum, ord);
xcorr_kernel(rnum, x+i, sum, ord, arch);
_y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT)));
_y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
_y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
@ -146,7 +149,8 @@ void celt_iir(const opus_val32 *_x,
opus_val32 *_y,
int N,
int ord,
opus_val16 *mem)
opus_val16 *mem,
int arch)
{
#ifdef SMALL_FOOTPRINT
int i,j;
@ -187,7 +191,7 @@ void celt_iir(const opus_val32 *_x,
sum[1]=_x[i+1];
sum[2]=_x[i+2];
sum[3]=_x[i+3];
xcorr_kernel(rden, y+i, sum, ord);
xcorr_kernel(rden, y+i, sum, ord, arch);
/* Patch up the result to compensate for the fact that this is an IIR */
y[i+ord ] = -ROUND16(sum[0],SIG_SHIFT);

View file

@ -29,24 +29,37 @@
#define PLC_H
#include "arch.h"
#include "cpu_support.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.h"
#endif
#define LPC_ORDER 24
void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
void celt_fir(const opus_val16 *x,
void celt_fir_c(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem);
opus_val16 *mem,
int arch);
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define celt_fir(x, num, y, N, ord, mem, arch) \
(celt_fir_c(x, num, y, N, ord, mem, arch))
#endif
void celt_iir(const opus_val32 *x,
const opus_val16 *den,
opus_val32 *y,
int N,
int ord,
opus_val16 *mem);
opus_val16 *mem,
int arch);
int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
const opus_val16 *window, int overlap, int lag, int n, int arch);

View file

@ -42,6 +42,18 @@
*/
#define OPUS_ARCHMASK 3
#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/x86cpu.h"
/* We currently support 3 x86 variants:
* arch[0] -> non-sse
* arch[1] -> sse2
* arch[2] -> sse4.1
* arch[3] -> NULL
*/
#define OPUS_ARCHMASK 3
int opus_select_arch(void);
#else
#define OPUS_ARCHMASK 0
@ -50,5 +62,4 @@ static OPUS_INLINE int opus_select_arch(void)
return 0;
}
#endif
#endif

View file

@ -98,7 +98,7 @@ static void ec_enc_carry_out(ec_enc *_this,int _c){
else _this->ext++;
}
static void ec_enc_normalize(ec_enc *_this){
static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){
/*If the range is too small, output some bits and rescale it.*/
while(_this->rng<=EC_CODE_BOT){
ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));

View file

@ -73,7 +73,11 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_
}
#define OVERRIDE_renormalise_vector
void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
/* Map the generic four-argument renormalise_vector() call onto the MIPS
   implementation, which takes no arch argument. The arch expression is still
   evaluated (and discarded) so callers' side effects and "unused" diagnostics
   behave the same as with the generic version. */
#define renormalise_vector(X, N, gain, arch) \
((void)(arch), renormalise_vector_mips(X, N, gain))
void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain)
{
int i;
#ifdef FIXED_POINT

View file

@ -250,7 +250,8 @@ opus_val32
#else
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch)
{
int i;
/*The EDSP version requires that max_pitch is at least 1, and that _x is
@ -264,7 +265,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
xcorr_kernel(_x, _y+i, sum, len);
xcorr_kernel(_x, _y+i, sum, len, arch);
xcorr[i]=sum[0];
xcorr[i+1]=sum[1];
xcorr[i+2]=sum[2];
@ -280,7 +281,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
for (;i<max_pitch;i++)
{
opus_val32 sum;
sum = celt_inner_prod(_x, _y+i, len);
sum = celt_inner_prod(_x, _y+i, len, arch);
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
@ -369,7 +370,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
for (j=0;j<len>>1;j++)
sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
#else
sum = celt_inner_prod(x_lp, y+i, len>>1);
sum = celt_inner_prod_c(x_lp, y+i, len>>1);
#endif
xcorr[i] = MAX32(-1, sum);
#ifdef FIXED_POINT
@ -405,7 +406,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0_, int prev_period, opus_val16 prev_gain)
int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
{
int k, i, T, T0;
opus_val16 g, g0;
@ -517,7 +518,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
pg = SHR32(frac_div32(best_xy,best_yy+1),16);
for (k=0;k<3;k++)
xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
offset = 1;
else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))

View file

@ -37,7 +37,8 @@
#include "modes.h"
#include "cpu_support.h"
#if defined(__SSE__) && !defined(FIXED_POINT)
#if defined(__SSE__) && !defined(FIXED_POINT) \
|| defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.h"
#endif
@ -56,12 +57,13 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
int len, int max_pitch, int *pitch, int arch);
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0, int prev_period, opus_val16 prev_gain);
int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
/* OPT: This is the kernel you really want to optimize. It gets used a lot
by the prefilter and by the PLC. */
#ifndef OVERRIDE_XCORR_KERNEL
static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
opus_val16 y_0, y_1, y_2, y_3;
@ -126,8 +128,15 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
sum[3] = MAC16_16(sum[3],tmp,y_1);
}
}
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define xcorr_kernel(x, y, sum, len, arch) \
((void)(arch),xcorr_kernel_c(x, y, sum, len))
#endif
#endif /* OVERRIDE_XCORR_KERNEL */
#ifndef OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
@ -145,9 +154,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
#endif
#ifndef OVERRIDE_CELT_INNER_PROD
static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
int N)
/*We make sure a C version is always available for cases where the overhead of
vectorization and passing around an arch flag isn't worth it.*/
static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
const opus_val16 *y, int N)
{
int i;
opus_val32 xy=0;
@ -155,6 +165,10 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va
xy = MAC16_16(xy, x[i], y[i]);
return xy;
}
#if !defined(OVERRIDE_CELT_INNER_PROD)
# define celt_inner_prod(x, y, N, arch) \
((void)(arch),celt_inner_prod_c(x, y, N))
#endif
#ifdef FIXED_POINT
@ -163,11 +177,11 @@ opus_val32
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
opus_val32 *xcorr, int len, int max_pitch, int arch);
#if !defined(OVERRIDE_PITCH_XCORR)
/*Is run-time CPU detection enabled on this platform?*/
# if defined(OPUS_HAVE_RTCD)
# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
extern
# if defined(FIXED_POINT)
opus_val32
@ -179,10 +193,10 @@ void
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
xcorr, len, max_pitch, arch))
# else
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch))
# endif
#endif

View file

@ -36,6 +36,8 @@
#define CELT_C
#include <stdio.h>
#include <math.h>
#include "mathops.c"
#include "entenc.c"
#include "entdec.c"
@ -45,8 +47,16 @@
#include "laplace.c"
#include "vq.c"
#include "cwrs.c"
#include <stdio.h>
#include <math.h>
#include "pitch.c"
#include "celt_lpc.c"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.c"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
#endif
#ifdef FIXED_POINT
#define WORD "%d"

View file

@ -44,7 +44,18 @@
#include "entdec.c"
#include "mathops.c"
#include "bands.h"
#include "pitch.c"
#include "celt_lpc.c"
#include <math.h>
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.c"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
#endif
#define MAX_SIZE 100
int ret=0;

View file

@ -350,7 +350,7 @@ unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
}
#ifndef OVERRIDE_renormalise_vector
void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
{
int i;
#ifdef FIXED_POINT
@ -360,7 +360,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
opus_val16 g;
opus_val32 t;
celt_norm *xptr;
E = EPSILON + celt_inner_prod(X, X, N);
E = EPSILON + celt_inner_prod(X, X, N, arch);
#ifdef FIXED_POINT
k = celt_ilog2(E)>>1;
#endif
@ -377,7 +377,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
}
#endif /* OVERRIDE_renormalise_vector */
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
{
int i;
int itheta;
@ -396,8 +396,8 @@ int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
Eside = MAC16_16(Eside, s, s);
}
} else {
Emid += celt_inner_prod(X, X, N);
Eside += celt_inner_prod(Y, Y, N);
Emid += celt_inner_prod(X, X, N, arch);
Eside += celt_inner_prod(Y, Y, N, arch);
}
mid = celt_sqrt(Emid);
side = celt_sqrt(Eside);

View file

@ -63,8 +63,8 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
ec_dec *dec, opus_val16 gain);
void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch);
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N);
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch);
#endif /* VQ_H */

128
celt/x86/celt_lpc_sse.c Normal file
View file

@ -0,0 +1,128 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"
#include "x86cpu.h"
/* SSE4.1 variant of celt_fir().
   Builds rnum[] as num[] in reverse order and x[] as the ord history samples
   taken from mem[] (reversed) followed by the N input samples, then for every
   i in [0, N) writes
     _y[i] = SATURATE16(_x[i] + ((sum_j rnum[j]*x[i+j]) shifted down by
                                 SIG_SHIFT with rounding)).
   mem[] is overwritten with the last ord input samples, newest first.
   The whole input is copied into x[] and mem[] is updated before any output
   sample is stored.
   arch is only forwarded to xcorr_kernel(). */
void celt_fir_sse4_1(const opus_val16 *_x,
const opus_val16 *num,
opus_val16 *_y,
int N,
int ord,
opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
VARDECL(opus_val16, x);
__m128i vecNoA;
opus_int32 noA ;
SAVE_STACK;
ALLOC(rnum, ord, opus_val16);
ALLOC(x, N+ord, opus_val16);
/* Reverse the filter taps so the filter becomes a correlation of rnum with x. */
for(i=0;i<ord;i++)
rnum[i] = num[ord-i-1];
/* Prefix x[] with the filter history, oldest sample first. */
for(i=0;i<ord;i++)
x[i] = mem[ord-i-1];
/* Copy the input after the history: 8 samples per pass, then 4, then 1. */
for (i=0;i<N-7;i+=8)
{
x[i+ord ]=_x[i ];
x[i+ord+1]=_x[i+1];
x[i+ord+2]=_x[i+2];
x[i+ord+3]=_x[i+3];
x[i+ord+4]=_x[i+4];
x[i+ord+5]=_x[i+5];
x[i+ord+6]=_x[i+6];
x[i+ord+7]=_x[i+7];
}
for (;i<N-3;i+=4)
{
x[i+ord ]=_x[i ];
x[i+ord+1]=_x[i+1];
x[i+ord+2]=_x[i+2];
x[i+ord+3]=_x[i+3];
}
for (;i<N;i++)
x[i+ord]=_x[i];
/* Save the newest ord input samples as history for the next call
   (mem[0] is the most recent sample). */
for(i=0;i<ord;i++)
mem[i] = _x[N-i-1];
#ifdef SMALL_FOOTPRINT
/* Plain scalar path: one output sample per iteration. */
for (i=0;i<N;i++)
{
opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
for (j=0;j<ord;j++)
{
sum = MAC16_16(sum,rnum[j],x[i+j]);
}
_y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
}
#else
/* Rounding offset: half of (1 << SIG_SHIFT), added before the arithmetic
   right shift below. NOTE(review): presumably mirrors the rounding done by
   PSHR32 in the scalar tail; PSHR32 is defined outside this file. */
noA = EXTEND32(1) << SIG_SHIFT >> 1;
vecNoA = _mm_set_epi32(noA, noA, noA, noA);
/* Vector path: four output samples per iteration. */
for (i=0;i<N-3;i+=4)
{
opus_val32 sums[4] = {0};
__m128i vecSum, vecX;
/* sums[k] accumulates the correlation of rnum with x starting at x[i+k]. */
xcorr_kernel(rnum, x+i, sums, ord, arch);
vecSum = _mm_loadu_si128((__m128i *)sums);
/* Round and shift all four sums, then add the four input samples. */
vecSum = _mm_add_epi32(vecSum, vecNoA);
vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
vecX = OP_CVTEPI16_EPI32_M64(_x + i);
vecSum = _mm_add_epi32(vecSum, vecX);
/* Pack to 16 bits with signed saturation and store the low 64 bits
   (four samples). */
vecSum = _mm_packs_epi32(vecSum, vecSum);
_mm_storel_epi64((__m128i *)(_y + i), vecSum);
}
/* Scalar tail for the last N%4 samples. */
for (;i<N;i++)
{
opus_val32 sum = 0;
for (j=0;j<ord;j++)
sum = MAC16_16(sum, rnum[j], x[i + j]);
_y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
}
#endif
RESTORE_STACK;
}

58
celt/x86/celt_lpc_sse.h Normal file
View file

@ -0,0 +1,58 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CELT_LPC_SSE_H
#define CELT_LPC_SSE_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
/* SSE4.1 implementation of celt_fir(). */
void celt_fir_sse4_1(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem,
int arch);
/* Table of celt_fir implementations with OPUS_ARCHMASK + 1 entries. It is
   only declared here; the definition lives in another translation unit.
   NOTE(review): this header uses opus_val16 and OPUS_ARCHMASK without
   including their definitions, so it relies on the including file having
   pulled them in first -- confirm. */
extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem,
int arch);
/* Dispatch celt_fir() through the table entry selected by the run-time arch
   value (masked with OPUS_ARCHMASK); arch is also passed on to the callee. */
# define celt_fir(x, num, y, N, ord, mem, arch) \
((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
#endif
#endif

251
celt/x86/pitch_sse.c Normal file
View file

@ -0,0 +1,251 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include <smmintrin.h>
#include "x86cpu.h"
/* Inner product of x[0..N-1] and y[0..N-1] for 16-bit input (SSE4.1 build).
   Samples are consumed in blocks of 16, then at most one block of 8, at most
   one block of 4, and finally one at a time. */
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int pos, blockEnd;
    opus_int32 total;
    __m128i xLo, xHi, yLo, yHi;
    __m128i accLo, accHi;
    __m128i xQuad, yQuad;

    total = 0;
    /* Largest multiple of 16 that does not exceed N. */
    blockEnd = N & ~15;

    accLo = _mm_setzero_si128();
    accHi = _mm_setzero_si128();

    /* 16 samples per pass, split over two independent accumulators.
       _mm_madd_epi16 multiplies 16-bit pairs and adds neighbouring products
       into 32-bit lanes. */
    pos = 0;
    while (pos < blockEnd)
    {
        xLo = _mm_loadu_si128((__m128i *)(&x[pos + 0]));
        yLo = _mm_loadu_si128((__m128i *)(&y[pos + 0]));
        xHi = _mm_loadu_si128((__m128i *)(&x[pos + 8]));
        yHi = _mm_loadu_si128((__m128i *)(&y[pos + 8]));

        accLo = _mm_add_epi32(accLo, _mm_madd_epi16(xLo, yLo));
        accHi = _mm_add_epi32(accHi, _mm_madd_epi16(xHi, yHi));

        pos += 16;
    }

    accLo = _mm_add_epi32(accLo, accHi);

    /* One more block of 8 if it fits. */
    if (N - pos >= 8)
    {
        xLo = _mm_loadu_si128((__m128i *)(&x[pos + 0]));
        yLo = _mm_loadu_si128((__m128i *)(&y[pos + 0]));

        accLo = _mm_add_epi32(accLo, _mm_madd_epi16(xLo, yLo));
        pos += 8;
    }

    /* One more block of 4, widened to 32 bits and multiplied lane by lane. */
    if (N - pos >= 4)
    {
        xQuad = OP_CVTEPI16_EPI32_M64(&x[pos + 0]);
        yQuad = OP_CVTEPI16_EPI32_M64(&y[pos + 0]);

        accLo = _mm_add_epi32(accLo, _mm_mullo_epi32(xQuad, yQuad));
        pos += 4;
    }

    /* Fold the four 32-bit lanes into lane 0 and move it to the scalar sum. */
    accLo = _mm_add_epi32(accLo, _mm_unpackhi_epi64(accLo, accLo));
    accLo = _mm_add_epi32(accLo, _mm_shufflelo_epi16(accLo, 0x0E));
    total += _mm_cvtsi128_si32(accLo);

    /* Remaining samples (fewer than 4). */
    while (pos < N)
    {
        total = silk_SMLABB(total, x[pos], y[pos]);
        pos++;
    }

    return total;
}
/* SSE4.1 cross-correlation kernel for 16-bit input.
   Adds to sum[k], for k = 0..3, the sum over j in [0, len) of x[j]*y[j+k].
   The existing contents of sum[] are used as the starting values.
   Reads x[0..len-1] and y[0..len+2]; asserts len >= 3. */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;
    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Eight x samples per pass; sumK collects partial sums for lag K. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Fold each accumulator's four lanes into its lane 0. */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    /* Gather the four lane-0 totals so that lane k of vecSum is lag k. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* Four x samples per pass: broadcast each x[j+i] and multiply it with the
       four y samples starting at y[j+i], so lane k receives x[j+i]*y[j+i+k]. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Last one to three x samples. Only x[j] is needed, so broadcast it
       directly: loading it through OP_CVTEPI16_EPI32_M64 (defined outside this
       file; it appears to load four 16-bit samples) would also touch
       x[j+1..j+3], which lie beyond the len samples of x. */
    for (;j<len;j++)
    {
        vecX0 = _mm_set1_epi32(x[j]);
        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        vecSum = _mm_add_epi32(vecSum, sum0);
    }

    /* Accumulate into the caller's running sums. */
    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);
}
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE2)
/* SSE2 inner product of two vectors of 16-bit samples.
   Returns the 32-bit sum of x[i]*y[i] for i in [0, N).
   Only unaligned loads are used, so x and y need no particular alignment. */
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
      int N)
{
   opus_int pos, blockEnd;
   opus_int32 total;
   __m128i xLo, xHi, yLo, yHi;
   __m128i accLo, accHi, acc;

   accLo = _mm_setzero_si128();
   accHi = _mm_setzero_si128();
   /* Largest multiple of 16 that does not exceed N. */
   blockEnd = N & ~15;

   /* Bulk of the work: 16 samples per pass, split over two accumulators.
      _mm_madd_epi16 multiplies 16-bit pairs and adds adjacent products,
      producing four 32-bit partial sums per register. */
   pos = 0;
   while (pos < blockEnd)
   {
      xLo = _mm_loadu_si128((__m128i *)(&x[pos + 0]));
      yLo = _mm_loadu_si128((__m128i *)(&y[pos + 0]));
      xHi = _mm_loadu_si128((__m128i *)(&x[pos + 8]));
      yHi = _mm_loadu_si128((__m128i *)(&y[pos + 8]));
      accLo = _mm_add_epi32(accLo, _mm_madd_epi16(xLo, yLo));
      accHi = _mm_add_epi32(accHi, _mm_madd_epi16(xHi, yHi));
      pos += 16;
   }
   acc = _mm_add_epi32(accLo, accHi);

   /* At most one further group of 8 samples can remain. */
   if (N - pos >= 8)
   {
      xLo = _mm_loadu_si128((__m128i *)(&x[pos + 0]));
      yLo = _mm_loadu_si128((__m128i *)(&y[pos + 0]));
      acc = _mm_add_epi32(acc, _mm_madd_epi16(xLo, yLo));
      pos += 8;
   }

   /* Horizontal reduction: fold the upper 64 bits onto the lower 64 bits,
      then fold 32-bit lane 1 onto lane 0 (shuffle 0x0E moves 16-bit words
      2 and 3 into positions 0 and 1). */
   acc = _mm_add_epi32(acc, _mm_unpackhi_epi64(acc, acc));
   acc = _mm_add_epi32(acc, _mm_shufflelo_epi16(acc, 0x0E));
   total = _mm_cvtsi128_si32(acc);

   /* Scalar tail for the last (fewer than 8) samples. */
   while (pos < N)
   {
      total = silk_SMLABB(total, x[pos], y[pos]);
      pos++;
   }
   return total;
}
#endif

View file

@ -1,4 +1,5 @@
/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
/**
@file pitch_sse.h
@brief Pitch analysis
@ -32,11 +33,55 @@
#ifndef PITCH_SSE_H
#define PITCH_SSE_H
#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
/* SSE4.1 cross-correlation kernel for 16-bit samples.
   The result is accumulated: the four computed correlation values are added
   to whatever sum[0..3] already holds.
   NOTE(review): lane k appears to accumulate x[j]*y[j+k] over j in [0, len),
   which means y is read a few samples past index len-1 -- confirm against the
   definition and its callers. */
void xcorr_kernel_sse4_1(
const opus_int16 *x,
const opus_int16 *y,
opus_val32 sum[4],
int len );
/* Table of xcorr_kernel implementations with OPUS_ARCHMASK + 1 slots; the
   definition lives in x86_celt_map.c. */
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *x,
const opus_int16 *y,
opus_val32 sum[4],
int len );
/* Run-time dispatch: masks arch with OPUS_ARCHMASK and calls the table entry
   at that index. */
#define xcorr_kernel(x, y, sum, len, arch) \
((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
/* SSE4.1 variant of the inner product; selected through
   CELT_INNER_PROD_IMPL when SSE4.1 support is compiled in.
   NOTE(review): presumably returns the sum of x[i]*y[i] for i in [0, N), like
   the SSE2 variant -- the definition is not visible from this header. */
opus_val32 celt_inner_prod_sse4_1(
const opus_int16 *x,
const opus_int16 *y,
int N);
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE2)
/* SSE2 inner product of two vectors of 16-bit samples: returns the 32-bit sum
   of x[i]*y[i] for i in [0, N). */
opus_val32 celt_inner_prod_sse2(
const opus_int16 *x,
const opus_int16 *y,
int N);
#endif
/* Table of celt_inner_prod implementations with OPUS_ARCHMASK + 1 slots; the
   definition lives in x86_celt_map.c. */
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *x,
const opus_int16 *y,
int N);
/* Signals that this header supplies its own celt_inner_prod. */
#define OVERRIDE_CELT_INNER_PROD
/* Run-time dispatch: masks arch with OPUS_ARCHMASK and calls the table entry
   at that index. */
#define celt_inner_prod(x, y, N, arch) \
((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
#else
#include <xmmintrin.h>
#include "arch.h"
#define OVERRIDE_XCORR_KERNEL
static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
int j;
__m128 xsum1, xsum2;
@ -71,6 +116,9 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, o
_mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
#define xcorr_kernel(_x, _y, _z, len, arch) \
((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))
#define OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
@ -102,7 +150,7 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
#define OVERRIDE_CELT_INNER_PROD
static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
int N)
{
int i;
@ -127,6 +175,9 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va
return xy;
}
# define celt_inner_prod(_x, _y, len, arch) \
((void)(arch),celt_inner_prod_sse(_x, _y, len))
#define OVERRIDE_COMB_FILTER_CONST
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
@ -180,3 +231,4 @@ static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, i
}
#endif
#endif

84
celt/x86/x86_celt_map.c Normal file
View file

@ -0,0 +1,84 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif
#include "x86/x86cpu.h"
#include "celt_lpc.h"
#include "pitch.h"
#include "pitch_sse.h"
#if defined(OPUS_HAVE_RTCD)
# if defined(FIXED_POINT)
/* Dispatch table for celt_fir(), one slot per run-time architecture level.
   opus_select_arch() (x86cpu.c) returns 0 without SSE2, 1 with SSE2 only and
   2 with SSE4.1, so slots 0 and 1 use the C version and slot 2 the SSE4.1
   version. MAY_HAVE_SSE4_1() expands to celt_fir_c when SSE4.1 support is not
   compiled in.
   NOTE(review): the dispatching macro is not in this file; presumably the
   table is indexed with (arch & OPUS_ARCHMASK) -- confirm.
   NOTE(review): the initializer lists four entries (assumes OPUS_ARCHMASK is
   3 -- confirm) and the last one is NULL, so that level must never be
   selected. */
void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem,
int arch
) = {
celt_fir_c, /* non-sse */
celt_fir_c,
MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */
NULL
};
/* Dispatch table behind the xcorr_kernel() macro in pitch_sse.h, which indexes
   it with (arch & OPUS_ARCHMASK).
   opus_select_arch() (x86cpu.c) returns 0 without SSE2, 1 with SSE2 only and
   2 with SSE4.1; there is no SSE2 kernel, so slots 0 and 1 both use the C
   version. MAY_HAVE_SSE4_1() expands to xcorr_kernel_c when SSE4.1 support is
   not compiled in.
   NOTE(review): the initializer lists four entries (assumes OPUS_ARCHMASK is
   3 -- confirm) and the last one is NULL, so that level must never be
   selected. */
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len
) = {
xcorr_kernel_c, /* non-sse */
xcorr_kernel_c,
MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */
NULL
};
/* Dispatch table behind the celt_inner_prod() macro in pitch_sse.h, which
   indexes it with (arch & OPUS_ARCHMASK).
   opus_select_arch() (x86cpu.c) returns 0 without SSE2, 1 with SSE2 only and
   2 with SSE4.1, matching slots 0, 1 and 2 below. The MAY_HAVE_*() macros
   expand to celt_inner_prod_c when the corresponding support is not compiled
   in.
   NOTE(review): the initializer lists four entries (assumes OPUS_ARCHMASK is
   3 -- confirm) and the last one is NULL, so that level must never be
   selected. */
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
int N
) = {
celt_inner_prod_c, /* non-sse */
MAY_HAVE_SSE2(celt_inner_prod),
MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */
NULL
};
# else
# error "Floating-point implementation is not supported by x86 RTCD yet." \
"Reconfigure with --disable-rtcd or send patches."
# endif
#endif

111
celt/x86/x86cpu.c Normal file
View file

@ -0,0 +1,111 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "cpu_support.h"
#include "macros.h"
#include "main.h"
#include "pitch.h"
#include "x86cpu.h"
#if defined(_MSC_VER)
#include <intrin.h>
#define cpuid(info,x) __cpuid(info,x)
#else
#if defined(CPU_INFO_BY_C)
#include <cpuid.h>
#endif
/* CPUID wrapper for non-MSVC compilers.
   Queries leaf InfoType and stores the resulting EAX, EBX, ECX and EDX values
   in CPUInfo[0..3] respectively.
   The method is chosen at configure time:
   - CPU_INFO_BY_ASM: executes the CPUID instruction through inline assembly,
     with ECX preset to 0.
   - CPU_INFO_BY_C: calls __get_cpuid() from <cpuid.h>; its return value is
     ignored.
   If neither macro is defined the function does nothing and CPUInfo is left
   unchanged.
   NOTE(review): the "=b" output constraint may be rejected by some compilers
   when building 32-bit position-independent code, where EBX is reserved --
   confirm for the targeted toolchains. */
static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
{
#if defined(CPU_INFO_BY_ASM)
__asm__ __volatile__ (
"cpuid":
"=a" (CPUInfo[0]),
"=b" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
"a" (InfoType), "c" (0)
);
#elif defined(CPU_INFO_BY_C)
__get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
#endif
}
#endif
#include "SigProc_FIX.h"
#include "celt_lpc.h"
/* CPU capability flags filled in by opus_cpu_feature_check().
   Each field is 1 when the feature bit is set in CPUID leaf 1, 0 otherwise. */
typedef struct CPU_Feature{
/* SIMD: 128-bit */
int HW_SSE2; /* CPUID leaf 1, EDX bit 26 */
int HW_SSE41; /* CPUID leaf 1, ECX bit 19 */
} CPU_Feature;
/* Queries CPUID and records which SIMD extensions the processor reports.
   cpu_feature: output; both flags are always written, so the caller does not
   need to pre-initialize the structure. A flag is 1 when the feature is
   reported and 0 otherwise (including when CPUID leaf 1 is unavailable). */
static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
{
   unsigned int info[4] = {0};
   unsigned int nIds = 0;

   /* Start from "nothing supported" so the result is fully defined even if
      leaf 1 cannot be queried below. */
   cpu_feature->HW_SSE2 = 0;
   cpu_feature->HW_SSE41 = 0;

   /* Leaf 0 returns the highest supported standard leaf in EAX. */
   cpuid(info, 0);
   nIds = info[0];
   if (nIds >= 1){
      /* Leaf 1 feature flags: EDX bit 26 = SSE2, ECX bit 19 = SSE4.1. */
      cpuid(info, 1);
      cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
      cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
   }
}
/* Detects the SIMD level of the running CPU.
   Returns 0 when SSE2 is not available, 1 when SSE2 is available but SSE4.1
   is not, and 2 when both are available. The levels are cumulative: SSE4.1
   is only considered once SSE2 has been found. */
int opus_select_arch(void)
{
   CPU_Feature features = {0};
   int level = 0;

   opus_cpu_feature_check(&features);

   if (features.HW_SSE2)
   {
      level = 1;
      if (features.HW_SSE41)
      {
         level = 2;
      }
   }
   return level;
}

63
celt/x86/x86cpu.h Normal file
View file

@ -0,0 +1,63 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(X86CPU_H)
# define X86CPU_H
/*Name-selection helpers for the dispatch tables: MAY_HAVE_SSE2(name) expands
   to name_sse2 when SSE2 support is compiled in (OPUS_X86_MAY_HAVE_SSE2) and
   to the plain C version name_c otherwise.*/
# if defined(OPUS_X86_MAY_HAVE_SSE2)
# define MAY_HAVE_SSE2(name) name ## _sse2
# else
# define MAY_HAVE_SSE2(name) name ## _c
# endif
/*Same scheme for SSE4.1: name_sse4_1 when OPUS_X86_MAY_HAVE_SSE4_1 is defined,
   name_c otherwise.*/
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
# define MAY_HAVE_SSE4_1(name) name ## _sse4_1
# else
# define MAY_HAVE_SSE4_1(name) name ## _c
# endif
# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);
# endif
/*Sign-extends the four 16-bit values stored at x to four 32-bit lanes.
  _mm_cvtepi16_epi32() only consumes the low 64 bits of its argument, and the
   underlying PMOVSXWD instruction accepts an m64 operand.
  Dereferencing x as a full __m128i instead is, as far as C is concerned, a
   16-byte access that requires 16-byte alignment; whenever the compiler does
   not fold that load into PMOVSXWD (gcc with optimizations disabled emits a
   MOVDQA, for instance) it can fault on unaligned data and read 8 bytes past
   the end of the buffer.
  We therefore always load exactly 8 bytes with _mm_loadl_epi64() (a MOVQ),
   which has no alignment requirement and the same semantics as an m64
   reference in the PMOVSXWD instruction itself, regardless of the
   optimization level.*/
# define OP_CVTEPI16_EPI32_M64(x) \
(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
#endif

View file

@ -24,6 +24,7 @@ celt/modes.h \
celt/os_support.h \
celt/pitch.h \
celt/celt_lpc.h \
celt/x86/celt_lpc_sse.h \
celt/quant_bands.h \
celt/rate.h \
celt/stack_alloc.h \
@ -36,4 +37,5 @@ celt/arm/fixed_armv5e.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
celt/arm/pitch_arm.h \
celt/x86/pitch_sse.h
celt/x86/pitch_sse.h \
celt/x86/x86cpu.h

View file

@ -17,6 +17,12 @@ celt/quant_bands.c \
celt/rate.c \
celt/vq.c
CELT_SOURCES_SSE = celt/x86/x86cpu.c \
celt/x86/x86_celt_map.c \
celt/x86/pitch_sse.c
CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c
CELT_SOURCES_ARM = \
celt/arm/armcpu.c \
celt/arm/arm_celt_map.c

View file

@ -189,6 +189,10 @@ AC_ARG_ENABLE([rtcd],
[AS_HELP_STRING([--disable-rtcd], [Disable run-time CPU capabilities detection])],,
[enable_rtcd=yes])
AC_ARG_ENABLE([intrinsics],
[AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations (only for fixed point x86)])],,
[enable_intrinsics=no])
rtcd_support=no
cpu_arm=no
@ -345,6 +349,110 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
[test x"${asm_optimization%% *}" = x"ARM"])
AM_CONDITIONAL([HAVE_SSE4_1], [false])
AM_CONDITIONAL([HAVE_SSE2], [false])
dnl Detection of x86 SIMD intrinsics support. Only active for fixed-point
dnl builds on i386, i686 or x86_64 hosts with run-time CPU detection enabled.
dnl SSE4.1 is tried first and SSE2 is the fallback.
AS_IF([test x"$enable_intrinsics" = x"yes"],[
AS_IF([test x"$enable_float" = x"no"],
[AS_IF([test x"$host_cpu" = x"i386" -o x"$host_cpu" = x"i686" -o x"$host_cpu" = x"x86_64"],[
AS_IF([test x"$enable_rtcd" = x"yes"],[
get_cpuid_by_asm="no"
AC_MSG_CHECKING([Get CPU Info])
AC_LINK_IFELSE(AC_LANG_PROGRAM([
#include <stdio.h>
],[
unsigned int CPUInfo0;
unsigned int CPUInfo1;
unsigned int CPUInfo2;
unsigned int CPUInfo3;
unsigned int InfoType;
__asm__ __volatile__ (
"cpuid11":
"=a" (CPUInfo0),
"=b" (CPUInfo1),
"=c" (CPUInfo2),
"=d" (CPUInfo3) :
"a" (InfoType), "c" (0)
);
]),
[get_cpuid_by_asm="yes"
AC_MSG_RESULT([Inline Assembly])],
[AC_LINK_IFELSE(AC_LANG_PROGRAM([
#include <cpuid.h>
],[
unsigned int CPUInfo0;
unsigned int CPUInfo1;
unsigned int CPUInfo2;
unsigned int CPUInfo3;
unsigned int InfoType;
__get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
]),
[AC_MSG_RESULT([C method])],
[AC_MSG_ERROR([not support Get CPU Info, please disable intrinsics ])])])
AC_MSG_CHECKING([sse4.1])
TMP_CFLAGS="$CFLAGS"
gcc -Q --help=target | grep "\-msse4.1 "
AS_IF([test x"$?" = x"0"],[
CFLAGS="$CFLAGS -msse4.1"
AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find xmmintrin.h])])
AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find emmintrin.h])])
AC_CHECK_HEADER(smmintrin.h, [], [AC_MSG_ERROR([Couldn't find smmintrin.h])],[
#ifdef HAVE_XMMINSTRIN_H
#include <xmmintrin.h>
#endif
#ifdef HAVE_EMMINSTRIN_H
#include <emmintrin.h>
#endif
])
AC_LINK_IFELSE(AC_LANG_PROGRAM([
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
],[
__m128i mtest = _mm_setzero_si128();
mtest = _mm_cmpeq_epi64(mtest, mtest);
]),
[AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker failure for sse4.1, please disable intrinsics])])
CFLAGS="$TMP_CFLAGS"
AC_DEFINE([OPUS_X86_MAY_HAVE_SSE4_1], [1], [For x86 sse4.1 instrinsics optimizations])
AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics optimizations])
rtcd_support="x86 sse4.1"
AM_CONDITIONAL([HAVE_SSE4_1], [true])
AM_CONDITIONAL([HAVE_SSE2], [true])
AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])],
[AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])])
],[
dnl Announce the check before probing the compiler so that the exit status
dnl tested below is the one from grep and not the one from the message.
AC_MSG_CHECKING([sse2])
gcc -Q --help=target | grep "\-msse2 "
AS_IF([test x"$?" = x"0"],[
AC_MSG_RESULT([yes])
CFLAGS="$CFLAGS -msse2"
AC_CHECK_HEADER(xmmintrin.h, [], [AC_MSG_ERROR([Couldn't find xmmintrin.h])])
AC_CHECK_HEADER(emmintrin.h, [], [AC_MSG_ERROR([Couldn't find emmintrin.h])])
AC_LINK_IFELSE(AC_LANG_PROGRAM([
#include <xmmintrin.h>
#include <emmintrin.h>
],[
__m128i mtest = _mm_setzero_si128();
]),
[AC_MSG_RESULT([yes])], [AC_MSG_ERROR([Compiler & linker failure for sse2, please disable intrinsics])])
CFLAGS="$TMP_CFLAGS"
AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], [1], [For x86 sse2 instrinsics optimize])
rtcd_support="x86 sse2"
AM_CONDITIONAL([HAVE_SSE2], [true])
AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])],
[AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])])
],[enable_intrinsics="no"])
])
], [enable_intrinsics="no"])
])
], [enable_intrinsics="no"])
])
AS_IF([test x"$enable_rtcd" = x"yes"],[
AS_IF([test x"$rtcd_support" != x"no"],[
AC_DEFINE([OPUS_HAVE_RTCD], [1],
@ -451,6 +559,7 @@ AC_MSG_NOTICE([
Fixed point debugging: ......... ${enable_fixed_point_debug}
Inline Assembly Optimizations: . ${inline_optimization}
External Assembly Optimizations: ${asm_optimization}
Intrinsics Optimizations.......: ${enable_intrinsics}
Run-time CPU detection: ........ ${rtcd_support}
Custom modes: .................. ${enable_custom_modes}
Assertion checking: ............ ${enable_assertions}

View file

@ -71,9 +71,24 @@ static OPUS_INLINE opus_int32 silk_A2NLSF_eval_poly( /* return the polynomial ev
y32 = p[ dd ]; /* Q16 */
x_Q16 = silk_LSHIFT( x, 4 );
if ( opus_likely( 8 == dd ) )
{
y32 = silk_SMLAWW( p[ 7 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 6 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 5 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 4 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 3 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 2 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 1 ], y32, x_Q16 );
y32 = silk_SMLAWW( p[ 0 ], y32, x_Q16 );
}
else
{
for( n = dd - 1; n >= 0; n-- ) {
y32 = silk_SMLAWW( p[ n ], y32, x_Q16 ); /* Q16 */
}
}
return y32;
}

View file

@ -111,7 +111,8 @@ opus_int silk_Decode( /* O Returns error co
opus_int newPacketFlag, /* I Indicates first decoder call for this packet */
ec_dec *psRangeDec, /* I/O Compressor data structure */
opus_int16 *samplesOut, /* O Decoded output speech vector */
opus_int32 *nSamplesOut /* O Number of samples decoded */
opus_int32 *nSamplesOut, /* O Number of samples decoded */
int arch /* I Run-time architecture */
);
#if 0

View file

@ -44,7 +44,8 @@ void silk_LPC_analysis_filter(
const opus_int16 *in, /* I Input signal */
const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */
const opus_int32 len, /* I Signal length */
const opus_int32 d /* I Filter order */
const opus_int32 d, /* I Filter order */
int arch /* I Run-time architecture */
)
{
opus_int j;
@ -69,11 +70,12 @@ void silk_LPC_analysis_filter(
for (j=0;j<d;j++) {
mem[ j ] = in[ d - j - 1 ];
}
celt_fir( in + d, num, out + d, len - d, d, mem );
celt_fir( in + d, num, out + d, len - d, d, mem, arch );
for ( j = 0; j < d; j++ ) {
out[ j ] = 0;
}
#else
(void)arch;
for( ix = d; ix < len; ix++ ) {
in_ptr = &in[ ix - 1 ];

View file

@ -56,6 +56,28 @@ opus_int32 silk_NLSF_del_dec_quant( /* O Returns
opus_int32 RD_max_Q25[ NLSF_QUANT_DEL_DEC_STATES ];
const opus_uint8 *rates_Q5;
opus_int out0_Q10_table[2 * NLSF_QUANT_MAX_AMPLITUDE_EXT];
opus_int out1_Q10_table[2 * NLSF_QUANT_MAX_AMPLITUDE_EXT];
for (i = -NLSF_QUANT_MAX_AMPLITUDE_EXT; i <= NLSF_QUANT_MAX_AMPLITUDE_EXT-1; i++)
{
out0_Q10 = silk_LSHIFT( i, 10 );
out1_Q10 = silk_ADD16( out0_Q10, 1024 );
if( i > 0 ) {
out0_Q10 = silk_SUB16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
} else if( i == 0 ) {
out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
} else if( i == -1 ) {
out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
} else {
out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
out1_Q10 = silk_ADD16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
}
out0_Q10_table[ i + NLSF_QUANT_MAX_AMPLITUDE_EXT ] = silk_SMULWB( (opus_int32)out0_Q10, quant_step_size_Q16 );
out1_Q10_table[ i + NLSF_QUANT_MAX_AMPLITUDE_EXT ] = silk_SMULWB( (opus_int32)out1_Q10, quant_step_size_Q16 );
}
silk_assert( (NLSF_QUANT_DEL_DEC_STATES & (NLSF_QUANT_DEL_DEC_STATES-1)) == 0 ); /* must be power of two */
nStates = 1;
@ -73,21 +95,9 @@ opus_int32 silk_NLSF_del_dec_quant( /* O Returns
ind[ j ][ i ] = (opus_int8)ind_tmp;
/* compute outputs for ind_tmp and ind_tmp + 1 */
out0_Q10 = silk_LSHIFT( ind_tmp, 10 );
out1_Q10 = silk_ADD16( out0_Q10, 1024 );
if( ind_tmp > 0 ) {
out0_Q10 = silk_SUB16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
} else if( ind_tmp == 0 ) {
out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
} else if( ind_tmp == -1 ) {
out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
} else {
out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
out1_Q10 = silk_ADD16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) );
}
out0_Q10 = silk_SMULWB( (opus_int32)out0_Q10, quant_step_size_Q16 );
out1_Q10 = silk_SMULWB( (opus_int32)out1_Q10, quant_step_size_Q16 );
out0_Q10 = out0_Q10_table[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE_EXT ];
out1_Q10 = out1_Q10_table[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE_EXT ];
out0_Q10 = silk_ADD16( out0_Q10, pred_Q10 );
out1_Q10 = silk_ADD16( out1_Q10, pred_Q10 );
prev_out_Q10[ j ] = out0_Q10;

View file

@ -46,6 +46,7 @@ static OPUS_INLINE void silk_nsq_scale_states(
const opus_int signal_type /* I Signal type */
);
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
@ -67,8 +68,10 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
opus_int shapingLPCOrder, /* I Noise shaping AR filter order */
opus_int predictLPCOrder /* I Prediction filter order */
);
#endif
void silk_NSQ(
void silk_NSQ_c
(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
@ -141,7 +144,7 @@ void silk_NSQ(
silk_assert( start_idx > 0 );
silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder );
A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
NSQ->rewhite_flag = 1;
NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
@ -172,7 +175,11 @@ void silk_NSQ(
/***********************************/
/* silk_noise_shape_quantizer */
/***********************************/
static OPUS_INLINE void silk_noise_shape_quantizer(
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE
#endif
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
const opus_int32 x_sc_Q10[], /* I */

View file

@ -109,7 +109,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
opus_int decisionDelay /* I */
);
void silk_NSQ_del_dec(
void silk_NSQ_del_dec_c(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
@ -247,7 +247,7 @@ void silk_NSQ_del_dec(
silk_assert( start_idx > 0 );
silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder );
A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
NSQ->rewhite_flag = 1;

View file

@ -46,7 +46,8 @@ static OPUS_INLINE void silk_PLC_update(
static OPUS_INLINE void silk_PLC_conceal(
silk_decoder_state *psDec, /* I/O Decoder state */
silk_decoder_control *psDecCtrl, /* I/O Decoder control */
opus_int16 frame[] /* O LPC residual signal */
opus_int16 frame[], /* O LPC residual signal */
int arch /* I Run-time architecture */
);
@ -65,7 +66,8 @@ void silk_PLC(
silk_decoder_state *psDec, /* I/O Decoder state */
silk_decoder_control *psDecCtrl, /* I/O Decoder control */
opus_int16 frame[], /* I/O signal */
opus_int lost /* I Loss flag */
opus_int lost, /* I Loss flag */
int arch /* I Run-time architecture */
)
{
/* PLC control function */
@ -78,7 +80,7 @@ void silk_PLC(
/****************************/
/* Generate Signal */
/****************************/
silk_PLC_conceal( psDec, psDecCtrl, frame );
silk_PLC_conceal( psDec, psDecCtrl, frame, arch );
psDec->lossCnt++;
} else {
@ -192,7 +194,8 @@ static OPUS_INLINE void silk_PLC_energy(opus_int32 *energy1, opus_int *shift1, o
static OPUS_INLINE void silk_PLC_conceal(
silk_decoder_state *psDec, /* I/O Decoder state */
silk_decoder_control *psDecCtrl, /* I/O Decoder control */
opus_int16 frame[] /* O LPC residual signal */
opus_int16 frame[], /* O LPC residual signal */
int arch /* I Run-time architecture */
)
{
opus_int i, j, k;
@ -289,7 +292,7 @@ static OPUS_INLINE void silk_PLC_conceal(
/* Rewhiten LTP state */
idx = psDec->ltp_mem_length - lag - psDec->LPC_order - LTP_ORDER / 2;
silk_assert( idx > 0 );
silk_LPC_analysis_filter( &sLTP[ idx ], &psDec->outBuf[ idx ], A_Q12, psDec->ltp_mem_length - idx, psDec->LPC_order );
silk_LPC_analysis_filter( &sLTP[ idx ], &psDec->outBuf[ idx ], A_Q12, psDec->ltp_mem_length - idx, psDec->LPC_order, arch );
/* Scale LTP state */
inv_gain_Q30 = silk_INVERSE32_varQ( psPLC->prevGain_Q16[ 1 ], 46 );
inv_gain_Q30 = silk_min( inv_gain_Q30, silk_int32_MAX >> 1 );

View file

@ -48,7 +48,8 @@ void silk_PLC(
silk_decoder_state *psDec, /* I/O Decoder state */
silk_decoder_control *psDecCtrl, /* I/O Decoder control */
opus_int16 frame[], /* I/O signal */
opus_int lost /* I Loss flag */
opus_int lost, /* I Loss flag */
int arch /* I Run-time architecture */
);
void silk_PLC_glue_frames(

View file

@ -41,7 +41,11 @@ extern "C"
#include "typedef.h"
#include "resampler_structs.h"
#include "macros.h"
#include "cpu_support.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/SigProc_FIX_sse.h"
#endif
/********************************************************************/
/* SIGNAL PROCESSING FUNCTIONS */
@ -108,7 +112,8 @@ void silk_LPC_analysis_filter(
const opus_int16 *in, /* I Input signal */
const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */
const opus_int32 len, /* I Signal length */
const opus_int32 d /* I Filter order */
const opus_int32 d, /* I Filter order */
int arch /* I Run-time architecture */
);
/* Chirp (bandwidth expand) LP AR filter */
@ -303,7 +308,7 @@ void silk_NLSF_VQ_weights_laroia(
);
/* Compute reflection coefficients from input signal */
void silk_burg_modified(
void silk_burg_modified_c(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
@ -335,12 +340,15 @@ void silk_scale_vector32_Q26_lshift_18(
/********************************************************************/
/* return sum( inVec1[i] * inVec2[i] ) */
opus_int32 silk_inner_prod_aligned(
const opus_int16 *const inVec1, /* I input vector 1 */
const opus_int16 *const inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
const opus_int len, /* I vector lengths */
int arch /* I Run-time architecture */
);
opus_int32 silk_inner_prod_aligned_scale(
const opus_int16 *const inVec1, /* I input vector 1 */
const opus_int16 *const inVec2, /* I input vector 2 */
@ -348,7 +356,7 @@ opus_int32 silk_inner_prod_aligned_scale(
const opus_int len /* I vector lengths */
);
opus_int64 silk_inner_prod16_aligned_64(
opus_int64 silk_inner_prod16_aligned_64_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
@ -575,6 +583,14 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
/* the following seems faster on x86 */
#define silk_SMMUL(a32, b32) (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
/* Without SSE4.1 support compiled in there is nothing to dispatch on, so the
   public names map straight to the C implementations. (void)(arch) evaluates
   and discards the argument; silk_burg_modified_c() additionally receives
   arch as its last parameter, silk_inner_prod16_aligned_64_c() does not. */
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
#endif
#include "Inlines.h"
#include "MacroCount.h"
#include "MacroDebug.h"

View file

@ -33,10 +33,12 @@ POSSIBILITY OF SUCH DAMAGE.
#include "stack_alloc.h"
/* Silk VAD noise level estimation */
# if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE void silk_VAD_GetNoiseLevels(
const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
);
#endif
/**********************************/
/* Initialization of the Silk VAD */
@ -77,7 +79,7 @@ static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -120
/***************************************/
/* Get the speech activity level in Q8 */
/***************************************/
opus_int silk_VAD_GetSA_Q8( /* O Return value, 0 if success */
opus_int silk_VAD_GetSA_Q8_c( /* O Return value, 0 if success */
silk_encoder_state *psEncC, /* I/O Encoder state */
const opus_int16 pIn[] /* I PCM input */
)
@ -296,7 +298,10 @@ opus_int silk_VAD_GetSA_Q8( /* O Return v
/**************************/
/* Noise level estimation */
/**************************/
static OPUS_INLINE void silk_VAD_GetNoiseLevels(
# if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE
#endif
void silk_VAD_GetNoiseLevels(
const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
)

View file

@ -32,7 +32,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include "main.h"
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
void silk_VQ_WMat_EC(
void silk_VQ_WMat_EC_c(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */

View file

@ -85,7 +85,8 @@ opus_int silk_Decode( /* O Returns error co
opus_int newPacketFlag, /* I Indicates first decoder call for this packet */
ec_dec *psRangeDec, /* I/O Compressor data structure */
opus_int16 *samplesOut, /* O Decoded output speech vector */
opus_int32 *nSamplesOut /* O Number of samples decoded */
opus_int32 *nSamplesOut, /* O Number of samples decoded */
int arch /* I Run-time architecture */
)
{
opus_int i, n, decode_only_middle = 0, ret = SILK_NO_ERROR;
@ -296,7 +297,7 @@ opus_int silk_Decode( /* O Returns error co
} else {
condCoding = CODE_CONDITIONALLY;
}
ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding);
ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch);
} else {
silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
}

View file

@ -39,7 +39,8 @@ void silk_decode_core(
silk_decoder_state *psDec, /* I/O Decoder state */
silk_decoder_control *psDecCtrl, /* I Decoder control */
opus_int16 xq[], /* O Decoded speech */
const opus_int16 pulses[ MAX_FRAME_LENGTH ] /* I Pulse signal */
const opus_int16 pulses[ MAX_FRAME_LENGTH ], /* I Pulse signal */
int arch /* I Run-time architecture */
)
{
opus_int i, k, lag = 0, start_idx, sLTP_buf_idx, NLSF_interpolation_flag, signalType;
@ -147,7 +148,7 @@ void silk_decode_core(
}
silk_LPC_analysis_filter( &sLTP[ start_idx ], &psDec->outBuf[ start_idx + k * psDec->subfr_length ],
A_Q12, psDec->ltp_mem_length - start_idx, psDec->LPC_order );
A_Q12, psDec->ltp_mem_length - start_idx, psDec->LPC_order, arch );
/* After rewhitening the LTP state is unscaled */
if( k == 0 ) {

View file

@ -42,7 +42,8 @@ opus_int silk_decode_frame(
opus_int16 pOut[], /* O Pointer to output speech frame */
opus_int32 *pN, /* O Pointer to size of output frame */
opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */
opus_int condCoding /* I The type of conditional coding to use */
opus_int condCoding, /* I The type of conditional coding to use */
int arch /* I Run-time architecture */
)
{
VARDECL( silk_decoder_control, psDecCtrl );
@ -81,12 +82,12 @@ opus_int silk_decode_frame(
/********************************************************/
/* Run inverse NSQ */
/********************************************************/
silk_decode_core( psDec, psDecCtrl, pOut, pulses );
silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch );
/********************************************************/
/* Update PLC state */
/********************************************************/
silk_PLC( psDec, psDecCtrl, pOut, 0 );
silk_PLC( psDec, psDecCtrl, pOut, 0, arch );
psDec->lossCnt = 0;
psDec->prevSignalType = psDec->indices.signalType;
@ -96,7 +97,7 @@ opus_int silk_decode_frame(
psDec->first_frame_after_reset = 0;
} else {
/* Handle packet loss by extrapolation */
silk_PLC( psDec, psDecCtrl, pOut, 1 );
silk_PLC( psDec, psDecCtrl, pOut, 1, arch );
}
/*************************/

View file

@ -45,7 +45,7 @@ void silk_LTP_analysis_filter_FIX(
const opus_int16 *x_ptr, *x_lag_ptr;
opus_int16 Btmp_Q14[ LTP_ORDER ];
opus_int16 *LTP_res_ptr;
opus_int k, i, j;
opus_int k, i;
opus_int32 LTP_est;
x_ptr = x;
@ -53,9 +53,12 @@ void silk_LTP_analysis_filter_FIX(
for( k = 0; k < nb_subfr; k++ ) {
x_lag_ptr = x_ptr - pitchL[ k ];
for( i = 0; i < LTP_ORDER; i++ ) {
Btmp_Q14[ i ] = LTPCoef_Q14[ k * LTP_ORDER + i ];
}
Btmp_Q14[ 0 ] = LTPCoef_Q14[ k * LTP_ORDER ];
Btmp_Q14[ 1 ] = LTPCoef_Q14[ k * LTP_ORDER + 1 ];
Btmp_Q14[ 2 ] = LTPCoef_Q14[ k * LTP_ORDER + 2 ];
Btmp_Q14[ 3 ] = LTPCoef_Q14[ k * LTP_ORDER + 3 ];
Btmp_Q14[ 4 ] = LTPCoef_Q14[ k * LTP_ORDER + 4 ];
/* LTP analysis FIR filter */
for( i = 0; i < subfr_length + pre_length; i++ ) {
@ -63,9 +66,11 @@ void silk_LTP_analysis_filter_FIX(
/* Long-term prediction */
LTP_est = silk_SMULBB( x_lag_ptr[ LTP_ORDER / 2 ], Btmp_Q14[ 0 ] );
for( j = 1; j < LTP_ORDER; j++ ) {
LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ LTP_ORDER / 2 - j ], Btmp_Q14[ j ] );
}
LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 1 ], Btmp_Q14[ 1 ] );
LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 0 ], Btmp_Q14[ 2 ] );
LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -1 ], Btmp_Q14[ 3 ] );
LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -2 ], Btmp_Q14[ 4 ] );
LTP_est = silk_RSHIFT_ROUND( LTP_est, 14 ); /* round and -> Q0*/
/* Subtract long-term prediction */

View file

@ -42,7 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
#define MAX_RSHIFTS (32 - QA)
/* Compute reflection coefficients from input signal */
void silk_burg_modified(
void silk_burg_modified_c(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
@ -68,7 +68,7 @@ void silk_burg_modified(
silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr );
C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@ -87,7 +87,7 @@ void silk_burg_modified(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n ), rshifts );
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@ -248,12 +248,12 @@ void silk_burg_modified(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D ), rshifts );
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D ), -rshifts );
C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch), -rshifts);
}
}
/* Approximate residual energy */

View file

@ -42,7 +42,8 @@ void silk_corrVector_FIX(
const opus_int L, /* I Length of vectors */
const opus_int order, /* I Max lag for correlation */
opus_int32 *Xt, /* O Pointer to X'*t correlation vector [order] */
const opus_int rshifts /* I Right shifts of correlations */
const opus_int rshifts, /* I Right shifts of correlations */
int arch /* I Run-time architecture */
)
{
opus_int lag, i;
@ -65,7 +66,7 @@ void silk_corrVector_FIX(
} else {
silk_assert( rshifts == 0 );
for( lag = 0; lag < order; lag++ ) {
Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L ); /* X[:,lag]'*t */
Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L, arch ); /* X[:,lag]'*t */
ptr1--; /* Go to next column of X */
}
}
@ -78,7 +79,8 @@ void silk_corrMatrix_FIX(
const opus_int order, /* I Max lag for correlation */
const opus_int head_room, /* I Desired headroom */
opus_int32 *XX, /* O Pointer to X'*X correlation matrix [ order x order ] */
opus_int *rshifts /* I/O Right shifts of correlations */
opus_int *rshifts, /* I/O Right shifts of correlations */
int arch /* I Run-time architecture */
)
{
opus_int i, j, lag, rshifts_local, head_room_rshifts;
@ -138,7 +140,7 @@ void silk_corrMatrix_FIX(
} else {
for( lag = 1; lag < order; lag++ ) {
/* Inner product of column 0 and column lag: X[:,0]'*X[:,lag] */
energy = silk_inner_prod_aligned( ptr1, ptr2, L );
energy = silk_inner_prod_aligned( ptr1, ptr2, L, arch );
matrix_ptr( XX, lag, 0, order ) = energy;
matrix_ptr( XX, 0, lag, order ) = energy;
/* Calculate remaining off diagonal: X[:,j]'*X[:,j + lag] */

View file

@ -48,7 +48,7 @@ void silk_encode_do_VAD_FIX(
/****************************/
/* Voice Activity Detection */
/****************************/
silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1 );
silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
/**************************************************/
/* Convert speech activity into VAD and DTX flags */
@ -196,11 +196,13 @@ opus_int silk_encode_frame_FIX(
if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) {
silk_NSQ_del_dec( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses,
sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14,
sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14 );
sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14,
psEnc->sCmn.arch );
} else {
silk_NSQ( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses,
sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14,
sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14 );
sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14,
psEnc->sCmn.arch);
}
/****************************************/
@ -371,12 +373,12 @@ static OPUS_INLINE void silk_LBRR_encode_FIX(
silk_NSQ_del_dec( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3,
psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14,
psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14,
psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14 );
psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch );
} else {
silk_NSQ( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3,
psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14,
psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14,
psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14 );
psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch );
}
/* Restore original gains */

View file

@ -95,7 +95,7 @@ void silk_find_LPC_FIX(
silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
/* Calculate residual energy with NLSF interpolation */
silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder );
silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );
silk_sum_sqr_shift( &res_nrg0, &rshift0, LPC_res + psEncC->predictLPCOrder, subfr_length - psEncC->predictLPCOrder );
silk_sum_sqr_shift( &res_nrg1, &rshift1, LPC_res + psEncC->predictLPCOrder + subfr_length, subfr_length - psEncC->predictLPCOrder );

View file

@ -50,7 +50,8 @@ void silk_find_LTP_FIX(
const opus_int subfr_length, /* I subframe length */
const opus_int nb_subfr, /* I number of subframes */
const opus_int mem_offset, /* I number of samples in LTP memory */
opus_int corr_rshifts[ MAX_NB_SUBFR ] /* O right shifts applied to correlations */
opus_int corr_rshifts[ MAX_NB_SUBFR ], /* O right shifts applied to correlations */
int arch /* I Run-time architecture */
)
{
opus_int i, k, lshift;
@ -84,10 +85,10 @@ void silk_find_LTP_FIX(
rr_shifts += ( LTP_CORRS_HEAD_ROOM - LZs );
}
corr_rshifts[ k ] = rr_shifts;
silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ] ); /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */
silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ], arch ); /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */
/* The correlation vector always has lower max abs value than rr and/or RR so head room is assured */
silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ] ); /* Rr_fix_ptr in Q( -corr_rshifts[ k ] ) */
silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ], arch ); /* Rr_fix_ptr in Q( -corr_rshifts[ k ] ) */
if( corr_rshifts[ k ] > rr_shifts ) {
rr[ k ] = silk_RSHIFT( rr[ k ], corr_rshifts[ k ] - rr_shifts ); /* rr[ k ] in Q( -corr_rshifts[ k ] ) */
}

View file

@ -112,7 +112,7 @@ void silk_find_pitch_lags_FIX(
/*****************************************/
/* LPC analysis filtering */
/*****************************************/
silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder );
silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder, psEnc->sCmn.arch );
if( psEnc->sCmn.indices.signalType != TYPE_NO_VOICE_ACTIVITY && psEnc->sCmn.first_frame_after_reset == 0 ) {
/* Threshold for pitch estimator */

View file

@ -89,11 +89,12 @@ void silk_find_pred_coefs_FIX(
/* LTP analysis */
silk_find_LTP_FIX( psEncCtrl->LTPCoef_Q14, WLTP, &psEncCtrl->LTPredCodGain_Q7,
res_pitch, psEncCtrl->pitchL, Wght_Q15, psEnc->sCmn.subfr_length,
psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift );
psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift, psEnc->sCmn.arch );
/* Quantize LTP gain parameters */
silk_quant_LTP_gains( psEncCtrl->LTPCoef_Q14, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
&psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr);
&psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr,
psEnc->sCmn.arch);
/* Control LTP scaling */
silk_LTP_scale_ctrl_FIX( psEnc, psEncCtrl, condCoding );
@ -139,7 +140,7 @@ void silk_find_pred_coefs_FIX(
/* Calculate residual energy using quantized LPC coefficients */
silk_residual_energy_FIX( psEncCtrl->ResNrg, psEncCtrl->ResNrgQ, LPC_in_pre, psEncCtrl->PredCoef_Q12, local_gains,
psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder );
psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder, psEnc->sCmn.arch );
/* Copy to prediction struct for use in next frame for interpolation */
silk_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) );

View file

@ -166,7 +166,8 @@ void silk_find_LTP_FIX(
const opus_int subfr_length, /* I subframe length */
const opus_int nb_subfr, /* I number of subframes */
const opus_int mem_offset, /* I number of samples in LTP memory */
opus_int corr_rshifts[ MAX_NB_SUBFR ] /* O right shifts applied to correlations */
opus_int corr_rshifts[ MAX_NB_SUBFR ], /* O right shifts applied to correlations */
int arch /* I Run-time architecture */
);
void silk_LTP_analysis_filter_FIX(
@ -190,7 +191,8 @@ void silk_residual_energy_FIX(
const opus_int32 gains[ MAX_NB_SUBFR ], /* I Quantization gains */
const opus_int subfr_length, /* I Subframe length */
const opus_int nb_subfr, /* I Number of subframes */
const opus_int LPC_order /* I LPC order */
const opus_int LPC_order, /* I LPC order */
int arch /* I Run-time architecture */
);
/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */
@ -220,7 +222,8 @@ void silk_corrMatrix_FIX(
const opus_int order, /* I Max lag for correlation */
const opus_int head_room, /* I Desired headroom */
opus_int32 *XX, /* O Pointer to X'*X correlation matrix [ order x order ] */
opus_int *rshifts /* I/O Right shifts of correlations */
opus_int *rshifts, /* I/O Right shifts of correlations */
int arch /* I Run-time architecture */
);
/* Calculates correlation vector X'*t */
@ -230,7 +233,8 @@ void silk_corrVector_FIX(
const opus_int L, /* I Length of vectors */
const opus_int order, /* I Max lag for correlation */
opus_int32 *Xt, /* O Pointer to X'*t correlation vector [order] */
const opus_int rshifts /* I Right shifts of correlations */
const opus_int rshifts, /* I Right shifts of correlations */
int arch /* I Run-time architecture */
);
/* Add noise to matrix diagonal */

View file

@ -72,7 +72,8 @@ static void silk_P_Ana_calc_energy_st3(
opus_int start_lag, /* I lag offset to search around */
opus_int sf_length, /* I length of one 5 ms subframe */
opus_int nb_subfr, /* I number of subframes */
opus_int complexity /* I Complexity setting */
opus_int complexity, /* I Complexity setting */
int arch /* I Run-time architecture */
);
/*************************************************************/
@ -195,8 +196,8 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
/* Calculate first vector products before loop */
cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];
normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ );
normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ ) );
normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch );
normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch ) );
normalizer = silk_ADD32( normalizer, silk_SMULBB( SF_LENGTH_8KHZ, 4000 ) );
matrix_ptr( C, k, 0, CSTRIDE_4KHZ ) =
@ -334,7 +335,7 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
silk_assert( target_ptr >= frame_8kHz );
silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ ), 1 );
energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch ), 1 );
for( j = 0; j < length_d_comp; j++ ) {
d = d_comp[ j ];
basis_ptr = target_ptr - d;
@ -343,9 +344,9 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
silk_assert( basis_ptr >= frame_8kHz );
silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz );
cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ );
cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ, arch );
if( cross_corr > 0 ) {
energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ );
energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch );
matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) =
(opus_int16)silk_DIV32_varQ( cross_corr,
silk_ADD32( energy_target,
@ -519,14 +520,14 @@ opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0
ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals );
silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity );
silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch );
lag_counter = 0;
silk_assert( lag == silk_SAT16( lag ) );
contour_bias_Q15 = silk_DIV32_16( SILK_FIX_CONST( PE_FLATCONTOUR_BIAS, 15 ), lag );
target_ptr = &input_frame_ptr[ PE_LTP_MEM_LENGTH_MS * Fs_kHz ];
energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length ), 1 );
energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length, arch ), 1 );
for( d = start_lag; d <= end_lag; d++ ) {
for( j = 0; j < nb_cbk_search; j++ ) {
cross_corr = 0;
@ -671,7 +672,8 @@ static void silk_P_Ana_calc_energy_st3(
opus_int start_lag, /* I lag offset to search around */
opus_int sf_length, /* I length of one 5 ms subframe */
opus_int nb_subfr, /* I number of subframes */
opus_int complexity /* I Complexity setting */
opus_int complexity, /* I Complexity setting */
int arch /* I Run-time architecture */
)
{
const opus_int16 *target_ptr, *basis_ptr;
@ -705,7 +707,7 @@ static void silk_P_Ana_calc_energy_st3(
/* Calculate the energy for first lag */
basis_ptr = target_ptr - ( start_lag + matrix_ptr( Lag_range_ptr, k, 0, 2 ) );
energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length );
energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length, arch );
silk_assert( energy >= 0 );
scratch_mem[ lag_counter ] = energy;
lag_counter++;

View file

@ -50,8 +50,7 @@ static OPUS_INLINE void silk_prefilt_FIX(
opus_int length /* I Length of signals */
);
#ifndef OVERRIDE_silk_warped_LPC_analysis_filter_FIX
void silk_warped_LPC_analysis_filter_FIX(
void silk_warped_LPC_analysis_filter_FIX_c(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
@ -92,7 +91,6 @@ void silk_warped_LPC_analysis_filter_FIX(
res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
}
}
#endif /* OVERRIDE_silk_warped_LPC_analysis_filter_FIX */
void silk_prefilter_FIX(
silk_encoder_state_FIX *psEnc, /* I/O Encoder state */
@ -137,7 +135,7 @@ void silk_prefilter_FIX(
/* Short term FIR filtering*/
silk_warped_LPC_analysis_filter_FIX( P->sAR_shp, st_res_Q2, AR1_shp_Q13, px,
psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder );
psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder, psEnc->sCmn.arch );
/* Reduce (mainly) low frequencies during harmonic emphasis */
B_Q10[ 0 ] = silk_RSHIFT_ROUND( psEncCtrl->GainsPre_Q14[ k ], 4 );

View file

@ -42,7 +42,8 @@ void silk_residual_energy_FIX(
const opus_int32 gains[ MAX_NB_SUBFR ], /* I Quantization gains */
const opus_int subfr_length, /* I Subframe length */
const opus_int nb_subfr, /* I Number of subframes */
const opus_int LPC_order /* I LPC order */
const opus_int LPC_order, /* I LPC order */
int arch /* I Run-time architecture */
)
{
opus_int offset, i, j, rshift, lz1, lz2;
@ -60,7 +61,7 @@ void silk_residual_energy_FIX(
silk_assert( ( nb_subfr >> 1 ) * ( MAX_NB_SUBFR >> 1 ) == nb_subfr );
for( i = 0; i < nb_subfr >> 1; i++ ) {
/* Calculate half frame LPC residual signal including preceding samples */
silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order );
silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order, arch );
/* Point to first subframe of the just calculated LPC residual signal */
LPC_res_ptr = LPC_res + LPC_order;

View file

@ -71,11 +71,12 @@ void silk_scale_vector32_Q26_lshift_18(
opus_int32 silk_inner_prod_aligned(
const opus_int16 *const inVec1, /* I input vector 1 */
const opus_int16 *const inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
const opus_int len, /* I vector lengths */
int arch /* I Run-time architecture */
)
{
#ifdef FIXED_POINT
return celt_inner_prod(inVec1, inVec2, len);
return celt_inner_prod(inVec1, inVec2, len, arch);
#else
opus_int i;
opus_int32 sum = 0;
@ -86,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
#endif
}
opus_int64 silk_inner_prod16_aligned_64(
opus_int64 silk_inner_prod16_aligned_64_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */

View file

@ -0,0 +1,375 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "SigProc_FIX.h"
#include "define.h"
#include "tuning_parameters.h"
#include "pitch.h"
#include "celt/x86/x86cpu.h"
/* Upper bound on subfr_length * nb_subfr, checked by the silk_assert() at the top of the function below */
#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
/* Q-domain in which the intermediate prediction coefficients Af_QA[] are kept */
#define QA 25
/* Headroom (in bits) targeted for C0 when the extra shift rshifts_extra is chosen */
#define N_BITS_HEAD_ROOM 2
/* Lower clamp applied to rshifts (a negative value corresponds to a left shift) */
#define MIN_RSHIFTS -16
/* Upper clamp applied to rshifts */
#define MAX_RSHIFTS (32 - QA)
/* Compute reflection coefficients from input signal (variant using SSE4.1 intrinsics).                        */
/*                                                                                                              */
/* Writes D prediction coefficients to A_Q16[] (Q16), the residual energy to *res_nrg and its Q value          */
/* (-rshifts) to *res_nrg_Q. The order recursion stops early once the inverse prediction gain would drop to    */
/* minInvGain_Q30 or below; the remaining coefficients are then set to zero.                                   */
/* Only the correlation/state updates of the "rshifts <= -2" branch are vectorized; every other path is        */
/* scalar code. arch is forwarded to silk_inner_prod16_aligned_64(), silk_inner_prod_aligned() and             */
/* celt_pitch_xcorr().                                                                                         */
void silk_burg_modified_sse4_1(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
const opus_int nb_subfr, /* I Number of subframes stacked in x */
const opus_int D, /* I Order */
int arch /* I Run-time architecture */
)
{
opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
const opus_int16 *x_ptr;
opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ];
opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ];
opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
/* Vector temporaries; each holds four 32-bit lanes */
__m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
/* Rounding constant used for the vectorized rounding right shift of Af_QA */
__m128i CONST1 = _mm_set1_epi32(1);
silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
/* Bring rshifts into [ MIN_RSHIFTS, MAX_RSHIFTS ] and rescale C0 accordingly */
if( rshifts > MAX_RSHIFTS ) {
C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
silk_assert( C0 > 0 );
rshifts = MAX_RSHIFTS;
} else {
lz = silk_CLZ32( C0 ) - 1;
rshifts_extra = N_BITS_HEAD_ROOM - lz;
if( rshifts_extra > 0 ) {
rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
C0 = silk_RSHIFT32( C0, rshifts_extra );
} else {
rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
C0 = silk_LSHIFT32( C0, -rshifts_extra );
}
rshifts += rshifts_extra;
}
/* NOTE(review): the same value is assigned again under "Initialize" below, before it is first read */
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
if( rshifts > 0 ) {
/* Correlations for lags 1..D via 64-bit inner products, right-shifted into Q( -rshifts ) */
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
int i;
opus_int32 d;
x_ptr = x + s * subfr_length;
/* Presumably fills xcorr[ 0..D-1 ] with the lag 1..D products over the first subfr_length - D samples; */
/* the loop below then adds the products for the remaining samples of each lag (TODO confirm against pitch.h) */
celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
for( n = 1; n < D + 1; n++ ) {
for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
xcorr[ n - 1 ] += d;
}
/* Scale into Q( -rshifts ); -rshifts >= 0 in this branch */
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
}
}
}
silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
/* Initialize */
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
invGain_Q30 = (opus_int32)1 << 30;
reached_max_gain = 0;
for( n = 0; n < D; n++ ) {
/* Update first row of correlation matrix (without first element) */
/* Update last row of correlation matrix (without last element, stored in reversed order) */
/* Update C * Af */
/* Update C * flipud(Af) (stored in reversed order) */
if( rshifts > -2 ) {
/* Scalar path */
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */
x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */
tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */
tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */
for( k = 0; k < n; k++ ) {
C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp_QA = Af_QA[ k ];
tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */
tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */
}
tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */
tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */
for( k = 0; k <= n; k++ ) {
CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( -rshift ) */
CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */
}
}
} else {
/* Vectorized path (rshifts <= -2): four values of k per iteration, scalar loops handle the remainder */
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */
x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */
tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */
tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */
X1_3210 = _mm_set1_epi32( x1 );
X2_3210 = _mm_set1_epi32( x2 );
/* Per-lane partial sums that are folded into tmp1 / tmp2 after the loop */
TMP1_3210 = _mm_setzero_si128();
TMP2_3210 = _mm_setzero_si128();
for( k = 0; k < n - 3; k += 4 ) {
/* OP_CVTEPI16_EPI32_M64 presumably loads four 16-bit samples and sign-extends them to 32 bits (see celt/x86/x86cpu.h) */
PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
/* Reverse the lane order so that lane j holds x_ptr[ n - ( k + j ) - 1 ] */
PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );
T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
/* ( ( a >> 7 ) + 1 ) >> 1: rounding right shift by 8 ( = QA - 17 ), intended to mirror the silk_RSHIFT_ROUND in the scalar tail below */
ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
_mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
_mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );
TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
}
/* Horizontal add: fold the upper 64 bits onto the lower, then lane 1 onto lane 0 */
TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
/* Scalar tail for the k values not covered by the vector loop */
for( ; k < n; k++ ) {
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
}
tmp1 = -tmp1; /* Q17 */
tmp2 = -tmp2; /* Q17 */
{
__m128i xmm_tmp1, xmm_tmp2;
__m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
__m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
xmm_tmp1 = _mm_set1_epi32( tmp1 );
xmm_tmp2 = _mm_set1_epi32( tmp2 );
/* Vectorized CAf / CAb update, four entries per iteration; loads and stores touch CAf[ k..k+3 ] with k + 3 <= n */
for( k = 0; k <= n - 3; k += 4 ) {
xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
/* Reverse the lane order so that lane j holds x_ptr[ n - ( k + j ) ] */
xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
/* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
/* _mm_mul_epi32 multiplies only the even 32-bit lanes (0 and 2) into 64-bit products, */
/* hence the separate x2x0 (lanes 0, 2) and x3x1 (lanes 1, 3 moved to even positions) operands */
xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
/* Position bits 16..47 of each product: in the low dword for lanes 0/2, in the high dword for lanes 1/3 */
xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
/* Mask 0xCC takes the odd 32-bit lanes from the x3x1 operand and the even lanes from x2x0 */
xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
/* X1_3210 / PTR_3210 are reused here as plain temporaries for CAf / CAb */
X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );
X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
_mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
_mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
}
/* Scalar tail for the remaining entries up to and including k == n */
for( ; k <= n; k++ ) {
CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */
CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
}
}
}
}
/* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */
tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */
num = 0; /* Q( -rshifts ) */
nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */
for( k = 0; k < n; k++ ) {
Atmp_QA = Af_QA[ k ];
/* Left-shift the coefficient by lz bits (at most 32 - QA) before the silk_SMMUL calls; the result shift 32 - QA - lz compensates */
lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
lz = silk_min( 32 - QA, lz );
Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */
tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */
}
CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */
CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */
num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */
num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */
/* Calculate the next order reflection (parcor) coefficient */
if( silk_abs( num ) < nrg ) {
rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
} else {
/* Saturate when |num| >= nrg */
rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
}
/* Update inverse prediction gain */
tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
if( tmp1 <= minInvGain_Q30 ) {
/* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */
rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */
/* Newton-Raphson iteration */
rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */
rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */
if( num < 0 ) {
/* Ensure adjusted reflection coefficients has the original sign */
rc_Q31 = -rc_Q31;
}
invGain_Q30 = minInvGain_Q30;
reached_max_gain = 1;
} else {
invGain_Q30 = tmp1;
}
/* Update the AR coefficients */
for( k = 0; k < (n + 1) >> 1; k++ ) {
tmp1 = Af_QA[ k ]; /* QA */
tmp2 = Af_QA[ n - k - 1 ]; /* QA */
Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */
Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */
}
Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */
if( reached_max_gain ) {
/* Reached max prediction gain; set remaining coefficients to zero and exit loop */
for( k = n + 1; k < D; k++ ) {
Af_QA[ k ] = 0;
}
break;
}
/* Update C * Af and C * Ab */
for( k = 0; k <= n + 1; k++ ) {
tmp1 = CAf[ k ]; /* Q( -rshifts ) */
tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */
CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */
CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */
}
}
if( reached_max_gain ) {
/* Early-exit case: the residual energy is approximated from invGain_Q30 and C0 */
for( k = 0; k < D; k++ ) {
/* Scale coefficients */
A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
}
/* Subtract energy of preceding samples from C0 */
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
}
}
/* Approximate residual energy */
*res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
*res_nrg_Q = -rshifts;
} else {
/* Return residual energy */
nrg = CAf[ 0 ]; /* Q( -rshifts ) */
tmp1 = (opus_int32)1 << 16; /* Q16 */
for( k = 0; k < D; k++ ) {
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */
nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */
tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */
A_Q16[ k ] = -Atmp1;
}
*res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
*res_nrg_Q = -rshifts;
}
}

View file

@ -0,0 +1,160 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
/* Warped LPC analysis filter (fixed point), SSE4.1 variant.                                      */
/*                                                                                                */
/* Produces res_Q2[ n ] for n = 0 .. length - 1 from input[ n ] and the running filter state,     */
/* updating state[] in place. When order == 10 and lambda_Q16 == 0 a vectorised path is taken;    */
/* every other configuration runs the generic scalar loop at the bottom of the function.          */
void silk_warped_LPC_analysis_filter_FIX_sse4_1(
    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
    const opus_int16            input[],                    /* I    Input signal [length]               */
    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
    const opus_int              length,                     /* I    Length of input signal              */
    const opus_int              order                       /* I    Filter order (even)                 */
)
{
    opus_int     n, i;
    opus_int32   acc_Q11, tmp1, tmp2;

    /* Order must be even */
    silk_assert( ( order & 1 ) == 0 );

    /* Fast path. With lambda_Q16 == 0 every silk_SMLAWB( x, y, lambda_Q16 ) of the generic loop  */
    /* reduces to x, so each section is a one-sample delay and the accumulator becomes a plain    */
    /* 10-tap dot product of the state with coef_Q13.                                             */
    if( order == 10 && lambda_Q16 == 0 )
    {
        /* Suffix digits give the element held in lanes 3..0, e.g. _3210 has element 0 in lane 0  */
        __m128i coef_Q13_3210, coef_Q13_7654;
        __m128i coef_Q13_0123, coef_Q13_4567;
        __m128i state_0123, state_4567;
        __m128i xmm_product1, xmm_product2;
        __m128i xmm_tempa, xmm_tempb;

        opus_int32 sum;
        opus_int32 state_8, state_9, state_a;
        opus_int64 coef_Q13_8, coef_Q13_9;

        silk_assert( length > 0 );

        coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
        coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );

        /* Lane-reversed copies: _mm_mul_epi32 only multiplies lanes 0 and 2, so the odd-indexed  */
        /* taps are reached through the reversed ordering.                                        */
        coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
        coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );

        coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
        coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];

        state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
        state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );

        /* The state is kept lane-reversed inside the loop so that the per-sample shift can be    */
        /* done with _mm_alignr_epi8.                                                             */
        state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
        state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );

        state_8 = state[ 8 ];
        state_9 = state[ 9 ];
        /* Seed from the stored value (not 0): it is overwritten on the first iteration, and if   */
        /* length is 0 in a build where silk_assert() is compiled out, state[ 10 ] is written     */
        /* back unchanged instead of being cleared, matching the scalar loop below.               */
        state_a = state[ 10 ];

        for( n = 0; n < length; n++ )
        {
            /* Taps 3, 1 and 7, 5 */
            xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
            xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );

            /* Undo the lane reversal to reach taps 0, 2 and 4, 6 */
            xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
            xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );

            /* Only the low 32 bits of each 64-bit lane are summed afterwards */
            xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
            xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );

            xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
            xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );

            xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
            xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );

            xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
            xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
            xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );

            /* Taps 8 and 9 are handled in scalar code */
            sum  = (coef_Q13_8 * state_8) >> 16;
            sum += (coef_Q13_9 * state_9) >> 16;

            /* Fold lane 2 into lane 0 and extract the vector part of the sum */
            xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
            sum += _mm_cvtsi128_si32( xmm_tempa);
            /* 5 == order >> 1, the accumulator start value used by the scalar loop */
            res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);

            /* move right */
            state_a = state_9;
            state_9 = state_8;
            state_8 = _mm_cvtsi128_si32( state_4567 );
            state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );

            /* New sample enters as state[ 0 ] */
            state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
        }

        /* Restore natural lane order and write the state back */
        _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
        _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
        state[ 8 ] = state_8;
        state[ 9 ] = state_9;
        state[ 10 ] = state_a;

        return;
    }

    /* Generic scalar path */
    for( n = 0; n < length; n++ ) {
        /* Output of lowpass section */
        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
        /* Output of allpass section */
        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
        state[ 1 ] = tmp2;
        acc_Q11 = silk_RSHIFT( order, 1 );
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
        /* Loop over allpass sections */
        for( i = 2; i < order; i += 2 ) {
            /* Output of allpass section */
            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
            state[ i ] = tmp1;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
            /* Output of allpass section */
            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
            state[ i + 1 ] = tmp2;
            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
        }
        state[ order ] = tmp1;
        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
    }
}

View file

@ -0,0 +1,88 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "SigProc_FIX.h"
#include "pitch.h"
/* 64-bit inner product of two 16-bit vectors, SSE4.1 variant.                       */
/* Blocks of 8 elements are processed with SIMD; the remaining len % 8 elements are  */
/* accumulated in scalar code.                                                       */
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
    const opus_int16            *inVec1,            /*    I input vector 1                  */
    const opus_int16            *inVec2,            /*    I input vector 2                  */
    const opus_int              len                 /*    I vector lengths                  */
)
{
    /* len rounded down to a multiple of 8: the part covered by the vector loop */
    const opus_int vec_len = len & ~7;
    opus_int       pos;
    opus_int64     result = 0;
    __m128i        sum_lo = _mm_setzero_si128();   /* 64-bit partial sums from lanes 0..1 */
    __m128i        sum_hi = _mm_setzero_si128();   /* 64-bit partial sums from lanes 2..3 */
    __m128i        total;

    for( pos = 0; pos < vec_len; pos += 8 ) {
        const __m128i a = _mm_loadu_si128( (__m128i *)( &inVec1[ pos ] ) );
        const __m128i b = _mm_loadu_si128( (__m128i *)( &inVec2[ pos ] ) );
        /* Four 32-bit sums of adjacent 16x16 products. A pair sum wraps around only */
        /* when all 4 operands feeding it are -32768 (0x8000).                        */
        const __m128i pair_sums = _mm_madd_epi16( a, b );
        /* Upper two 32-bit lanes moved down (equivalent to an 8-byte right shift) */
        const __m128i upper     = _mm_shuffle_epi32( pair_sums, _MM_SHUFFLE( 0, 0, 3, 2 ) );

        /* Sign-extend to 64 bit before accumulating */
        sum_lo = _mm_add_epi64( sum_lo, _mm_cvtepi32_epi64( pair_sums ) );
        sum_hi = _mm_add_epi64( sum_hi, _mm_cvtepi32_epi64( upper ) );
    }

    /* Horizontal reduction of the four 64-bit partial sums into lane 0 */
    total = _mm_add_epi64( sum_lo, sum_hi );
    total = _mm_add_epi64( total, _mm_shuffle_epi32( total, _MM_SHUFFLE( 0, 0, 3, 2 ) ) );
    _mm_storel_epi64( (__m128i *)&result, total );

    /* Scalar tail; pos continues from where the vector loop stopped */
    while( pos < len ) {
        result = silk_SMLABB( result, inVec1[ pos ], inVec2[ pos ] );
        pos++;
    }

    return result;
}

View file

@ -47,7 +47,7 @@ void silk_encode_do_VAD_FLP(
/****************************/
/* Voice Activity Detection */
/****************************/
silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1 );
silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch );
/**************************************************/
/* Convert speech activity into VAD and DTX flags */

View file

@ -67,7 +67,8 @@ void silk_find_pred_coefs_FLP(
/* Quantize LTP gain parameters */
silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
&psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr );
&psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr,
psEnc->sCmn.arch );
/* Control LTP scaling */
silk_LTP_scale_ctrl_FLP( psEnc, psEncCtrl, condCoding );

View file

@ -205,7 +205,8 @@ void silk_quant_LTP_gains_FLP(
const silk_float W[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* I Error weights */
const opus_int mu_Q10, /* I Mu value (R/D tradeoff) */
const opus_int lowComplexity, /* I Flag for low complexity */
const opus_int nb_subfr /* I number of subframes */
const opus_int nb_subfr, /* I number of subframes */
int arch /* I Run-time architecture */
);
/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */

View file

@ -161,10 +161,10 @@ void silk_NSQ_wrapper_FLP(
/* Call NSQ */
if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) {
silk_NSQ_del_dec( &psEnc->sCmn, psNSQ, psIndices, x_Q3, pulses, PredCoef_Q12[ 0 ], LTPCoef_Q14,
AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14 );
AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14, psEnc->sCmn.arch );
} else {
silk_NSQ( &psEnc->sCmn, psNSQ, psIndices, x_Q3, pulses, PredCoef_Q12[ 0 ], LTPCoef_Q14,
AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14 );
AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14, psEnc->sCmn.arch );
}
}
@ -179,7 +179,8 @@ void silk_quant_LTP_gains_FLP(
const silk_float W[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* I Error weights */
const opus_int mu_Q10, /* I Mu value (R/D tradeoff) */
const opus_int lowComplexity, /* I Flag for low complexity */
const opus_int nb_subfr /* I number of subframes */
const opus_int nb_subfr, /* I number of subframes */
int arch /* I Run-time architecture */
)
{
opus_int i;
@ -193,7 +194,7 @@ void silk_quant_LTP_gains_FLP(
W_Q18[ i ] = (opus_int32)silk_float2int( W[ i ] * 262144.0f );
}
silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, W_Q18, mu_Q10, lowComplexity, nb_subfr );
silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, W_Q18, mu_Q10, lowComplexity, nb_subfr, arch );
for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
B[ i ] = (silk_float)B_Q14[ i ] * ( 1.0f / 16384.0f );

View file

@ -35,19 +35,39 @@ POSSIBILITY OF SUCH DAMAGE.
#include "opus_types.h"
#include "opus_defines.h"
/* Branch hints. When OPUS_GNUC_PREREQ(3, 0) holds, the condition is normalised to 0/1 and */
/* passed to __builtin_expect with the expected value; otherwise both macros reduce to the */
/* normalised condition and carry no hint.                                                 */
#if OPUS_GNUC_PREREQ(3, 0)
#define opus_likely(x) (__builtin_expect(!!(x), 1))
#define opus_unlikely(x) (__builtin_expect(!!(x), 0))
#else
#define opus_likely(x) (!!(x))
#define opus_unlikely(x) (!!(x))
#endif
/* This is an OPUS_INLINE header file for general platform. */
/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16; the result has to be a 32-bit int.       */
/* 64-bit targets use one widening multiply. The outer cast brings the expression back to  */
/* opus_int32 so that its type is the same as in the split 32-bit formulation below.       */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define silk_SMULWB(a32, b32) ((opus_int32)(((a32) * (opus_int64)((opus_int16)(b32))) >> 16))
#else
#define silk_SMULWB(a32, b32) ((((a32) >> 16) * (opus_int32)((opus_int16)(b32))) + ((((a32) & 0x0000FFFF) * (opus_int32)((opus_int16)(b32))) >> 16))
#endif
/* a32 + ((b32 * (opus_int32)((opus_int16)(c32))) >> 16); the result has to be a 32-bit int. */
/* 64-bit targets use one widening multiply. The outer cast brings the sum back to           */
/* opus_int32 so that the expression type matches the split 32-bit formulation below.        */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define silk_SMLAWB(a32, b32, c32) ((opus_int32)((a32) + (((b32) * (opus_int64)((opus_int16)(c32))) >> 16)))
#else
#define silk_SMLAWB(a32, b32, c32) ((a32) + ((((b32) >> 16) * (opus_int32)((opus_int16)(c32))) + ((((b32) & 0x0000FFFF) * (opus_int32)((opus_int16)(c32))) >> 16)))
#endif
/* (a32 * (b32 >> 16)) >> 16 */
/* Multiplies a32 by the top 16 bits of b32; a32 is split into its high and low 16-bit */
/* halves and the two partial products are combined.                                    */
#define silk_SMULWT(a32, b32) (((a32) >> 16) * ((b32) >> 16) + ((((a32) & 0x0000FFFF) * ((b32) >> 16)) >> 16))
/* a32 + ((b32 * (c32 >> 16)) >> 16) */
/* 64-bit targets use one widening multiply. The outer cast brings the sum back to     */
/* opus_int32 so that the expression type matches the split 32-bit formulation below.  */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define silk_SMLAWT(a32, b32, c32) ((opus_int32)((a32) + (((b32) * ((opus_int64)(c32) >> 16)) >> 16)))
#else
#define silk_SMLAWT(a32, b32, c32) ((a32) + (((b32) >> 16) * ((c32) >> 16)) + ((((b32) & 0x0000FFFF) * ((c32) >> 16)) >> 16))
#endif
/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)); the result has to be a 32-bit int. */
/* Both operands are first truncated to their low 16 bits (as signed values).                            */
#define silk_SMULBB(a32, b32) ((opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)))
@ -65,10 +85,18 @@ POSSIBILITY OF SUCH DAMAGE.
/* a64 + (b32 * c32): both factors are widened to 64 bit before multiplying, and the */
/* product is added to a64 through silk_ADD64.                                        */
#define silk_SMLAL(a64, b32, c32) (silk_ADD64((a64), ((opus_int64)(b32) * (opus_int64)(c32))))
/* (a32 * b32) >> 16 */
/* 64-bit targets use one widening multiply. The shifted product can need more than 32  */
/* bits, so the outer cast is what makes the expression an opus_int32 again, as the     */
/* 32-bit formulation below already is.                                                  */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define silk_SMULWW(a32, b32) ((opus_int32)(((opus_int64)(a32) * (b32)) >> 16))
#else
#define silk_SMULWW(a32, b32) silk_MLA(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16))
#endif
/* a32 + ((b32 * c32) >> 16) */
/* 64-bit targets use one widening multiply. The outer cast brings the sum back to     */
/* opus_int32, which is the type the 32-bit formulation below already produces.        */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define silk_SMLAWW(a32, b32, c32) ((opus_int32)((a32) + (((opus_int64)(b32) * (c32)) >> 16)))
#else
#define silk_SMLAWW(a32, b32, c32) silk_MLA(silk_SMLAWB((a32), (b32), (c32)), (b32), silk_RSHIFT_ROUND((c32), 16))
#endif
/* add/subtract with output saturated */
#define silk_ADD_SAT32(a, b) ((((opus_uint32)(a) + (opus_uint32)(b)) & 0x80000000) == 0 ? \

View file

@ -38,6 +38,10 @@ POSSIBILITY OF SUCH DAMAGE.
#include "entenc.h"
#include "entdec.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/main_sse.h"
#endif
/* Convert Left/Right stereo signal to adaptive Mid/Side representation */
void silk_stereo_LR_to_MS(
stereo_enc_state *state, /* I/O State */
@ -208,11 +212,12 @@ void silk_quant_LTP_gains(
const opus_int32 W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ], /* I Error Weights in Q18 */
opus_int mu_Q9, /* I Mu value (R/D tradeoff) */
opus_int lowComplexity, /* I Flag for low complexity */
const opus_int nb_subfr /* I number of subframes */
const opus_int nb_subfr, /* I number of subframes */
int arch /* I Run-time architecture */
);
/* Entropy constrained matrix-weighted VQ, for a single input data vector */
void silk_VQ_WMat_EC(
void silk_VQ_WMat_EC_c(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
@ -226,10 +231,18 @@ void silk_VQ_WMat_EC(
opus_int L /* I number of vectors in codebook */
);
/* Default dispatch for silk_VQ_WMat_EC(): unless OVERRIDE_silk_VQ_WMat_EC is already defined, */
/* the call maps onto silk_VQ_WMat_EC_c(). The trailing arch argument is evaluated and         */
/* discarded; it is not forwarded to the C implementation.                                      */
#if !defined(OVERRIDE_silk_VQ_WMat_EC)
#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L, arch) \
((void)(arch),silk_VQ_WMat_EC_c(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
#endif
/************************************/
/* Noise shaping quantization (NSQ) */
/************************************/
void silk_NSQ(
void silk_NSQ_c(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
@ -247,8 +260,15 @@ void silk_NSQ(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
/* Default dispatch for silk_NSQ(): unless OVERRIDE_silk_NSQ is already defined, the call maps */
/* onto silk_NSQ_c(). The trailing arch argument is evaluated and discarded; it is not         */
/* forwarded to the C implementation.                                                           */
#if !defined(OVERRIDE_silk_NSQ)
#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((void)(arch),silk_NSQ_c(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#endif
/* Noise shaping using delayed decision */
void silk_NSQ_del_dec(
void silk_NSQ_del_dec_c(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
@ -266,6 +286,13 @@ void silk_NSQ_del_dec(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
/* Default dispatch for silk_NSQ_del_dec(): unless OVERRIDE_silk_NSQ_del_dec is already defined, */
/* the call maps onto silk_NSQ_del_dec_c(). The trailing arch argument is evaluated and          */
/* discarded; it is not forwarded to the C implementation.                                        */
#if !defined(OVERRIDE_silk_NSQ_del_dec)
#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((void)(arch),silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
#endif
/************/
/* Silk VAD */
/************/
@ -275,11 +302,15 @@ opus_int silk_VAD_Init( /* O Return v
);
/* Get speech activity level in Q8 */
opus_int silk_VAD_GetSA_Q8( /* O Return value, 0 if success */
opus_int silk_VAD_GetSA_Q8_c( /* O Return value, 0 if success */
silk_encoder_state *psEncC, /* I/O Encoder state */
const opus_int16 pIn[] /* I PCM input */
);
/* Default dispatch for silk_VAD_GetSA_Q8(): unless OVERRIDE_silk_VAD_GetSA_Q8 is already defined, */
/* the call maps onto silk_VAD_GetSA_Q8_c(). The arch argument is evaluated and discarded.         */
#if !defined(OVERRIDE_silk_VAD_GetSA_Q8)
#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_c(psEnC, pIn))
#endif
/* Low-pass filter with variable cutoff frequency based on */
/* piece-wise linear interpolation between elliptic filters */
/* Start by setting transition_frame_no = 1; */
@ -373,7 +404,8 @@ opus_int silk_decode_frame(
opus_int16 pOut[], /* O Pointer to output speech frame */
opus_int32 *pN, /* O Pointer to size of output frame */
opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */
opus_int condCoding /* I The type of conditional coding to use */
opus_int condCoding, /* I The type of conditional coding to use */
int arch /* I Run-time architecture */
);
/* Decode indices from bitstream */
@ -397,7 +429,8 @@ void silk_decode_core(
silk_decoder_state *psDec, /* I/O Decoder state */
silk_decoder_control *psDecCtrl, /* I Decoder control */
opus_int16 xq[], /* O Decoded speech */
const opus_int16 pulses[ MAX_FRAME_LENGTH ] /* I Pulse signal */
const opus_int16 pulses[ MAX_FRAME_LENGTH ], /* I Pulse signal */
int arch /* I Run-time architecture */
);
/* Decode quantization indices of excitation (Shell coding) */
@ -435,4 +468,23 @@ void silk_encode_indices(
opus_int condCoding /* I The type of conditional coding to use */
);
/* C implementation of the warped LPC analysis filter. It is the target of the          */
/* silk_warped_LPC_analysis_filter_FIX() dispatch macro whenever                         */
/* OVERRIDE_silk_warped_LPC_analysis_filter_FIX is not defined.                          */
void silk_warped_LPC_analysis_filter_FIX_c(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
const opus_int16 input[], /* I Input signal [length] */
const opus_int16 lambda_Q16, /* I Warping factor */
const opus_int length, /* I Length of input signal */
const opus_int order /* I Filter order (even) */
);
/* Default dispatch for silk_warped_LPC_analysis_filter_FIX(): unless                        */
/* OVERRIDE_silk_warped_LPC_analysis_filter_FIX is already defined, the call maps onto       */
/* silk_warped_LPC_analysis_filter_FIX_c(). The arch argument is evaluated and discarded.    */
#if !defined(OVERRIDE_silk_warped_LPC_analysis_filter_FIX)
#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
#endif
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#endif
#endif

View file

@ -40,7 +40,8 @@ void silk_quant_LTP_gains(
const opus_int32 W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ], /* I Error Weights in Q18 */
opus_int mu_Q9, /* I Mu value (R/D tradeoff) */
opus_int lowComplexity, /* I Flag for low complexity */
const opus_int nb_subfr /* I number of subframes */
const opus_int nb_subfr, /* I number of subframes */
int arch /* I Run-time architecture */
)
{
opus_int j, k, cbk_size;
@ -90,7 +91,8 @@ void silk_quant_LTP_gains(
cl_ptr_Q5, /* I code length for each codebook vector */
mu_Q9, /* I tradeoff between weighted error and rate */
max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
cbk_size /* I number of vectors in codebook */
cbk_size, /* I number of vectors in codebook */
arch /* I Run-time architecture */
);
rate_dist_Q14 = silk_ADD_POS_SAT32( rate_dist_Q14, rate_dist_Q14_subfr );

857
silk/x86/NSQ_del_dec_sse.c Normal file
View file

@ -0,0 +1,857 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
#include "stack_alloc.h"
/* State of one candidate in the delayed-decision quantizer. silk_NSQ_del_dec_sse4_1()   */
/* allocates nStatesDelayedDecision of these, seeds them from the shared silk_nsq_state, */
/* and copies the winning candidate back at the end of the frame.                        */
typedef struct {
opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; /* first NSQ_LPC_BUF_LENGTH entries are seeded from NSQ->sLPC_Q14 */
opus_int32 RandState[ DECISION_DELAY ]; /* DECISION_DELAY-entry buffer */
opus_int32 Q_Q10[ DECISION_DELAY ]; /* read back as pulses; indexed modulo DECISION_DELAY via DECISION_DELAY_MASK */
opus_int32 Xq_Q14[ DECISION_DELAY ]; /* read back (after gain scaling) as xq output; same indexing */
opus_int32 Pred_Q15[ DECISION_DELAY ]; /* DECISION_DELAY-entry buffer */
opus_int32 Shape_Q14[ DECISION_DELAY ]; /* read back into NSQ->sLTP_shp_Q14; same indexing */
opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; /* copied from / back to NSQ->sAR2_Q14 */
opus_int32 LF_AR_Q14; /* copied from / back to NSQ->sLF_AR_shp_Q14 */
opus_int32 Seed; /* initialised to ( state index + psIndices->Seed ) & 3 */
opus_int32 SeedInit; /* copy of the initial Seed; the winner's value is written to psIndices->Seed */
opus_int32 RD_Q10; /* cost used to pick the winner (lowest wins); presumably rate-distortion */
} NSQ_del_dec_struct;
/* Per-sample candidate record used by silk_noise_shape_quantizer_del_dec_sse4_1().       */
/* NOTE(review): the fields are not accessed in the part of the file reviewed here; their */
/* meaning is presumably what the names suggest (quantized value, cost, output sample,    */
/* filter states) -- confirm against the quantizer body.                                  */
typedef struct {
opus_int32 Q_Q10;
opus_int32 RD_Q10;
opus_int32 xq_Q14;
opus_int32 LF_AR_Q14;
opus_int32 sLTP_shp_Q14;
opus_int32 LPC_exc_Q14;
} NSQ_sample_struct;
/* Two sample records grouped together; one pair is allocated per delayed-decision state. */
typedef NSQ_sample_struct NSQ_sample_pair[ 2 ];
/* Forward declaration (file-local). Called by silk_NSQ_del_dec_sse4_1() once per subframe, */
/* immediately before the noise shape quantizer.                                            */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
const opus_int32 x_Q3[], /* I Input in Q3 */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
opus_int subfr, /* I Subframe number */
opus_int nStatesDelayedDecision, /* I Number of del dec states */
const opus_int LTP_scale_Q14, /* I LTP state scaling */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
const opus_int signal_type, /* I Signal type */
const opus_int decisionDelay /* I Decision delay */
);
/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Forward declaration (file-local). Called by silk_NSQ_del_dec_sse4_1() once per subframe; */
/* smpl_buf_idx is passed by pointer so the buffer position carries over between calls.     */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
opus_int signalType, /* I Signal type */
const opus_int32 x_Q10[], /* I */
opus_int8 pulses[], /* O */
opus_int16 xq[], /* O */
opus_int32 sLTP_Q15[], /* I/O LTP filter state */
opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
const opus_int16 a_Q12[], /* I Short term prediction coefs */
const opus_int16 b_Q14[], /* I Long term prediction coefs */
const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int subfr, /* I Subframe number */
opus_int shapingLPCOrder, /* I Shaping LPC filter order */
opus_int predictLPCOrder, /* I Prediction filter order */
opus_int warping_Q16, /* I */
opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */
opus_int decisionDelay /* I */
);
/* Noise shaping quantization with delayed decision, SSE4.1 build of the frame-level driver. */
/* It sets up nStatesDelayedDecision candidate states, then for every subframe scales the    */
/* states and runs the per-subframe quantizer. At the end it copies the last decisionDelay   */
/* samples of the lowest-cost candidate to the outputs and to the persistent NSQ state.      */
void silk_NSQ_del_dec_sse4_1(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
opus_int16 *pxq;
VARDECL( opus_int32, sLTP_Q15 );
VARDECL( opus_int16, sLTP );
opus_int32 HarmShapeFIRPacked_Q14;
opus_int offset_Q10;
opus_int32 RDmin_Q10, Gain_Q10;
VARDECL( opus_int32, x_sc_Q10 );
VARDECL( opus_int32, delayedGain_Q10 );
VARDECL( NSQ_del_dec_struct, psDelDec );
NSQ_del_dec_struct *psDD;
SAVE_STACK;
/* Set unvoiced lag to the previous one, overwrite later for voiced */
lag = NSQ->lagPrev;
silk_assert( NSQ->prev_gain_Q16 != 0 );
/* Initialize delayed decision states */
ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
psDD = &psDelDec[ k ];
/* Each candidate starts from a different 2-bit seed derived from the frame seed */
psDD->Seed = ( k + psIndices->Seed ) & 3;
psDD->SeedInit = psDD->Seed;
psDD->RD_Q10 = 0;
/* Every candidate starts from the same persistent filter state */
psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
}
offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
smpl_buf_idx = 0; /* index of oldest samples */
/* The decision delay never exceeds DECISION_DELAY or one subframe */
decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
/* For voiced frames limit the decision delay to lower than the pitch lag */
if( psIndices->signalType == TYPE_VOICED ) {
for( k = 0; k < psEncC->nb_subfr; k++ ) {
decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
}
} else {
/* Otherwise the same limit is applied using the previous frame's lag, if there is one */
if( lag > 0 ) {
decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
}
}
/* An interpolation coefficient of 4 disables the flag */
if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
LSF_interpolation_flag = 0;
} else {
LSF_interpolation_flag = 1;
}
ALLOC( sLTP_Q15,
psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
/* Set up pointers to start of sub frame */
pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
subfr = 0;
for( k = 0; k < psEncC->nb_subfr; k++ ) {
/* Coefficient set index: k >> 1 when the interpolation flag is set, otherwise forced odd by the OR with 1 */
A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
/* Pack gain >> 2 into the low 16 bits and gain >> 1 into the high 16 bits */
HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
NSQ->rewhite_flag = 0;
if( psIndices->signalType == TYPE_VOICED ) {
/* Voiced */
lag = pitchL[ k ];
/* Re-whitening */
/* Mask is 1 with the interpolation flag set (every even k) and 3 without it (k multiple of 4) */
if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
if( k == 2 ) {
/* RESET DELAYED DECISIONS */
/* Find winner */
RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
Winner_ind = 0;
for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
RDmin_Q10 = psDelDec[ i ].RD_Q10;
Winner_ind = i;
}
}
/* Add a large cost to every other candidate */
for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
if( i != Winner_ind ) {
psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
}
}
/* Copy final part of signals from winner state to output and long-term filter states */
psDD = &psDelDec[ Winner_ind ];
last_smple_idx = smpl_buf_idx + decisionDelay;
for( i = 0; i < decisionDelay; i++ ) {
/* Step through the winner's DECISION_DELAY-sized buffers, wrapping with the mask */
last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;
/* pulses and pxq have already been advanced past earlier subframes, so the */
/* negative offsets address the tail of the previous subframe */
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
/* These samples belong to subframe 1, hence its gain */
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
}
/* Restart the subframe counter handed to the quantizer */
subfr = 0;
}
/* Rewhiten with new A coefs */
start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
silk_assert( start_idx > 0 );
silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
NSQ->rewhite_flag = 1;
}
}
silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
/* Advance input and output pointers to the next subframe */
x_Q3 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
/* Find winner */
RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
Winner_ind = 0;
for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
RDmin_Q10 = psDelDec[ k ].RD_Q10;
Winner_ind = k;
}
}
/* Copy final part of signals from winner state to output and long-term filter states */
psDD = &psDelDec[ Winner_ind ];
/* Report the seed the winning candidate started from */
psIndices->Seed = psDD->SeedInit;
last_smple_idx = smpl_buf_idx + decisionDelay;
/* Gain of the last subframe, reduced from Q16 to Q10 */
Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
for( i = 0; i < decisionDelay; i++ ) {
last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK;
/* pulses and pxq now point one past the frame, so negative offsets fill its tail */
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
}
/* Carry the winner's filter states over to the next frame */
silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
/* Update states */
NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
/* Save quantized speech signal */
/* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->frame_length * sizeof( opus_int16 ) ) */
/* Shift both history buffers down by one frame, keeping the last ltp_mem_length entries */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
RESTORE_STACK;
}
/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Delayed-decision noise shaping quantizer for one subframe, using SSE4.1   */
/* intrinsics for the long-term and short-term prediction dot products.      */
/*                                                                           */
/* For every input sample the function:                                      */
/*   1. computes the long-term prediction (voiced only) and the long-term    */
/*      shaping term, both shared by all delayed-decision states;            */
/*   2. for each state, computes the short-term prediction and the noise     */
/*      shaping feedback, derives two quantization candidates and stores     */
/*      them, lowest accumulated rate-distortion first, in psSampleState[k]; */
/*   3. selects the winning state, penalizes states whose RandState at       */
/*      last_smple_idx differs from the winner's, and optionally replaces    */
/*      the worst first candidate by the best second candidate;              */
/*   4. writes the winner's sample from decisionDelay samples ago to         */
/*      pulses[] / xq[] and the long-term buffers, then commits the first    */
/*      candidate of every state into psDelDec[].                            */
/* After the sample loop the last NSQ_LPC_BUF_LENGTH entries of each state's */
/* sLPC_Q14 buffer are moved to the front for the next subframe.             */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
opus_int signalType, /* I Signal type */
const opus_int32 x_Q10[], /* I */
opus_int8 pulses[], /* O */
opus_int16 xq[], /* O */
opus_int32 sLTP_Q15[], /* I/O LTP filter state */
opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
const opus_int16 a_Q12[], /* I Short term prediction coefs */
const opus_int16 b_Q14[], /* I Long term prediction coefs */
const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int subfr, /* I Subframe number */
opus_int shapingLPCOrder, /* I Shaping LPC filter order */
opus_int predictLPCOrder, /* I Prediction filter order */
opus_int warping_Q16, /* I */
opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */
opus_int decisionDelay /* I */
)
{
opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
opus_int32 Winner_rand_state;
opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
VARDECL( NSQ_sample_pair, psSampleState );
NSQ_del_dec_struct *psDD;
NSQ_sample_struct *psSS;
__m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
__m128i b_Q12_0123, b_sr_Q12_0123;
SAVE_STACK;
silk_assert( nStatesDelayedDecision > 0 );
/* One pair of candidate sample states per delayed-decision state */
ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
/* Load the prediction coefficients four at a time into vector registers. */
/* NOTE(review): OP_CVTEPI16_EPI32_M64 is defined outside this chunk; presumably it */
/* sign-extends four 16-bit values to four 32-bit lanes - confirm in x86cpu.h. */
a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
/* Coefficients 8..15 are only loaded (and only used below) for order 16 */
if( opus_likely( predictLPCOrder == 16 ) ) {
a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
}
/* The LTP coefficient vectors are only loaded (and only used below) for voiced frames */
if( signalType == TYPE_VOICED ){
b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
}
for( i = 0; i < length; i++ ) {
/* Perform common calculations used in all states */
/* Long-term prediction */
if( signalType == TYPE_VOICED ) {
/* Unrolled loop */
/* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
LTP_pred_Q14 = 2;
{
__m128i tmpa, tmpb, pred_lag_ptr_tmp;
/* Taps 0..3 are computed with SIMD; tap 4 is added with scalar code below */
pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
/* 0x1B reverses the four 32-bit lanes so lane n holds pred_lag_ptr[ -n ] */
pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
/* _mm_mul_epi32 multiplies lanes 0 and 2 only, giving two 64-bit products */
tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
/* Byte shift by 2 = 16 bits: leaves bits 16..47 of each product in lanes 0 and 2 */
tmpa = _mm_srli_si128( tmpa, 2 );
pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
/* Same multiply for the odd taps, now rotated into lanes 0 and 2 */
pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
/* Horizontal add: fold lane 2 onto lane 0, then read lane 0 */
tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
pred_lag_ptr++;
}
} else {
LTP_pred_Q14 = 0;
}
/* Long-term shaping */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
shp_lag_ptr++;
} else {
n_LTP_Q14 = 0;
}
{
__m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
for( k = 0; k < nStatesDelayedDecision; k++ ) {
/* Delayed decision state */
psDD = &psDelDec[ k ];
/* Sample state */
/* psSS[ 0 ] and psSS[ 1 ] are the two quantization candidates for state k */
psSS = psSampleState[ k ];
/* Generate dither */
psDD->Seed = silk_RAND( psDD->Seed );
/* Pointer used in short term prediction and shaping */
psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
/* Short-term prediction */
silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
/* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
/* tmpb accumulates the partial sums; only lanes 0 and 2 are meaningful */
tmpb = _mm_setzero_si128();
/* step 1 */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
/* Keep bits 16..47 of each 64-bit product in the low half of its 64-bit lane */
tmpa = _mm_srli_epi64( tmpa, 16 );
tmpb = _mm_add_epi32( tmpb, tmpa );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
/* step 2 */
/* Same pattern for coefficients 4..7 against psLPC_Q14[ -4 .. -7 ] */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
tmpa = _mm_srli_epi64( tmpa, 16 );
tmpb = _mm_add_epi32( tmpb, tmpa );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
if ( opus_likely( predictLPCOrder == 16 ) )
{
/* step 3 */
/* Coefficients 8..11 against psLPC_Q14[ -8 .. -11 ] */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
tmpa = _mm_srli_epi64( tmpa, 16 );
tmpb = _mm_add_epi32( tmpb, tmpa );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
/* step 4 */
/* Coefficients 12..15 against psLPC_Q14[ -12 .. -15 ] */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
tmpa = _mm_srli_epi64( tmpa, 16 );
tmpb = _mm_add_epi32( tmpb, tmpa );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
/* add at last */
/* equal shift right 8 bytes*/
tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
tmpb = _mm_add_epi32( tmpb, tmpa );
LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
}
else
{
/* add at last */
tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
tmpb = _mm_add_epi32( tmpb, tmpa );
LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
/* Order 10: the last two taps are added with scalar code */
LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
}
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
/* Noise shape feedback */
silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
/* Output of lowpass section */
tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ 0 ] = tmp2;
n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
/* Loop over allpass sections */
for( j = 2; j < shapingLPCOrder; j += 2 ) {
/* Output of allpass section */
tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
psDD->sAR2_Q14[ j - 1 ] = tmp1;
n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ j + 0 ] = tmp2;
n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
}
psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */
n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */
n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */
n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */
n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */
n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */
/* Input minus prediction plus noise feedback */
/* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
/* Flip sign depending on dither */
if ( psDD->Seed < 0 ) {
r_Q10 = -r_Q10;
}
r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
if( q1_Q0 > 0 ) {
q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
q2_Q10 = silk_ADD32( q1_Q10, 1024 );
rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
} else if( q1_Q0 == 0 ) {
q1_Q10 = offset_Q10;
q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
} else if( q1_Q0 == -1 ) {
q2_Q10 = offset_Q10;
q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
} else { /* q1_Q0 < -1 */
q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
q2_Q10 = silk_ADD32( q1_Q10, 1024 );
rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
}
/* Add the squared quantization error of each candidate to its rate term */
rr_Q10 = silk_SUB32( r_Q10, q1_Q10 );
rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
rr_Q10 = silk_SUB32( r_Q10, q2_Q10 );
rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
/* Store the candidate with the strictly lower RD in psSS[ 0 ], the other in psSS[ 1 ] */
if( rd1_Q10 < rd2_Q10 ) {
psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
psSS[ 0 ].Q_Q10 = q1_Q10;
psSS[ 1 ].Q_Q10 = q2_Q10;
} else {
psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
psSS[ 0 ].Q_Q10 = q2_Q10;
psSS[ 1 ].Q_Q10 = q1_Q10;
}
/* Update states for best quantization */
/* Quantized excitation */
exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
/* Undo the dither sign flip applied to the residual above */
if ( psDD->Seed < 0 ) {
exc_Q14 = -exc_Q14;
}
/* Add predictions */
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 0 ].xq_Q14 = xq_Q14;
/* Update states for second best quantization */
/* Quantized excitation */
exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
if ( psDD->Seed < 0 ) {
exc_Q14 = -exc_Q14;
}
/* Add predictions */
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 1 ].xq_Q14 = xq_Q14;
}
}
/* The per-state history buffers are circular; indices wrap with DECISION_DELAY_MASK */
*smpl_buf_idx = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK; /* Index to newest samples */
last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK; /* Index to decisionDelay old samples */
/* Find winner */
RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
Winner_ind = 0;
for( k = 1; k < nStatesDelayedDecision; k++ ) {
if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
Winner_ind = k;
}
}
/* Increase RD values of expired states */
/* A state is expired when its RandState at last_smple_idx differs from the winner's */
Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
for( k = 0; k < nStatesDelayedDecision; k++ ) {
if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
}
}
/* Find worst in first set and best in second set */
RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10;
RDmax_ind = 0;
RDmin_ind = 0;
for( k = 1; k < nStatesDelayedDecision; k++ ) {
/* find worst in first set */
if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
RDmax_ind = k;
}
/* find best in second set */
if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10;
RDmin_ind = k;
}
}
/* Replace a state if best from second set outperforms worst in first set */
if( RDmin_Q10 < RDmax_Q10 ) {
/* The copy starts i opus_int32 words into the struct and runs to its end. */
/* NOTE(review): presumably sLPC_Q14 is the first member of NSQ_del_dec_struct, so this */
/* skips its first i entries - confirm against the struct definition. */
silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
}
/* Write samples from winner to output and long-term filter states */
psDD = &psDelDec[ Winner_ind ];
/* In the first subframe nothing is written until decisionDelay samples have been processed */
if( subfr > 0 || i >= decisionDelay ) {
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ];
}
NSQ->sLTP_shp_buf_idx++;
NSQ->sLTP_buf_idx++;
/* Update states */
/* Commit the first candidate of every state at the newest circular-buffer slot */
for( k = 0; k < nStatesDelayedDecision; k++ ) {
psDD = &psDelDec[ k ];
psSS = &psSampleState[ k ][ 0 ];
psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14;
/* The seed advances by the rounded quantization index of the committed candidate */
psDD->Seed = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
psDD->RandState[ *smpl_buf_idx ] = psDD->Seed;
psDD->RD_Q10 = psSS->RD_Q10;
}
/* Remember the gain in effect for this sample; it is read back when the delayed sample is output */
delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
}
/* Update LPC states */
/* Move the entries starting at index length to the front of each state's sLPC_Q14 */
for( k = 0; k < nStatesDelayedDecision; k++ ) {
psDD = &psDelDec[ k ];
silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
}
RESTORE_STACK;
}
/* Prepare one subframe for the delayed-decision quantizer (SSE4.1 variant).  */
/*  - writes the input scaled by the inverse subframe gain to x_sc_Q10;       */
/*  - records the subframe gain in NSQ->prev_gain_Q16;                        */
/*  - after re-whitening, fills sLTP_Q15 from sLTP scaled by the inverse gain */
/*    (additionally scaled by LTP_scale_Q14 in subframe 0);                   */
/*  - when the gain differs from the previous one, rescales the long-term     */
/*    shaping/prediction buffers and every delayed-decision state by the      */
/*    ratio of previous to current gain.                                      */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
)
{
    opus_int            n, s, lag, n_end;
    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
    NSQ_del_dec_struct  *state;
    __m128i             v_scale, v_even, v_odd;

    lag          = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Gain adjustment factor: unity unless the gain changed since the previous call */
    gain_adj_Q16 = (opus_int32)1 << 16;
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    }

    /* Scale the input, four samples per iteration */
    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
    v_scale      = _mm_set1_epi32( inv_gain_Q23 );
    n_end        = psEncC->subfr_length;
    for( n = 0; n < n_end - 3; n += 4 ) {
        v_even = _mm_loadu_si128( (const __m128i *)( &x_Q3[ n ] ) );
        /* Rotate right by one lane so the odd samples sit in lanes 0 and 2 */
        v_odd  = _mm_shuffle_epi32( v_even, _MM_SHUFFLE( 0, 3, 2, 1 ) );
        /* 64-bit products of samples 0,2 and of samples 1,3 */
        v_even = _mm_mul_epi32( v_even, v_scale );
        v_odd  = _mm_mul_epi32( v_odd, v_scale );
        /* Bring bits 16..47 of each product into the low (even) or high (odd) 32-bit half */
        v_even = _mm_srli_epi64( v_even, 16 );
        v_odd  = _mm_slli_epi64( v_odd, 16 );
        /* Interleave: lanes 1 and 3 come from the odd products */
        v_even = _mm_blend_epi16( v_even, v_odd, 0xCC );
        _mm_storeu_si128( (__m128i *)( &x_sc_Q10[ n ] ), v_even );
    }
    /* Scalar tail for the remaining samples */
    while( n < n_end ) {
        x_sc_Q10[ n ] = silk_SMULWW( x_Q3[ n ], inv_gain_Q23 );
        n++;
    }

    /* Remember the gain used for this subframe */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

    /* The re-whitened LTP state is unscaled, so scale it with the inverse gain */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Apply the LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( n = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; n < NSQ->sLTP_buf_idx; n++ ) {
            silk_assert( n < MAX_FRAME_LENGTH );
            sLTP_Q15[ n ] = silk_SMULWB( inv_gain_Q31, sLTP[ n ] );
        }
    }

    /* Nothing left to do when the gain is unchanged */
    if( gain_adj_Q16 == (opus_int32)1 << 16 ) {
        return;
    }

    /* Rescale the long-term shaping state, four entries per iteration */
    v_scale = _mm_set1_epi32( gain_adj_Q16 );
    n_end   = NSQ->sLTP_shp_buf_idx;
    for( n = n_end - psEncC->ltp_mem_length; n < n_end - 3; n += 4 ) {
        v_even = _mm_loadu_si128( (__m128i *)( &NSQ->sLTP_shp_Q14[ n ] ) );
        v_odd  = _mm_shuffle_epi32( v_even, _MM_SHUFFLE( 0, 3, 2, 1 ) );
        v_even = _mm_mul_epi32( v_even, v_scale );
        v_odd  = _mm_mul_epi32( v_odd, v_scale );
        v_even = _mm_srli_epi64( v_even, 16 );
        v_odd  = _mm_slli_epi64( v_odd, 16 );
        v_even = _mm_blend_epi16( v_even, v_odd, 0xCC );
        _mm_storeu_si128( (__m128i *)( &NSQ->sLTP_shp_Q14[ n ] ), v_even );
    }
    while( n < n_end ) {
        NSQ->sLTP_shp_Q14[ n ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ n ] );
        n++;
    }

    /* Rescale the long-term prediction state (voiced frames without re-whitening only) */
    if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
        for( n = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; n < NSQ->sLTP_buf_idx - decisionDelay; n++ ) {
            sLTP_Q15[ n ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ n ] );
        }
    }

    /* Rescale every delayed-decision state */
    for( s = 0; s < nStatesDelayedDecision; s++ ) {
        state = &psDelDec[ s ];

        /* Scalar state */
        state->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, state->LF_AR_Q14 );

        /* Short-term prediction and shaping states */
        for( n = 0; n < NSQ_LPC_BUF_LENGTH; n++ ) {
            state->sLPC_Q14[ n ] = silk_SMULWW( gain_adj_Q16, state->sLPC_Q14[ n ] );
        }
        for( n = 0; n < MAX_SHAPE_LPC_ORDER; n++ ) {
            state->sAR2_Q14[ n ] = silk_SMULWW( gain_adj_Q16, state->sAR2_Q14[ n ] );
        }

        /* Delayed prediction and shaping histories */
        for( n = 0; n < DECISION_DELAY; n++ ) {
            state->Pred_Q15[ n ]  = silk_SMULWW( gain_adj_Q16, state->Pred_Q15[ n ] );
            state->Shape_Q14[ n ] = silk_SMULWW( gain_adj_Q16, state->Shape_Q14[ n ] );
        }
    }
}

720
silk/x86/NSQ_sse.c Normal file
View file

@ -0,0 +1,720 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
#include "stack_alloc.h"
/* Forward declaration of the per-subframe state scaling helper that */
/* silk_NSQ_sse4_1() calls before quantizing each subframe. */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
const opus_int32 x_Q3[], /* I input in Q3 */
opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
opus_int subfr, /* I subframe number */
const opus_int LTP_scale_Q14, /* I */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
const opus_int signal_type /* I Signal type */
);
/* Forward declaration of the quantizer that silk_NSQ_sse4_1() selects when */
/* the shaping LPC order is 10 and the prediction LPC order is 16. */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
const opus_int32 x_sc_Q10[], /* I */
opus_int8 pulses[], /* O */
opus_int16 xq[], /* O */
opus_int32 sLTP_Q15[], /* I/O LTP state */
const opus_int16 a_Q12[], /* I Short term prediction coefs */
const opus_int16 b_Q14[], /* I Long term prediction coefs */
const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
);
/* Noise shaping quantizer entry point (SSE4.1 variant).                       */
/* Builds a 64-row lookup table of quantization candidates indexed by          */
/* level + 32 (level in -32..31), then processes the frame subframe by         */
/* subframe: optional re-whitening of the LTP state for voiced frames, state   */
/* scaling, and quantization. The SSE4.1 quantizer is used when the shaping    */
/* order is 10 and the prediction order is 16; otherwise the generic           */
/* silk_noise_shape_quantizer() is called. Finally the quantized signal and    */
/* the shaping buffer are shifted down by one frame.                           */
void silk_NSQ_sse4_1(
    const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
)
{
    opus_int            sf, level, lag, start_idx, LSF_interpolation_flag;
    const opus_int16    *coefA_Q12, *coefB_Q14, *coefShp_Q13;
    opus_int16          *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32          HarmShapeFIRPacked_Q14;
    opus_int            offset_Q10;
    VARDECL( opus_int32, x_sc_Q10 );
    opus_int32          table[ 64 ][ 4 ];
    opus_int32          base_Q10;
    opus_int32          lo_Q10, hi_Q10, rd_lo_Q20, rd_hi_Q20;
    SAVE_STACK;

    NSQ->rand_seed = psIndices->Seed;

    /* Unvoiced frames reuse the previous lag; voiced frames overwrite it per subframe */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

    /* Fill the candidate table; row 32 + level holds, for that level:          */
    /*   [0] lower candidate, [1] upper candidate, [2] 2 * (lower - upper),     */
    /*   [3] rate difference plus difference of the squared candidates.         */
    for( level = -32; level <= 31; level++ ) {
        if( level > 0 ) {
            base_Q10  = offset_Q10 + silk_LSHIFT( level, 10 );
            lo_Q10    = base_Q10 - QUANT_LEVEL_ADJUST_Q10;
            hi_Q10    = base_Q10 - QUANT_LEVEL_ADJUST_Q10 + 1024;
            rd_lo_Q20 = lo_Q10 * Lambda_Q10;
            rd_hi_Q20 = hi_Q10 * Lambda_Q10;
        } else if( level == 0 ) {
            lo_Q10    = offset_Q10;
            hi_Q10    = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
            rd_lo_Q20 = lo_Q10 * Lambda_Q10;
            rd_hi_Q20 = hi_Q10 * Lambda_Q10;
        } else if( level == -1 ) {
            lo_Q10    = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
            hi_Q10    = offset_Q10;
            rd_lo_Q20 = - lo_Q10 * Lambda_Q10;
            rd_hi_Q20 = hi_Q10 * Lambda_Q10;
        } else { /* level < -1 */
            base_Q10  = offset_Q10 + silk_LSHIFT( level, 10 );
            lo_Q10    = base_Q10 + QUANT_LEVEL_ADJUST_Q10;
            hi_Q10    = base_Q10 + QUANT_LEVEL_ADJUST_Q10 + 1024;
            rd_lo_Q20 = - lo_Q10 * Lambda_Q10;
            rd_hi_Q20 = - hi_Q10 * Lambda_Q10;
        }

        table[ 32 + level ][ 0 ] = lo_Q10;
        table[ 32 + level ][ 1 ] = hi_Q10;
        table[ 32 + level ][ 2 ] = 2 * (lo_Q10 - hi_Q10);
        table[ 32 + level ][ 3 ] = (rd_lo_Q20 - rd_hi_Q20) + (lo_Q10 * lo_Q10 - hi_Q10 * hi_Q10);
    }

    /* An interpolation coefficient of 4 means no LSF interpolation for this frame */
    LSF_interpolation_flag = ( psIndices->NLSFInterpCoef_Q2 == 4 ) ? 0 : 1;

    ALLOC( sLTP_Q15,
           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );

    /* Point the buffer indices and the output pointer at the first subframe */
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];

    for( sf = 0; sf < psEncC->nb_subfr; sf++ ) {
        /* Per-subframe coefficient sets */
        coefA_Q12   = &PredCoef_Q12[ (( sf >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
        coefB_Q14   = &LTPCoef_Q14[ sf * LTP_ORDER ];
        coefShp_Q13 = &AR2_Q13[ sf * MAX_SHAPE_LPC_ORDER ];

        /* Pack the harmonic shaping FIR: gain/4 in the low half-word, gain/2 in the high half-word */
        silk_assert( HarmShapeGain_Q14[ sf ] >= 0 );
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ sf ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ sf ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced: use this subframe's pitch lag */
            lag = pitchL[ sf ];

            /* Re-whiten the LTP state whenever a new set of A coefficients starts */
            if( ( sf & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                silk_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + sf * psEncC->subfr_length ],
                    coefA_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->rewhite_flag = 1;
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
            }
        }

        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, sf, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
        {
            /* Common case: table-driven SSE4.1 quantizer; &table[32] is the row for level 0 */
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, coefA_Q12, coefB_Q14,
                coefShp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ sf ], LF_shp_Q14[ sf ], Gains_Q16[ sf ],
                offset_Q10, psEncC->subfr_length, &(table[32]) );
        }
        else
        {
            /* Any other order combination goes through the generic quantizer */
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, coefA_Q12, coefB_Q14,
                coefShp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ sf ], LF_shp_Q14[ sf ], Gains_Q16[ sf ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder );
        }

        /* Advance input and output pointers to the next subframe */
        x_Q3   += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq    += psEncC->subfr_length;
    }

    /* The last subframe's lag becomes the default lag of the next frame */
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

    /* Shift the quantized signal and the shaping buffer down by one frame */
    /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[ psEncC->ltp_mem_length ], psEncC->frame_length * sizeof( opus_int16 ) ) */
    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    RESTORE_STACK;
}
/***********************************/
/* silk_noise_shape_quantizer_10_16 */
/***********************************/
/* SSE4.1 noise shaping quantizer for `length` samples, hard-wired to 16      */
/* short-term prediction coefficients (a_Q12[ 0..15 ]) and 10 noise shaping   */
/* coefficients (AR_shp_Q13[ 0..9 ]).                                         */
/*                                                                            */
/* Per sample it forms the short-term prediction, the long-term prediction    */
/* (voiced signals only), the noise shaping feedback, and the harmonic        */
/* shaping term (only when lag > 0); it then quantizes the dithered residual  */
/* through `table` and writes pulses[ i ]. NSQ->sLPC_Q14, NSQ->sLTP_shp_Q14,  */
/* NSQ->sAR2_Q14, NSQ->sLF_AR_shp_Q14, NSQ->rand_seed, both NSQ buffer        */
/* indices and sLTP_Q15 are updated. After the sample loop the new LPC        */
/* outputs are scaled by Gain_Q16 >> 6, rounded, saturated to 16 bits and     */
/* stored in xq[].                                                            */
/*                                                                            */
/* The 32-bit filter states are kept in registers as separate high and low    */
/* 16-bit halves so that each 32x16-bit product can be built from             */
/* _mm_madd_epi16 (high halves) and _mm_mulhi_epi16 (low halves).             */
/*                                                                            */
/* `table` is indexed with q1_Q0 = ( r_Q10 - offset_Q10 ) >> 10, which can be */
/* negative; the caller in this file passes &table[ 32 ].                     */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
const opus_int32 x_sc_Q10[], /* I Input scaled with 1/Gain */
opus_int8 pulses[], /* O Quantized pulses, one per sample */
opus_int16 xq[], /* O Gain-scaled output, saturated to 16 bits */
opus_int32 sLTP_Q15[], /* I/O LTP state */
const opus_int16 a_Q12[], /* I Short term prediction coefs */
const opus_int16 b_Q14[], /* I Long term prediction coefs */
const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I Two harmonic shaping taps packed in one word (bottom and top 16 bits) */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I Two low-frequency shaping taps packed in one word (bottom and top 16 bits) */
opus_int32 Gain_Q16, /* I Gain used to scale xq back to normal level */
opus_int offset_Q10, /* I Offset subtracted from the residual before the table lookup */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I Quantization table: [0],[1] candidate levels, [2],[3] decision terms */
)
{
opus_int i;
opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
__m128i xmm_tempa, xmm_tempb;
/* NOTE: xmm_one is reused: first as two different byte-shuffle masks, and only */
/* from the "prepare 1" line onwards as eight 16-bit ones.                      */
__m128i xmm_one;
__m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
__m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
__m128i a_Q12_01234567, a_Q12_89ABCDEF;
__m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
__m128i AR_shp_Q13_76543210;
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
/* Set up short term AR state */
psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
xq_Q14 = psLPC_Q14[ 0 ];
LTP_pred_Q13 = 0;
/* load a_Q12 */
/* Shuffle mask that reverses the order of the eight 16-bit words, so that */
/* a_Q12[ 0 ] ends up in word 7, where the newest sample is inserted below. */
xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
/* load a_Q12[0] - a_Q12[7] */
a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
/* load a_Q12[ 8 ] - a_Q12[ 15 ] */
a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
/* load AR_shp_Q13 */
/* Only AR_shp_Q13[ 0..7 ] fit in the register; taps 8 and 9 are applied with scalar code in the loop. */
AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
/* load psLPC_Q14 */
/* Shuffle mask that gathers the low 16-bit halves of the four 32-bit lanes into */
/* the low 64 bits and the high 16-bit halves into the high 64 bits.             */
xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
/* load sAR2_Q14 */
/* NSQ->sAR2_Q14[ 0..7 ] are split into halves the same way; entries 8 and 9 stay in memory. */
xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
/* prepare 1 in 8 * 16bit */
xmm_one = _mm_set1_epi16(1);
for( i = 0; i < length; i++ )
{
/* Short-term prediction */
__m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
/* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
/* shift psLPC_Q14 */
/* Slide the 16-sample history down by one 16-bit word and insert the two */
/* halves of the previous output xq_Q14 as the newest sample in word 7.   */
psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 );
/* high part, use pmaddwd, results in 4 32-bit */
xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
/* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
/* The low halves are unsigned but pmulhw treats them as signed: wherever a low */
/* half has its top bit set, the coefficient is added back as a correction.     */
xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
/* Multiply-add with ones: widens to 32 bits and sums adjacent 16-bit pairs */
xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
/* accumulate */
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
/* Horizontal sum of the four 32-bit lanes into lane 0 */
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
/* Long-term prediction */
if ( opus_likely( signalType == TYPE_VOICED ) ) {
/* Unrolled loop */
/* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
LTP_pred_Q13 = 2;
{
__m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
/* OP_CVTEPI16_EPI32_M64 is defined outside this chunk; presumably it loads */
/* b_Q14[ 0..3 ] and sign-extends them to 32 bits - TODO confirm.           */
b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
/* loaded: [0] [-1] [-2] [-3] */
pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
/* shuffle to [-3] [-2] [-1] [0] and to new xmm */
xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
/*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
/* right shift 2 bytes (16 bits), zero extended */
xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
/* a[1] * b[-1], a[3] * b[-3] */
pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
/* equal shift right 8 bytes*/
xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
/* Fifth tap is handled with scalar code */
LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
pred_lag_ptr++;
}
}
/* Noise shape feedback */
/* Age the scalar part of the state: old [ 8 ] moves to [ 9 ] and the entry */
/* about to be shifted out of the register (index 7) moves to [ 8 ].        */
NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
/* Shift the register state up by one word and insert xq_Q14 as entry 0 */
sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
/* high part, use pmaddwd, results in 4 32-bit */
xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
/* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
/* accumulate */
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
/* Start from 5 (half of the 10 shaping taps), then add the SIMD sum of taps 0..7 */
n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
/* Taps 8 and 9 use the state entries kept in memory */
n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 ); /* Q11 -> Q12 */
n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
silk_assert( lag > 0 || signalType != TYPE_VOICED );
/* Combine prediction and noise shaping signals */
tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
shp_lag_ptr++;
tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */
tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */
tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */
} else {
tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */
}
r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */
/* Generate dither */
NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
/* Flip sign depending on dither */
tmp2 = -r_Q10;
if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
/* q1_Q0 may be negative here; the two candidates and the decision terms come from the table row */
q1_Q10 = table[q1_Q0][0];
q2_Q10 = table[q1_Q0][1];
/* Pick the second candidate when the decision expression is negative */
if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
{
q1_Q10 = q2_Q10;
}
pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
/* Excitation */
exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
/* Undo the dither sign flip applied to the residual above */
tmp2 = -exc_Q14;
if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
/* Add predictions */
LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
/* Update states */
/* The memory copy of the LPC state is still written every sample; it is read */
/* back after the loop to produce xq[] and to update the LPC synth buffer.    */
psLPC_Q14++;
*psLPC_Q14 = xq_Q14;
sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
NSQ->sLTP_shp_buf_idx++;
NSQ->sLTP_buf_idx++;
/* Make dither dependent on quantized signal */
NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
}
NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
/* Scale XQ back to normal level before saving */
psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
/* write back sAR2_Q14 */
/* Re-interleave the low/high halves into 32-bit values for entries 0..7 */
xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
_mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
_mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
/* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
{
__m128i xmm_Gain_Q10;
__m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
/* prepare (1 << 7) in packed 4 32-bits */
xmm_tempa = _mm_set1_epi32( (1 << 7) );
/* prepare Gain_Q10 in packed 4 32-bits */
xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
/* process xq */
/* Eight samples per iteration; the scalar loop after this block handles the remainder */
for (i = 0; i < length - 7; i += 8)
{
xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
/* equal shift right 4 bytes*/
xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
/* equal shift right 4 bytes*/
xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
/* _mm_mul_epi32 multiplies only lanes 0 and 2, giving two 64-bit products per register */
xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
/* Even products >> 16 land in lanes 0/2, odd products << 16 put the same bits */
/* in lanes 1/3; the blend (mask 0xCC) then merges them into four results.     */
xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
/* silk_RSHIFT_ROUND(xq, 8) */
xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
/* silk_SAT16 */
xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
/* save to xq */
_mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
}
}
/* Scalar tail: i continues from where the vector loop stopped */
for ( ; i < length; i++)
{
xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
}
/* Update LPC synth buffer */
silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
}
/* Prepare the NSQ for sub-frame `subfr` (SSE4.1 version).                      */
/*                                                                              */
/* 1. Scales the input x_Q3 by the inverse of the sub-frame gain into x_sc_Q10. */
/* 2. Stores the sub-frame gain in NSQ->prev_gain_Q16.                          */
/* 3. If NSQ->rewhite_flag is set, rebuilds sLTP_Q15 from the re-whitened       */
/*    sLTP[] over the last ( lag + LTP_ORDER / 2 ) entries.                     */
/* 4. If the gain differs from the previous one, rescales the long-term         */
/*    shaping, long-term prediction, low-frequency shaping, LPC and AR2 states  */
/*    by previous gain / current gain.                                          */
/*                                                                              */
/* Only steps 1 and the long-term shaping part of step 4 are vectorised; each   */
/* vector loop handles four values at a time and is followed by a scalar loop   */
/* for the remainder.                                                           */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,           /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    const opus_int32    x_Q3[],                 /* I    input in Q3                         */
    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain            */
    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input     */
    opus_int            subfr,                  /* I    subframe number                     */
    const opus_int      LTP_scale_Q14,          /* I                                        */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                     */
    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                           */
    const opus_int      signal_type             /* I    Signal type                         */
)
{
    opus_int   n, pitch_lag, n_samples;
    opus_int   ltp_begin, ltp_end, shp_end;
    opus_int32 adj_Q16, invg_Q31, invg_Q23;
    __m128i    v_factor, v_even, v_odd;

    pitch_lag = pitchL[ subfr ];
    invg_Q31  = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( invg_Q31 != 0 );

    /* Ratio previous gain / current gain; exactly 1.0 (Q16) when the gain is unchanged */
    adj_Q16 = ( Gains_Q16[ subfr ] == NSQ->prev_gain_Q16 )
        ? ( (opus_int32)1 << 16 )
        : silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

    /* ---- Input scaling: x_sc_Q10[ n ] = ( x_Q3[ n ] * invg_Q23 ) >> 16 ---- */
    invg_Q23  = silk_RSHIFT_ROUND( invg_Q31, 8 );
    v_factor  = _mm_set1_epi32( invg_Q23 );
    n_samples = psEncC->subfr_length;

    for( n = 0; n < n_samples - 3; n += 4 ) {
        v_even = _mm_loadu_si128( (__m128i *)( &x_Q3[ n ] ) );
        /* Rotate lanes down by one so the odd-indexed values sit in lanes 0 and 2 */
        v_odd  = _mm_shuffle_epi32( v_even, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        /* 64-bit products of lanes 0 and 2 */
        v_even = _mm_mul_epi32( v_even, v_factor );
        v_odd  = _mm_mul_epi32( v_odd, v_factor );

        /* Bring bits 16..47 of each product to its destination 32-bit lane */
        v_even = _mm_srli_epi64( v_even, 16 );
        v_odd  = _mm_slli_epi64( v_odd, 16 );

        _mm_storeu_si128( (__m128i *)( &x_sc_Q10[ n ] ),
                          _mm_blend_epi16( v_even, v_odd, 0xCC ) );
    }
    while( n < n_samples ) {
        x_sc_Q10[ n ] = silk_SMULWW( x_Q3[ n ], invg_Q23 );
        n++;
    }

    /* Remember the gain used for this sub-frame */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

    /* Range of LTP state entries touched by the scalar LTP loops below */
    ltp_end   = NSQ->sLTP_buf_idx;
    ltp_begin = ltp_end - pitch_lag - LTP_ORDER / 2;

    /* ---- Fresh (re-whitened) LTP state: apply the inverse gain ---- */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* First sub-frame only: fold the LTP downscaling into the factor */
            invg_Q31 = silk_LSHIFT( silk_SMULWB( invg_Q31, LTP_scale_Q14 ), 2 );
        }
        for( n = ltp_begin; n < ltp_end; n++ ) {
            silk_assert( n < MAX_FRAME_LENGTH );
            sLTP_Q15[ n ] = silk_SMULWB( invg_Q31, sLTP[ n ] );
        }
    }

    /* Nothing else to do when the gain did not change */
    if( adj_Q16 == ( (opus_int32)1 << 16 ) ) {
        return;
    }

    /* ---- Long-term shaping state ---- */
    v_factor = _mm_set1_epi32( adj_Q16 );
    shp_end  = NSQ->sLTP_shp_buf_idx;

    for( n = shp_end - psEncC->ltp_mem_length; n < shp_end - 3; n += 4 ) {
        v_even = _mm_loadu_si128( (__m128i *)( &NSQ->sLTP_shp_Q14[ n ] ) );
        v_odd  = _mm_shuffle_epi32( v_even, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        v_even = _mm_mul_epi32( v_even, v_factor );
        v_odd  = _mm_mul_epi32( v_odd, v_factor );

        v_even = _mm_srli_epi64( v_even, 16 );
        v_odd  = _mm_slli_epi64( v_odd, 16 );

        _mm_storeu_si128( (__m128i *)( &NSQ->sLTP_shp_Q14[ n ] ),
                          _mm_blend_epi16( v_even, v_odd, 0xCC ) );
    }
    while( n < shp_end ) {
        NSQ->sLTP_shp_Q14[ n ] = silk_SMULWW( adj_Q16, NSQ->sLTP_shp_Q14[ n ] );
        n++;
    }

    /* ---- Long-term prediction state (already scaled above if re-whitened) ---- */
    if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
        for( n = ltp_begin; n < ltp_end; n++ ) {
            sLTP_Q15[ n ] = silk_SMULWW( adj_Q16, sLTP_Q15[ n ] );
        }
    }

    /* ---- Low-frequency shaping state ---- */
    NSQ->sLF_AR_shp_Q14 = silk_SMULWW( adj_Q16, NSQ->sLF_AR_shp_Q14 );

    /* ---- Short-term prediction state ---- */
    for( n = 0; n < NSQ_LPC_BUF_LENGTH; n++ ) {
        NSQ->sLPC_Q14[ n ] = silk_SMULWW( adj_Q16, NSQ->sLPC_Q14[ n ] );
    }

    /* ---- Noise shaping AR state ---- */
    for( n = 0; n < MAX_SHAPE_LPC_ORDER; n++ ) {
        NSQ->sAR2_Q14[ n ] = silk_SMULWW( adj_Q16, NSQ->sAR2_Q14[ n ] );
    }
}

View file

@ -0,0 +1,77 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SIGPROC_FIX_SSE_H
#define SIGPROC_FIX_SSE_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
/* SSE4.1 implementation of silk_burg_modified(); it has the same parameter list */
/* as the entries of the dispatch table declared below.                          */
void silk_burg_modified_sse4_1(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
const opus_int nb_subfr, /* I Number of subframes stacked in x */
const opus_int D, /* I Order */
int arch /* I Run-time architecture */
);
/* Table of silk_burg_modified() implementations with OPUS_ARCHMASK + 1 entries. */
/* It is only declared here (defined in another translation unit) and is indexed */
/* by the dispatch macro below with ( arch & OPUS_ARCHMASK ).                    */
extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
const opus_int nb_subfr, /* I Number of subframes stacked in x */
const opus_int D, /* I Order */
int arch /* I Run-time architecture */);
/* Run-time dispatch: calls the table entry selected by ( arch & OPUS_ARCHMASK ). */
/* `arch` is evaluated twice: once for the lookup and once as the last argument   */
/* forwarded to the selected implementation.                                      */
# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
/* SSE4.1 implementation of silk_inner_prod16_aligned_64(): takes two 16-bit */
/* vectors and a length, and returns a 64-bit result.                        */
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
);
/* Table of silk_inner_prod16_aligned_64() implementations with              */
/* OPUS_ARCHMASK + 1 entries; only declared here, defined in another         */
/* translation unit.                                                         */
extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len);
/* Run-time dispatch: `arch` is used only to select the table entry with     */
/* ( arch & OPUS_ARCHMASK ); it is not passed on to the implementation.      */
# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
#endif
#endif

277
silk/x86/VAD_sse.c Normal file
View file

@ -0,0 +1,277 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "stack_alloc.h"
/* Weighting factors for tilt measure: one weight per VAD band, indexed by the  */
/* band number. Each weight multiplies that band's SNR when the tilt is         */
/* accumulated; the first two entries are positive and the last two negative.   */
static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
/***************************************/
/* Get the speech activity level in Q8 */
/***************************************/
/* SSE4.1 version of the voice activity detector for one frame of PCM input.    */
/*                                                                              */
/* The frame is split into VAD_N_BANDS (4) bands with three cascaded analysis   */
/* filter bank calls, the lowest band is high-pass filtered in place, and the   */
/* per-band energies are accumulated over sub-frames. Only that energy          */
/* accumulation uses SIMD (eight 16-bit samples per step, with a scalar tail).  */
/*                                                                              */
/* Results written to the encoder state:                                        */
/*   psEncC->speech_activity_Q8, psEncC->input_tilt_Q15 and                     */
/*   psEncC->input_quality_bands_Q15[ 0..VAD_N_BANDS-1 ].                       */
/* VAD state updated in psEncC->sVAD: AnaState, AnaState1, AnaState2, HPstate,  */
/* XnrgSubfr[] and NrgRatioSmth_Q8[]; silk_VAD_GetNoiseLevels() is also called  */
/* on it (presumably updating the noise levels NL[] read below - defined        */
/* outside this chunk).                                                         */
/*                                                                              */
/* `ret` is initialised to 0 and never modified, so the function always         */
/* returns 0.                                                                   */
opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if success */
silk_encoder_state *psEncC, /* I/O Encoder state */
const opus_int16 pIn[] /* I PCM input */
)
{
opus_int SA_Q15, pSNR_dB_Q7, input_tilt;
opus_int decimated_framelength1, decimated_framelength2;
opus_int decimated_framelength;
opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
opus_int32 sumSquared, smooth_coef_Q16;
opus_int16 HPstateTmp;
VARDECL( opus_int16, X );
opus_int32 Xnrg[ VAD_N_BANDS ];
opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
opus_int32 speech_nrg, x_tmp;
opus_int X_offset[ VAD_N_BANDS ];
opus_int ret = 0;
silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
SAVE_STACK;
/* Safety checks */
silk_assert( VAD_N_BANDS == 4 );
silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
silk_assert( psEncC->frame_length <= 512 );
silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
/***********************/
/* Filter and Decimate */
/***********************/
/* frame_length / 2, / 4 and / 8 respectively */
decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
/* Decimate into 4 bands:
0 L 3L L 3L 5L
- -- - -- --
8 8 2 4 4
[0-1 kHz| temp. |1-2 kHz| 2-4 kHz | 4-8 kHz |
They're arranged to allow the minimal ( frame_length / 4 ) extra
scratch space during the downsampling process */
X_offset[ 0 ] = 0;
X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
/* Scratch buffer of X_offset[ 3 ] + frame_length / 2 samples holding all four bands */
ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
/* 0-8 kHz to 0-4 kHz and 4-8 kHz */
silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ],
X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
/* 0-4 kHz to 0-2 kHz and 2-4 kHz */
/* Note: X is passed both as the input and as the first output of this call */
silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
/* 0-2 kHz to 0-1 kHz and 1-2 kHz */
silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
/*********************************************/
/* HP filter on lowest band (differentiator) */
/*********************************************/
/* Every sample is halved and replaced by its difference with the (halved)      */
/* previous sample. The loop runs backwards so the update can be done in place; */
/* the halved last sample is saved as the state for the next call.              */
X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
HPstateTmp = X[ decimated_framelength - 1 ];
for( i = decimated_framelength - 1; i > 0; i-- ) {
X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 );
X[ i ] -= X[ i - 1 ];
}
/* The first sample uses the state saved by the previous call */
X[ 0 ] -= psSilk_VAD->HPstate;
psSilk_VAD->HPstate = HPstateTmp;
/*************************************/
/* Calculate the energy in each band */
/*************************************/
for( b = 0; b < VAD_N_BANDS; b++ ) {
/* Find the decimated framelength in the non-uniformly divided bands */
decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
/* Split length into subframe lengths */
dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
dec_subframe_offset = 0;
/* Compute energy per sub-frame */
/* initialize with summed energy of last subframe */
Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
__m128i xmm_X, xmm_acc;
sumSquared = 0;
xmm_acc = _mm_setzero_si128();
/* Vector part: eight samples per step, each shifted right by 3 before being */
/* squared; _mm_madd_epi16 sums adjacent squares into four 32-bit lanes.     */
for( i = 0; i < dec_subframe_length - 7; i += 8 )
{
xmm_X = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
xmm_X = _mm_srai_epi16( xmm_X, 3 );
xmm_X = _mm_madd_epi16( xmm_X, xmm_X );
xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
}
/* Horizontal sum of the four 32-bit lanes into lane 0 */
xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
sumSquared += _mm_cvtsi128_si32( xmm_acc );
/* Scalar tail for the samples left over by the vector loop */
for( ; i < dec_subframe_length; i++ ) {
/* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */
/* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */
x_tmp = silk_RSHIFT(
X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
/* Safety check */
silk_assert( sumSquared >= 0 );
}
/* Add/saturate summed energy of current subframe */
if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
} else {
/* Look-ahead subframe */
/* Only half of the last sub-frame's energy is added here */
Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
}
dec_subframe_offset += dec_subframe_length;
}
/* Keep the last sub-frame's energy; it seeds Xnrg[ b ] on the next call */
psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
}
/********************/
/* Noise estimation */
/********************/
silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
/***********************************************/
/* Signal-plus-noise to noise ratio estimation */
/***********************************************/
sumSquared = 0;
input_tilt = 0;
for( b = 0; b < VAD_N_BANDS; b++ ) {
speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
if( speech_nrg > 0 ) {
/* Divide, with sufficient resolution */
/* If the top 9 bits of the energy are clear, it can be shifted left by 8 */
/* before dividing; otherwise the noise level is shifted right instead.   */
if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
} else {
NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
}
/* Convert to log domain */
SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
/* Sum-of-squares */
sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */
/* Tilt measure */
if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
/* Scale down SNR value for small subband speech energies */
SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
}
input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
} else {
/* No speech energy above the noise level: ratio of 256, i.e. 1.0 in Q8 */
NrgToNoiseRatio_Q8[ b ] = 256;
}
}
/* Mean-of-squares */
sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
/* Root-mean-square approximation, scale to dBs, and write to output pointer */
pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
/*********************************/
/* Speech Probability Estimation */
/*********************************/
SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
/**************************/
/* Frequency Tilt Measure */
/**************************/
psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
/**************************************************/
/* Scale the sigmoid output based on power levels */
/**************************************************/
speech_nrg = 0;
for( b = 0; b < VAD_N_BANDS; b++ ) {
/* Accumulate signal-without-noise energies, higher frequency bands have more weight */
speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
}
/* Power scaling */
if( speech_nrg <= 0 ) {
/* No net speech energy: halve the activity estimate */
SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
} else if( speech_nrg < 32768 ) {
/* The energy is shifted up by one extra bit when frame_length == 10 * fs_kHz */
if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
} else {
speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
}
/* square-root */
speech_nrg = silk_SQRT_APPROX( speech_nrg );
SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
}
/* Copy the resulting speech activity in Q8 */
psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
/***********************************/
/* Energy Level and SNR estimation */
/***********************************/
/* Smoothing coefficient */
smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
/* Halve the smoothing coefficient when frame_length == 10 * fs_kHz */
if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
smooth_coef_Q16 >>= 1;
}
for( b = 0; b < VAD_N_BANDS; b++ ) {
/* compute smoothed energy-to-noise ratio per band */
psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
/* signal to noise ratio in dB per band */
SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
/* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
}
RESTORE_STACK;
return( ret );
}

142
silk/x86/VQ_WMat_EC_sse.c Normal file
View file

@ -0,0 +1,142 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector.
 *
 * Scans the L rows of cb_Q7 (LTP_ORDER entries per row) and, for every row k,
 * accumulates
 *     mu_Q9 * cl_Q5[ k ]                                   (weighted rate)
 *   + max( cb_gain_Q7[ k ] - max_gain_Q7, 0 ) << 10        (gain penalty)
 *   + sum_i diff[ i ] * ( W[ i ][ i ] * diff[ i ] + 2 * sum_{j>i} W[ i ][ j ] * diff[ j ] )
 * with diff = in_Q14 - ( cb_row_Q7 << 7 ). W_Q18 is indexed as a 5x5 row-major
 * matrix of which only the diagonal and the entries to its right are read.
 *
 * The row with the smallest total wins; ties keep the earliest row because the
 * comparison is strict. *rate_dist_Q14 is always written (it starts at
 * silk_int32_MAX); *ind and *gain_Q7 are written only when some row improves
 * on that value, so they are left untouched when L <= 0.
 */
void silk_VQ_WMat_EC_sse4_1(
    opus_int8                   *ind,               /* O    index of best codebook vector               */
    opus_int32                  *rate_dist_Q14,     /* O    best weighted quant error + mu * rate       */
    opus_int                    *gain_Q7,           /* O    sum of absolute LTP coefficients            */
    const opus_int16            *in_Q14,            /* I    input vector to be quantized                */
    const opus_int32            *W_Q18,             /* I    weighting matrix                            */
    const opus_int8             *cb_Q7,             /* I    codebook                                    */
    const opus_uint8            *cb_gain_Q7,        /* I    codebook effective gain                     */
    const opus_uint8            *cl_Q5,             /* I    code length for each codebook vector        */
    const opus_int              mu_Q9,              /* I    tradeoff betw. weighted error and rate      */
    const opus_int32            max_gain_Q7,        /* I    maximum sum of absolute LTP coefficients    */
    opus_int                    L                   /* I    number of vectors in codebook               */
)
{
    opus_int   k, gain_tmp_Q7;
    const opus_int8 *cb_row_Q7;
    opus_int16 diff_Q14[ 5 ];
    opus_int32 sum1_Q14, sum2_Q16;
    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;

    /* Loop over codebook */
    *rate_dist_Q14 = silk_int32_MAX;
    cb_row_Q7 = cb_Q7;
    for( k = 0; k < L; k++ ) {
        gain_tmp_Q7 = cb_gain_Q7[k];

        /* Element 0 is handled in scalar code; elements 1..4 go through SIMD */
        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );

        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
        /* The codebook holds 8-bit entries, so it must not go through the
           16-bit widening helper: that would fuse neighbouring bytes into one
           value and read past the four bytes that belong to this row. Build
           the vector from exactly cb_row_Q7[ 1..4 ]; each opus_int8 is
           promoted to int, which performs the sign extension. */
        C_tmp2 = _mm_set_epi32( cb_row_Q7[ 4 ], cb_row_Q7[ 3 ], cb_row_Q7[ 2 ], cb_row_Q7[ 1 ] );
        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );

        /* Low 16 bits of each 32-bit lane */
        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );

        /* Weighted rate */
        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );

        /* Penalty for too large gain */
        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );

        silk_assert( sum1_Q14 >= 0 );

        /* first row of W_Q18 */
        /* _mm_mul_epi32 multiplies lanes 0 and 2 into two 64-bit products;
           shifting the register right by 2 bytes leaves ( product >> 16 ) in
           the low 32 bits of each 64-bit half. */
        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );

        /* Rotate lanes 1 and 3 into positions 0 and 2 and repeat */
        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */

        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );

        /* Lane 0 = terms 1+2, lane 2 = terms 3+4; double the off-diagonal sum */
        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );

        /* Fold lane 2 into lane 0 and take lane 0 as the scalar result */
        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );

        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,   diff_Q14[ 0 ] );

        /* second row of W_Q18 */
        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );
        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );
        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );
        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );
        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );

        /* third row of W_Q18 */
        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );
        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );

        /* fourth row of W_Q18 */
        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );
        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );

        /* last row of W_Q18 */
        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );
        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );

        silk_assert( sum1_Q14 >= 0 );

        /* find best */
        if( sum1_Q14 < *rate_dist_Q14 ) {
            *rate_dist_Q14 = sum1_Q14;
            *ind = (opus_int8)k;
            *gain_Q7 = gain_tmp_Q7;
        }

        /* Go to next cbk vector */
        cb_row_Q7 += LTP_ORDER;
    }
}

228
silk/x86/main_sse.h Normal file
View file

@ -0,0 +1,228 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef MAIN_SSE_H
#define MAIN_SSE_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
/* SSE4.1 override for silk_VQ_WMat_EC: declares the SSE4.1 implementation,
   the per-architecture function table (defined in silk/x86/x86_silk_map.c),
   and a macro that calls the table entry selected by (arch & OPUS_ARCHMASK). */
/* NOTE(review): OVERRIDE_silk_VQ_WMat_EC presumably tells the generic header
   not to provide its own silk_VQ_WMat_EC -- confirm in main.h. */
# define OVERRIDE_silk_VQ_WMat_EC
/* SSE4.1 implementation; its parameter list matches the table entries below. */
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
);
/* Function table with OPUS_ARCHMASK + 1 entries, one per architecture index. */
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
);
/* Dispatch macro: the trailing `arch` argument only selects the table entry;
   it is not forwarded to the selected function. */
# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L, arch) \
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
/* SSE4.1 override for silk_NSQ: declares the SSE4.1 implementation, the
   per-architecture function table (defined in silk/x86/x86_silk_map.c), and a
   macro that calls the table entry selected by (arch & OPUS_ARCHMASK). */
/* NOTE(review): OVERRIDE_silk_NSQ presumably tells the generic header not to
   provide its own silk_NSQ -- confirm in main.h. */
# define OVERRIDE_silk_NSQ
/* SSE4.1 implementation; its parameter list matches the table entries below. */
void silk_NSQ_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State (pointer to const) */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
/* Function table with OPUS_ARCHMASK + 1 entries, one per architecture index. */
extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State (pointer to const) */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
/* Dispatch macro: the trailing `arch` argument only selects the table entry;
   it is not forwarded to the selected function. */
# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
/* SSE4.1 override for silk_NSQ_del_dec: declares the SSE4.1 implementation,
   the per-architecture function table (defined in silk/x86/x86_silk_map.c),
   and a macro that calls the table entry selected by (arch & OPUS_ARCHMASK).
   The parameter list is identical to the one used for silk_NSQ. */
/* NOTE(review): OVERRIDE_silk_NSQ_del_dec presumably tells the generic header
   not to provide its own silk_NSQ_del_dec -- confirm in main.h. */
# define OVERRIDE_silk_NSQ_del_dec
/* SSE4.1 implementation; its parameter list matches the table entries below. */
void silk_NSQ_del_dec_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State (pointer to const) */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
/* Function table with OPUS_ARCHMASK + 1 entries, one per architecture index. */
extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State (pointer to const) */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
/* Dispatch macro: the trailing `arch` argument only selects the table entry;
   it is not forwarded to the selected function. */
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
/* Prototype for silk_noise_shape_quantizer. Unlike the neighbouring entries,
   this header declares no _sse4_1 variant, dispatch table or macro for it.
   NOTE(review): presumably exported here so the SSE4.1 NSQ code can call the
   generic routine -- confirm against silk/NSQ.c and silk/x86/NSQ_sse.c. */
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
const opus_int32 x_sc_Q10[], /* I */
opus_int8 pulses[], /* O */
opus_int16 xq[], /* O */
opus_int32 sLTP_Q15[], /* I/O LTP state */
const opus_int16 a_Q12[], /* I Short term prediction coefs */
const opus_int16 b_Q14[], /* I Long term prediction coefs */
const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
opus_int lag, /* I Pitch lag */
opus_int32 HarmShapeFIRPacked_Q14, /* I */
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int shapingLPCOrder, /* I Noise shaping AR filter order */
opus_int predictLPCOrder /* I Prediction filter order */
);
/**************************/
/* Noise level estimation */
/**************************/
/* Prototype only: no _sse4_1 variant or dispatch table is declared for this
   function in this header.
   NOTE(review): presumably exported so the SSE4.1 VAD code can call it --
   confirm against silk/VAD.c and silk/x86/VAD_sse.c. */
void silk_VAD_GetNoiseLevels(
const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */
silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
);
/* SSE4.1 override for silk_VAD_GetSA_Q8: declares the SSE4.1 implementation,
   the per-architecture function table (defined in silk/x86/x86_silk_map.c),
   and a macro that calls the table entry selected by (arch & OPUS_ARCHMASK).
   The pieces are ordered prototype -> table -> macro, like every other
   override group in this header, so the table is declared before the macro
   that names it. */
/* NOTE(review): OVERRIDE_silk_VAD_GetSA_Q8 presumably tells the generic header
   not to provide its own silk_VAD_GetSA_Q8 -- confirm in main.h. */
# define OVERRIDE_silk_VAD_GetSA_Q8
/* SSE4.1 implementation; its parameter list matches the table entries below. */
opus_int silk_VAD_GetSA_Q8_sse4_1(
    silk_encoder_state *psEnC,
    const opus_int16 pIn[]
);
/* Function table with OPUS_ARCHMASK + 1 entries, one per architecture index. */
extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
    silk_encoder_state *psEnC,
    const opus_int16 pIn[]);
/* Dispatch macro: the trailing `arch` argument only selects the table entry;
   it is not forwarded to the selected function. */
# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
    ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
/* SSE4.1 override for silk_warped_LPC_analysis_filter_FIX: declares the SSE4.1
   implementation, the per-architecture function table (defined in
   silk/x86/x86_silk_map.c), and a macro that calls the table entry selected
   by (arch & OPUS_ARCHMASK). */
/* NOTE(review): OVERRIDE_silk_warped_LPC_analysis_filter_FIX presumably tells
   the generic header not to provide its own version -- confirm in the
   fixed-point headers. */
# define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
/* SSE4.1 implementation; its parameter list matches the table entries below. */
void silk_warped_LPC_analysis_filter_FIX_sse4_1(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
const opus_int16 input[], /* I Input signal [length] */
const opus_int16 lambda_Q16, /* I Warping factor */
const opus_int length, /* I Length of input signal */
const opus_int order /* I Filter order (even) */
);
/* Function table with OPUS_ARCHMASK + 1 entries, one per architecture index. */
extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
const opus_int16 input[], /* I Input signal [length] */
const opus_int16 lambda_Q16, /* I Warping factor */
const opus_int length, /* I Length of input signal */
const opus_int order /* I Filter order (even) */
);
/* Dispatch macro: the trailing `arch` argument only selects the table entry;
   it is not forwarded to the selected function. */
# define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
# endif
#endif

154
silk/x86/x86_silk_map.c Normal file
View file

@ -0,0 +1,154 @@
/* Copyright (c) 2014, Cisco Systems, INC
Written by XiangMingZhu WeiZhou MinPeng YanWang
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif
#include "celt/x86/x86cpu.h"
#include "structs.h"
#include "SigProc_FIX.h"
#include "pitch.h"
#include "main.h"
/* Per-architecture function table for silk_inner_prod16_aligned_64.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
) = {
silk_inner_prod16_aligned_64_c, /* non-sse */
silk_inner_prod16_aligned_64_c,
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
NULL
};
/* Per-architecture function table for silk_VAD_GetSA_Q8, indexed with
   (arch & OPUS_ARCHMASK) by the dispatch macro in silk/x86/main_sse.h.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
silk_encoder_state *psEncC,
const opus_int16 pIn[]
) = {
silk_VAD_GetSA_Q8_c, /* non-sse */
silk_VAD_GetSA_Q8_c,
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
NULL
};
/* Per-architecture function table for silk_NSQ, indexed with
   (arch & OPUS_ARCHMASK) by the dispatch macro in silk/x86/main_sse.h.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
const silk_encoder_state *psEncC, /* I Encoder State (pointer to const) */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
silk_NSQ_c, /* non-sse */
silk_NSQ_c,
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
NULL
};
/* Per-architecture function table for silk_VQ_WMat_EC, indexed with
   (arch & OPUS_ARCHMASK) by the dispatch macro in silk/x86/main_sse.h.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
const opus_int16 *in_Q14, /* I input vector to be quantized */
const opus_int32 *W_Q18, /* I weighting matrix */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
opus_int L /* I number of vectors in codebook */
) = {
silk_VQ_WMat_EC_c, /* non-sse */
silk_VQ_WMat_EC_c,
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
NULL
};
/* Per-architecture function table for silk_NSQ_del_dec, indexed with
   (arch & OPUS_ARCHMASK) by the dispatch macro in silk/x86/main_sse.h.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
const silk_encoder_state *psEncC, /* I Encoder State (pointer to const) */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int32 x_Q3[], /* I Prefiltered input signal */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
silk_NSQ_del_dec_c, /* non-sse */
silk_NSQ_del_dec_c,
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
NULL
};
/* Per-architecture function table for silk_warped_LPC_analysis_filter_FIX,
   indexed with (arch & OPUS_ARCHMASK) by the dispatch macro in
   silk/x86/main_sse.h.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
const opus_int16 coef_Q13[], /* I Coefficients [order] */
const opus_int16 input[], /* I Input signal [length] */
const opus_int16 lambda_Q16, /* I Warping factor */
const opus_int length, /* I Length of input signal */
const opus_int order /* I Filter order (even) */
) = {
silk_warped_LPC_analysis_filter_FIX_c, /* non-sse */
silk_warped_LPC_analysis_filter_FIX_c,
MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
NULL
};
/* Per-architecture function table for silk_burg_modified.
   Entries 0 and 1 are the C implementation, entry 2 is whatever
   MAY_HAVE_SSE4_1() selects, and entry 3 is NULL.
   Unlike the other tables in this file, the entry signature itself ends with
   an `arch` parameter, so the selected function also receives it.
   NOTE(review): MAY_HAVE_SSE4_1( f ) presumably expands to f_sse4_1 when
   SSE4.1 may be available and to f_c otherwise -- confirm in
   celt/x86/x86cpu.h.
   NOTE(review): a call through entry 3 would dereference NULL; presumably the
   run-time arch value never reaches 3 -- confirm against opus_select_arch(). */
void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
opus_int32 A_Q16[], /* O Prediction coefficients (length order) */
const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */
const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */
const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */
const opus_int nb_subfr, /* I Number of subframes stacked in x */
const opus_int D, /* I Order */
int arch /* I Run-time architecture */
) = {
silk_burg_modified_c, /* non-sse */
silk_burg_modified_c,
MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
NULL
};

View file

@ -6,6 +6,7 @@ silk/API.h \
silk/typedef.h \
silk/define.h \
silk/main.h \
silk/x86/main_sse.h \
silk/PLC.h \
silk/structs.h \
silk/tables.h \
@ -19,6 +20,7 @@ silk/resampler_private.h \
silk/resampler_rom.h \
silk/resampler_structs.h \
silk/SigProc_FIX.h \
silk/x86/SigProc_FIX_sse.h \
silk/arm/macros_armv4.h \
silk/arm/macros_armv5e.h \
silk/arm/SigProc_FIX_armv4.h \

View file

@ -76,6 +76,11 @@ silk/stereo_encode_pred.c \
silk/stereo_find_predictor.c \
silk/stereo_quant_pred.c
SILK_SOURCES_SSE4_1 = silk/x86/NSQ_sse.c \
silk/x86/NSQ_del_dec_sse.c \
silk/x86/x86_silk_map.c \
silk/x86/VAD_sse.c \
silk/x86/VQ_WMat_EC_sse.c
SILK_SOURCES_FIXED = \
silk/fixed/LTP_analysis_filter_FIX.c \
@ -104,6 +109,10 @@ silk/fixed/vector_ops_FIX.c \
silk/fixed/schur64_FIX.c \
silk/fixed/schur_FIX.c
SILK_SOURCES_FIXED_SSE4_1 = silk/fixed/x86/vector_ops_FIX_sse.c \
silk/fixed/x86/burg_modified_FIX_sse.c \
silk/fixed/x86/prefilter_FIX_sse.c
SILK_SOURCES_FLOAT = \
silk/float/apply_sine_window_FLP.c \
silk/float/corrMatrix_FLP.c \

View file

@ -75,6 +75,7 @@ struct OpusDecoder {
#endif
opus_uint32 rangeFinal;
int arch;
};
@ -131,6 +132,7 @@ int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels)
st->prev_mode = 0;
st->frame_size = Fs/400;
st->arch = opus_select_arch();
return OPUS_OK;
}
@ -375,7 +377,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
/* Call SILK decoder */
int first_frame = decoded_samples == 0;
silk_ret = silk_Decode( silk_dec, &st->DecControl,
lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size );
lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size, st->arch );
if( silk_ret ) {
if (lost_flag) {
/* PLC failure should not be fatal */

View file

@ -1450,7 +1450,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
if (float_api)
{
opus_val32 sum;
sum = celt_inner_prod(&pcm_buf[total_buffer*st->channels], &pcm_buf[total_buffer*st->channels], frame_size*st->channels);
sum = celt_inner_prod(&pcm_buf[total_buffer*st->channels], &pcm_buf[total_buffer*st->channels], frame_size*st->channels, st->arch);
/* This should filter out both NaNs and ridiculous signals that could
cause NaNs further down. */
if (!(sum < 1e9f) || celt_isnan(sum))