Optimize NSQ_del_dec() for AVX2
The optimization is bit-exact with the C function. It speeds up the SILK encoder (floating point) as follows. AMD Zen: Complexity 0-5: 0%; Complexity 6-7: 3-7%; Complexity 8-10: 8-15%. Intel Skylake: Complexity 0-5: 0%; Complexity 6-7: 14-18%; Complexity 8-10: 17-22%. Adapted by Jean-Marc Valin.
This commit is contained in:
parent
452aa95211
commit
735c40706f
5 changed files with 1105 additions and 2 deletions
|
@ -70,6 +70,7 @@ LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
|
|||
endif
|
||||
endif
|
||||
if HAVE_AVX2
|
||||
SILK_SOURCES += $(SILK_SOURCES_AVX2)
|
||||
CELT_SOURCES += $(CELT_SOURCES_AVX2)
|
||||
if ENABLE_DEEP_PLC
|
||||
LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
|
||||
|
@ -425,6 +426,7 @@ endif
|
|||
|
||||
if HAVE_AVX2
|
||||
AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
|
||||
$(SILK_SOURCES_AVX2:.c=.lo) \
|
||||
$(DNN_SOURCES_AVX2:.c=.lo)
|
||||
$(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
|
||||
endif
|
||||
|
|
1072
silk/x86/NSQ_del_dec_avx2.c
Normal file
1072
silk/x86/NSQ_del_dec_avx2.c
Normal file
File diff suppressed because it is too large
Load diff
|
@ -154,7 +154,33 @@ void silk_NSQ_del_dec_sse4_1(
|
|||
const opus_int LTP_scale_Q14 /* I LTP state scaling */
|
||||
);
|
||||
|
||||
# if defined OPUS_X86_PRESUME_SSE4_1
|
||||
void silk_NSQ_del_dec_avx2(
|
||||
const silk_encoder_state *psEncC, /* I Encoder State */
|
||||
silk_nsq_state *NSQ, /* I/O NSQ state */
|
||||
SideInfoIndices *psIndices, /* I/O Quantization Indices */
|
||||
const opus_int16 x16[], /* I Input */
|
||||
opus_int8 pulses[], /* O Quantized pulse signal */
|
||||
const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER], /* I Short term prediction coefs */
|
||||
const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */
|
||||
const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */
|
||||
const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */
|
||||
const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */
|
||||
const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */
|
||||
const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */
|
||||
const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */
|
||||
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
|
||||
const opus_int LTP_scale_Q14 /* I LTP state scaling */
|
||||
);
|
||||
|
||||
# if defined (OPUS_X86_PRESUME_AVX2)
|
||||
|
||||
# define OVERRIDE_silk_NSQ_del_dec
|
||||
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
|
||||
((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
|
||||
|
||||
# elif defined (OPUS_X86_PRESUME_SSE4_1)
|
||||
|
||||
# define OVERRIDE_silk_NSQ_del_dec
|
||||
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
|
||||
|
|
|
@ -132,7 +132,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
|
|||
silk_NSQ_del_dec_c,
|
||||
silk_NSQ_del_dec_c,
|
||||
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
|
||||
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
|
||||
MAY_HAVE_AVX2( silk_NSQ_del_dec ) /* avx */
|
||||
};
|
||||
|
||||
#if defined(FIXED_POINT)
|
||||
|
|
|
@ -86,6 +86,9 @@ silk/x86/NSQ_del_dec_sse4_1.c \
|
|||
silk/x86/VAD_sse4_1.c \
|
||||
silk/x86/VQ_WMat_EC_sse4_1.c
|
||||
|
||||
SILK_SOURCES_AVX2 = \
|
||||
silk/x86/NSQ_del_dec_avx2.c
|
||||
|
||||
SILK_SOURCES_ARM_RTCD = \
|
||||
silk/arm/arm_silk_map.c
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue