Optimize NSQ_del_dec() for AVX2

The optimization is bit-exact with the C function. It speeds up the SILK encoder (floating point) as follows. AMD Zen: Complexity 0-5: 0%; Complexity 6-7: 3 - 7%; Complexity 8-10: 8 - 15%. Intel Skylake: Complexity 0-5: 0%; Complexity 6-7: 14 - 18%; Complexity 8-10: 17 - 22%. Adapted by Jean-Marc Valin.
2023-11-17 23:58:19 -05:00 · 2023-11-17 23:58:19 -05:00 · 735c40706f
commit 735c40706f
parent 452aa95211
5 changed files with 1105 additions and 2 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -70,6 +70,7 @@ LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
 endif
 endif
 if HAVE_AVX2
+SILK_SOURCES += $(SILK_SOURCES_AVX2)
 CELT_SOURCES += $(CELT_SOURCES_AVX2)
 if ENABLE_DEEP_PLC
 LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
@ -425,6 +426,7 @@ endif

 if HAVE_AVX2
 AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
+           $(SILK_SOURCES_AVX2:.c=.lo) \
           $(DNN_SOURCES_AVX2:.c=.lo)
 $(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
 endif
--- a/silk/x86/NSQ_del_dec_avx2.c
+++ b/silk/x86/NSQ_del_dec_avx2.c
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@ -154,7 +154,33 @@ void silk_NSQ_del_dec_sse4_1(
    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );

-#  if defined OPUS_X86_PRESUME_SSE4_1
+void silk_NSQ_del_dec_avx2(
+    const silk_encoder_state *psEncC,                            /* I    Encoder State               */
+    silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
+    SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
+    const opus_int16 x16[],                                      /* I    Input                       */
+    opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
+    const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER],            /* I    Short term prediction coefs */
+    const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
+    const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
+    const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
+    const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
+    const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
+    const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
+    const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
+    const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
+    const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
+);
+
+#  if defined (OPUS_X86_PRESUME_AVX2)
+
+#   define OVERRIDE_silk_NSQ_del_dec
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#  elif defined (OPUS_X86_PRESUME_SSE4_1)

 #   define OVERRIDE_silk_NSQ_del_dec
 #   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@ -132,7 +132,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
  silk_NSQ_del_dec_c,
  silk_NSQ_del_dec_c,
  MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
-  MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */
+  MAY_HAVE_AVX2( silk_NSQ_del_dec )  /* avx */
 };

 #if defined(FIXED_POINT)
--- a/silk_sources.mk
+++ b/silk_sources.mk
@ -86,6 +86,9 @@ silk/x86/NSQ_del_dec_sse4_1.c \
 silk/x86/VAD_sse4_1.c \
 silk/x86/VQ_WMat_EC_sse4_1.c

+SILK_SOURCES_AVX2 =  \
+silk/x86/NSQ_del_dec_avx2.c
+
 SILK_SOURCES_ARM_RTCD = \
 silk/arm/arm_silk_map.c