From 43db56225b5716b61c16a3275f65b8d3d3ea5343 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Thu, 1 Sep 2016 14:20:57 -0700
Subject: [PATCH] Optimize silk_biquad_alt_stride2() for ARM NEON

The optimization is bit exact with C function.

Change-Id: Ifb8f04b19f2d576e79ce5dcfa7e0fc374d71d6c8

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
---
 silk/SigProc_FIX.h              |   7 +-
 silk/arm/arm_silk_map.c         |  14 +++
 silk/arm/biquad_alt_arm.h       |  68 ++++++++++++++
 silk/arm/biquad_alt_neon_intr.c | 156 ++++++++++++++++++++++++++++++++
 silk/biquad_alt.c               |   2 +-
 silk_headers.mk                 |   1 +
 silk_sources.mk                 |   1 +
 src/opus_encoder.c              |   7 +-
 8 files changed, 251 insertions(+), 5 deletions(-)
 create mode 100644 silk/arm/biquad_alt_arm.h
 create mode 100644 silk/arm/biquad_alt_neon_intr.c

diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index 43f9ffac..f9ae3263 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -48,6 +48,7 @@ extern "C"
 #endif
 
 #if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/biquad_alt_arm.h"
 #include "arm/LPC_inv_pred_gain_arm.h"
 #endif
 
@@ -109,7 +110,7 @@ void silk_biquad_alt_stride1(
     const opus_int32            len                 /* I     signal length (must be even)                               */
 );
 
-void silk_biquad_alt_stride2(
+void silk_biquad_alt_stride2_c(
     const opus_int16            *in,                /* I     input signal                                               */
     const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
     const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
@@ -158,6 +159,10 @@ void silk_ana_filt_bank_1(
     const opus_int32            N                   /* I    Number of input samples                                     */
 );
 
+#if !defined(OVERRIDE_silk_biquad_alt_stride2)
+#define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((void)(arch), silk_biquad_alt_stride2_c(in, B_Q28, A_Q28, S, out, len))
+#endif
+
 #if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
 #define silk_LPC_inverse_pred_gain(A_Q12, order, arch)     ((void)(arch), silk_LPC_inverse_pred_gain_c(A_Q12, order))
 #endif
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 7ac9ad38..53a60a03 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -37,6 +37,20 @@ POSSIBILITY OF SUCH DAMAGE.
 # if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
  !defined(OPUS_ARM_PRESUME_NEON_INTR))
 
+void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK + 1])(
+        const opus_int16            *in,                /* I     input signal                                               */
+        const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
+        const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
+        opus_int32                  *S,                 /* I/O   State vector [4]                                           */
+        opus_int16                  *out,               /* O     output signal                                              */
+        const opus_int32            len                 /* I     signal length (must be even)                               */
+) = {
+      silk_biquad_alt_stride2_c,              /* ARMv4 */
+      silk_biquad_alt_stride2_c,              /* EDSP */
+      silk_biquad_alt_stride2_c,              /* Media */
+      MAY_HAVE_NEON(silk_biquad_alt_stride2), /* Neon */
+};
+
 opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   Returns inverse prediction gain in energy domain, Q30        */
         const opus_int16            *A_Q12,                             /* I   Prediction coefficients, Q12 [order]                         */
         const opus_int              order                               /* I   Prediction order                                             */
diff --git a/silk/arm/biquad_alt_arm.h b/silk/arm/biquad_alt_arm.h
new file mode 100644
index 00000000..66ea9f43
--- /dev/null
+++ b/silk/arm/biquad_alt_arm.h
@@ -0,0 +1,68 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_BIQUAD_ALT_ARM_H
+# define SILK_BIQUAD_ALT_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void silk_biquad_alt_stride2_neon(
+    const opus_int16            *in,                /* I     input signal                                               */
+    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
+    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
+    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
+    opus_int16                  *out,               /* O     output signal                                              */
+    const opus_int32            len                 /* I     signal length (must be even)                               */
+);
+
+#  if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
+#   define OVERRIDE_silk_biquad_alt_stride2                   (1)
+#   define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((void)(arch), PRESUME_NEON(silk_biquad_alt_stride2)(in, B_Q28, A_Q28, S, out, len))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_silk_biquad_alt_stride2)
+/*Is run-time CPU detection enabled on this platform?*/
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const SILK_BIQUAD_ALT_STRIDE2_IMPL[OPUS_ARCHMASK+1])(
+        const opus_int16            *in,                /* I     input signal                                               */
+        const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
+        const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
+        opus_int32                  *S,                 /* I/O   State vector [4]                                           */
+        opus_int16                  *out,               /* O     output signal                                              */
+        const opus_int32            len                 /* I     signal length (must be even)                               */
+    );
+#   define OVERRIDE_silk_biquad_alt_stride2                  (1)
+#   define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((*SILK_BIQUAD_ALT_STRIDE2_IMPL[(arch)&OPUS_ARCHMASK])(in, B_Q28, A_Q28, S, out, len))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_silk_biquad_alt_stride2                  (1)
+#   define silk_biquad_alt_stride2(in, B_Q28, A_Q28, S, out, len, arch) ((void)(arch), silk_biquad_alt_stride2_neon(in, B_Q28, A_Q28, S, out, len))
+#  endif
+# endif
+
+#endif /* end SILK_BIQUAD_ALT_ARM_H */
diff --git a/silk/arm/biquad_alt_neon_intr.c b/silk/arm/biquad_alt_neon_intr.c
new file mode 100644
index 00000000..97157331
--- /dev/null
+++ b/silk/arm/biquad_alt_neon_intr.c
@@ -0,0 +1,156 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#ifdef OPUS_CHECK_ASM
+# include <string.h>
+# include "stack_alloc.h"
+#endif
+#include "SigProc_FIX.h"
+
+static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
+{
+    int32x4_t t_s32x4, out32_Q14_s32x4;
+
+    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} )                                      */
+    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                    */
+    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
+    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1}                                                             */
+    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 )                            */
+    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND();  S{2,3} = silk_RSHIFT_ROUND();           */
+    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 )                            */
+    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 );          */
+    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} )                                  */
+    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} );                        */
+}
+
+void silk_biquad_alt_stride2_neon(
+    const opus_int16            *in,                /* I     input signal                                               */
+    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
+    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
+    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
+    opus_int16                  *out,               /* O     output signal                                              */
+    const opus_int32            len                 /* I     signal length (must be even)                               */
+)
+{
+    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
+    opus_int        k            = 0;
+    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
+    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
+    int16x4_t       in_s16x4  = vdup_n_s16( 0 );
+    int16x4_t       out_s16x4;
+    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
+    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
+    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;
+
+#ifdef OPUS_CHECK_ASM
+    opus_int32 S_c[ 4 ];
+    VARDECL( opus_int16, out_c );
+    SAVE_STACK;
+    ALLOC( out_c, 2 * len, opus_int16 );
+
+    silk_memcpy( &S_c, S, sizeof( S_c ) );
+    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
+#endif
+
+    /* Negate A_Q28 values and split in two parts */
+    A_Q28_s32x2 = vld1_s32( A_Q28 );
+    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
+    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                                        /* ( -A_Q28[] & 0x00003FFF ) << 18                                                     */
+    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );           /* ( -A_Q28[] & 0x00003FFF ) << 15                                                     */
+    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                                        /* silk_RSHIFT( -A_Q28[], 14 )                                                         */
+    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                          /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
+    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                           /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                   */
+
+    B_Q28_s32x2  = vld1_s32( B_Q28 );
+    t_s32x2      = vld1_s32( B_Q28 + 1 );
+    t0_s32x2x2   = vzip_s32( A_L_s32x2, A_L_s32x2 );
+    t1_s32x2x2   = vzip_s32( A_U_s32x2, A_U_s32x2 );
+    t2_s32x2x2   = vzip_s32( t_s32x2, t_s32x2 );
+    A_L_s32x4    = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_L_Q28          */
+    A_U_s32x4    = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_U_Q28          */
+    B_Q28_s32x4  = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );                            /* B_Q28[ {1,1,2,2} ]        */
+    S_s32x4      = vld1q_s32( S );                                                                      /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
+    S_s32x2x2    = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );                       /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
+    S_s32x4      = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );
+
+    for( ; k < len - 1; k += 2 ) {
+        int32x4_t in_s32x4[ 2 ], t_s32x4;
+        int32x2_t out32_Q14_s32x2[ 2 ];
+
+        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
+        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                                       /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
+        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                                                    /* in{0,1,2,3} << 15                      */
+        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );                             /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
+        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15                      */
+        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15                      */
+        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
+        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );
+
+        /* Scale back to Q0 and saturate */
+        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );                   /* out32_Q14_{0,1,2,3}                                                                                        */
+        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                                   /* out32_Q14_{0,1,2,3} + (1<<14) - 1                                                                          */
+        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) )                             */
+        vst1_s16( &out[ 2 * k ], out_s16x4 );                                                           /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
+    }
+
+    /* Process leftover. */
+    if( k < len ) {
+        int32x4_t in_s32x4;
+        int32x2_t out32_Q14_s32x2;
+
+        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
+        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
+        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
+        in_s32x4     = vshll_n_s16( in_s16x4, 15 );                                                     /* in{0,1} << 15                      */
+        t_s32x2      = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );                    /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
+        in_s32x4     = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );              /* in{0,1,0,1} << 15                  */
+        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );
+
+        /* Scale back to Q0 and saturate */
+        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );                                    /* out32_Q14_{0,1} + (1<<14) - 1                                                              */
+        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );                             /* out32_Q14_{0,1,0,1} + (1<<14) - 1                                                          */
+        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) )             */
+        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                                               /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
+        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                                               /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
+    }
+
+    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                                              /* S[ 0 ] = S0; */
+    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                                              /* S[ 1 ] = S2; */
+    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                                              /* S[ 2 ] = S1; */
+    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                                              /* S[ 3 ] = S3; */
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
+    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
+    RESTORE_STACK;
+#endif
+}
diff --git a/silk/biquad_alt.c b/silk/biquad_alt.c
index 73381294..54566a43 100644
--- a/silk/biquad_alt.c
+++ b/silk/biquad_alt.c
@@ -76,7 +76,7 @@ void silk_biquad_alt_stride1(
     }
 }
 
-void silk_biquad_alt_stride2(
+void silk_biquad_alt_stride2_c(
     const opus_int16            *in,                /* I     input signal                                               */
     const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
     const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
diff --git a/silk_headers.mk b/silk_headers.mk
index 278500b5..2588067c 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -22,6 +22,7 @@ silk/resampler_rom.h \
 silk/resampler_structs.h \
 silk/SigProc_FIX.h \
 silk/x86/SigProc_FIX_sse.h \
+silk/arm/biquad_alt_arm.h \
 silk/arm/LPC_inv_pred_gain_arm.h \
 silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index c0312145..6d143920 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -85,6 +85,7 @@ silk/x86/VQ_WMat_EC_sse.c
 
 SILK_SOURCES_ARM_NEON_INTR = \
 silk/arm/arm_silk_map.c \
+silk/arm/biquad_alt_neon_intr.c \
 silk/arm/LPC_inv_pred_gain_neon_intr.c \
 silk/arm/NSQ_del_dec_neon_intr.c \
 silk/arm/NSQ_neon.c
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 62dadcfb..56fadb57 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -341,10 +341,11 @@ static void silk_biquad_float(
 }
 #endif
 
-static void hp_cutoff(const opus_val16 *in, opus_int32 cutoff_Hz, opus_val16 *out, opus_val32 *hp_mem, int len, int channels, opus_int32 Fs)
+static void hp_cutoff(const opus_val16 *in, opus_int32 cutoff_Hz, opus_val16 *out, opus_val32 *hp_mem, int len, int channels, opus_int32 Fs, int arch)
 {
    opus_int32 B_Q28[ 3 ], A_Q28[ 2 ];
    opus_int32 Fc_Q19, r_Q28, r_Q22;
+   (void)arch;
 
    silk_assert( cutoff_Hz <= silk_int32_MAX / SILK_FIX_CONST( 1.5 * 3.14159 / 1000, 19 ) );
    Fc_Q19 = silk_DIV32_16( silk_SMULBB( SILK_FIX_CONST( 1.5 * 3.14159 / 1000, 19 ), cutoff_Hz ), Fs/1000 );
@@ -367,7 +368,7 @@ static void hp_cutoff(const opus_val16 *in, opus_int32 cutoff_Hz, opus_val16 *ou
    if( channels == 1 ) {
       silk_biquad_alt_stride1( in, B_Q28, A_Q28, hp_mem, out, len );
    } else {
-      silk_biquad_alt_stride2( in, B_Q28, A_Q28, hp_mem, out, len );
+      silk_biquad_alt_stride2( in, B_Q28, A_Q28, hp_mem, out, len, arch );
    }
 #else
    silk_biquad_float( in, B_Q28, A_Q28, hp_mem, out, len, channels );
@@ -1609,7 +1610,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 
     if (st->application == OPUS_APPLICATION_VOIP)
     {
-       hp_cutoff(pcm, cutoff_Hz, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
+       hp_cutoff(pcm, cutoff_Hz, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs, st->arch);
     } else {
        dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
     }