Try to use FMA instructions when possible

Compilers sometimes lower vmlaq*() to separate fmul+fadd instructions instead of a single fmla. Use vfmaq*() instead when possible.
2025-06-06 07:21:03 +00:00 · 2023-11-28 14:16:57 -05:00 · 2023-11-28 14:16:57 -05:00 · db26e381a4
commit db26e381a4
parent 72cc88dfdd
3 changed files with 21 additions and 0 deletions
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@ -97,6 +97,14 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
 }

 #else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* On A64 with FMA, remap vmlaq_lane_f32() to vfmaq_lane_f32() to force an FMA
+ *    instruction rather than letting the compiler break it into fmul/fadd. */
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
 /*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus

 /* ========================================================================== */

+#ifdef __ARM_FEATURE_FMA
+/* When FMA is available, remap vmlaq_f32() to vfmaq_f32() to force an FMA
+   instruction rather than letting the compiler break it into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
 #ifdef OPUS_CHECK_ASM

 /* This part of code simulates floating-point NEON operations. */
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@ -49,6 +49,12 @@ static OPUS_INLINE int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) {
 }
 #endif

+#ifdef __ARM_FEATURE_FMA
+/* When FMA is available, remap vmlaq_f32() to vfmaq_f32() to force an FMA
+   instruction rather than letting the compiler break it into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
 #ifndef LPCNET_TEST
 static inline float32x4_t exp4_approx(float32x4_t x) {
  int32x4_t i;