Make celt_pitch_xcorr_edsp() work on ARMv5TE.

We were assuming that LDR, LDRD, and STRD could be used on unaligned addresses, but this turns out not to be true on really old hardware.
2025-06-01 08:07:41 +00:00 · 2013-11-26 21:51:39 -08:00 · 2013-11-26 21:51:39 -08:00 · 5c02c5ffb5
commit 5c02c5ffb5
parent aa8ff201b5
3 changed files with 138 additions and 32 deletions
--- a/celt/arm/celt_pitch_xcorr_arm.s
+++ b/celt/arm/celt_pitch_xcorr_arm.s
@ -412,8 +412,8 @@ IF OPUS_ARM_MAY_HAVE_EDSP
 xcorr_kernel_edsp PROC
  ; input:
  ;   r3      = int         len
-  ;   r4      = opus_val16 *_x
-  ;   r5      = opus_val16 *_y
+  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
+  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
@ -423,8 +423,9 @@ xcorr_kernel_edsp PROC
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
  STMFD        sp!, {r2,r4,r5,lr}
+  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
-  LDRD         r10, r11, [r5], #8 ; Load y[0...3]
+  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
@ -493,8 +494,8 @@ xcorr_kernel_edsp_done

 celt_pitch_xcorr_edsp PROC
  ; input:
-  ;   r0  = opus_val16 *_x
-  ;   r1  = opus_val16 *_y
+  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
+  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ; output:
@ -512,11 +513,56 @@ celt_pitch_xcorr_edsp PROC
  MOV          r5, r1
  LDR          r1, [sp, #36]
  MOV          r4, r0
+  TST          r5, #3
  ; maxcorr = 1
  MOV          r0, #1
-  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+; Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+  LDR          r7, [r4], #4
+celt_pitch_xcorr_edsp_process1u_loop4
+  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDR          r10, [r5], #4
+  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  LDRGT        r6, [r4], #4
+  SMLATB       r14, r7, r10, r14    ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  MOV          r8, r10, LSR #16
+  LDRGT        r7, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+celt_pitch_xcorr_edsp_process1u_loop4_done
+  ADDS         r12, r12, #4
+celt_pitch_xcorr_edsp_process1u_loop1
+  LDRGEH       r6, [r4], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
+  SUBGES       r12, r12, #1
+  LDRGTH       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
-  BLT celt_pitch_xcorr_edsp_process4_done
+  BLT celt_pitch_xcorr_edsp_process2
 celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
@ -531,30 +577,93 @@ celt_pitch_xcorr_edsp_process4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
-  STRD         r6, r7, [r2], #8
  MOVLT        r0, r7
  CMP          r0, r8
-  STRD         r8, r9, [r2], #8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
-celt_pitch_xcorr_edsp_process4_done
-  ADDS         r1, r1, #4
-  BLE celt_pitch_xcorr_edsp_done
-; Now compute each remaining sum one at a time.
-celt_pitch_xcorr_edsp_process_remaining
+celt_pitch_xcorr_edsp_process2
+  ADDS         r1, r1, #2
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  ; {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRGTH       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
+  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
+celt_pitch_xcorr_edsp_process2_done
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2
+  ; maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  ; xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a
+  ADDS         r1, r1, #1
+  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
-  BLT celt_pitch_xcorr_edsp_process_remaining_loop_done
-  LDRD         r6, r7, [r4], #8
-  LDRD         r8, r9, [r5], #8
-  ; Stall
-celt_pitch_xcorr_edsp_process_remaining_loop4
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
-  SUBS         r12, r12, #4         ; j--
+  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
@ -562,8 +671,8 @@ celt_pitch_xcorr_edsp_process_remaining_loop4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
-  BGE celt_pitch_xcorr_edsp_process_remaining_loop4
-celt_pitch_xcorr_edsp_process_remaining_loop_done
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
@ -574,21 +683,12 @@ celt_pitch_xcorr_edsp_process_remaining_loop_done
  ADDS         r12, r12, #1
  LDRGEH       r6, [r4], #2
  LDRGEH       r8, [r5], #2
-  ; Restore _x
-  SUB          r4, r4, r3, LSL #1
-  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
-  ; Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
-  ; Stall
  CMP          r0, r14
-  ADD          r5, r5, #2
-  MOVLT        r0, r14
-  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
-  BGT celt_pitch_xcorr_edsp_process_remaining
+  MOVLT        r0, r14
 celt_pitch_xcorr_edsp_done
  LDMFD        sp!, {r4-r11, pc}
  ENDP
--- a/celt/pitch.c
+++ b/celt/pitch.c
@ -253,6 +253,11 @@ void
 celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
 {
   int i,j;
+   /*The EDSP version requires that max_pitch is at least 1, and that _x is
+      32-bit aligned.
+     Since it's hard to put asserts in assembly, put them here.*/
+   celt_assert(max_pitch>0);
+   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 #ifdef FIXED_POINT
   opus_val32 maxcorr=1;
 #endif
--- a/celt/pitch.h
+++ b/celt/pitch.h
@ -61,6 +61,7 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
 {
   int j;
   opus_val16 y_0, y_1, y_2, y_3;
+   celt_assert(len>=3);
   y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
   y_0=*y++;
   y_1=*y++;