From 5c02c5ffb58a08c1b705e6fa2ab73f631f658823 Mon Sep 17 00:00:00 2001 From: "Timothy B. Terriberry" Date: Tue, 26 Nov 2013 21:51:39 -0800 Subject: [PATCH] Make celt_pitch_xcorr_edsp() work on ARMv5TE. We were assuming that LDR, LDRD, and STRD could be used on unaligned addresses, but this turns out not to be true on really old hardware. --- celt/arm/celt_pitch_xcorr_arm.s | 164 +++++++++++++++++++++++++------- celt/pitch.c | 5 + celt/pitch.h | 1 + 3 files changed, 138 insertions(+), 32 deletions(-) diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s index 2db681d2..0f1bf5f6 100644 --- a/celt/arm/celt_pitch_xcorr_arm.s +++ b/celt/arm/celt_pitch_xcorr_arm.s @@ -412,8 +412,8 @@ IF OPUS_ARM_MAY_HAVE_EDSP xcorr_kernel_edsp PROC ; input: ; r3 = int len - ; r4 = opus_val16 *_x - ; r5 = opus_val16 *_y + ; r4 = opus_val16 *_x (must be 32-bit aligned) + ; r5 = opus_val16 *_y (must be 32-bit aligned) ; r6...r9 = opus_val32 sum[4] ; output: ; r6...r9 = opus_val32 sum[4] @@ -423,8 +423,9 @@ xcorr_kernel_edsp PROC ; r12,r14 = opus_val16 x[4] ; r10,r11 = opus_val16 y[4] STMFD sp!, {r2,r4,r5,lr} + LDR r10, [r5], #4 ; Load y[0...1] SUBS r2, r3, #4 ; j = len-4 - LDRD r10, r11, [r5], #8 ; Load y[0...3] + LDR r11, [r5], #4 ; Load y[2...3] BLE xcorr_kernel_edsp_process4_done LDR r12, [r4], #4 ; Load x[0...1] ; Stall @@ -493,8 +494,8 @@ xcorr_kernel_edsp_done celt_pitch_xcorr_edsp PROC ; input: - ; r0 = opus_val16 *_x - ; r1 = opus_val16 *_y + ; r0 = opus_val16 *_x (must be 32-bit aligned) + ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) ; r2 = opus_val32 *xcorr ; r3 = int len ; output: @@ -512,11 +513,56 @@ celt_pitch_xcorr_edsp PROC MOV r5, r1 LDR r1, [sp, #36] MOV r4, r0 + TST r5, #3 ; maxcorr = 1 MOV r0, #1 - ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done + BEQ celt_pitch_xcorr_edsp_process1u_done +; Compute one sum at the start to make y 32-bit aligned. + SUBS r12, r3, #4 + ; r14 = sum = 0 + MOV r14, #0 + LDRH r8, [r5], #2 + BLE celt_pitch_xcorr_edsp_process1u_loop4_done + LDR r6, [r4], #4 + LDR r9, [r5], #4 + LDR r7, [r4], #4 +celt_pitch_xcorr_edsp_process1u_loop4 + SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBS r12, r12, #4 ; j-=4 + SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) + LDR r10, [r5], #4 + SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) + LDRGT r6, [r4], #4 + SMLATB r14, r7, r10, r14 ; sum = MAC16_16(sum, x_3, y_3) + LDRGT r9, [r5], #4 + MOV r8, r10, LSR #16 + LDRGT r7, [r4], #4 + BGT celt_pitch_xcorr_edsp_process1u_loop4 +celt_pitch_xcorr_edsp_process1u_loop4_done + ADDS r12, r12, #4 +celt_pitch_xcorr_edsp_process1u_loop1 + LDRGEH r6, [r4], #2 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) + SUBGES r12, r12, #1 + LDRGTH r8, [r5], #2 + BGT celt_pitch_xcorr_edsp_process1u_loop1 + ; Restore _x + SUB r4, r4, r3, LSL #1 + ; Restore and advance _y + SUB r5, r5, r3, LSL #1 + ; maxcorr = max(maxcorr, sum) + CMP r0, r14 + ADD r5, r5, #2 + MOVLT r0, r14 + SUBS r1, r1, #1 + ; xcorr[i] = sum + STR r14, [r2], #4 + BLE celt_pitch_xcorr_edsp_done +celt_pitch_xcorr_edsp_process1u_done + ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 SUBS r1, r1, #4 - BLT celt_pitch_xcorr_edsp_process4_done + BLT celt_pitch_xcorr_edsp_process2 celt_pitch_xcorr_edsp_process4 ; xcorr_kernel_edsp parameters: ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} @@ -531,30 +577,93 @@ celt_pitch_xcorr_edsp_process4 ADD r5, r5, #8 MOVLT r0, r6 CMP r0, r7 - STRD r6, r7, [r2], #8 MOVLT r0, r7 CMP r0, r8 - STRD r8, r9, [r2], #8 MOVLT r0, r8 CMP r0, r9 MOVLT r0, r9 + STMIA r2!, {r6-r9} SUBS r1, r1, #4 BGE celt_pitch_xcorr_edsp_process4 -celt_pitch_xcorr_edsp_process4_done - ADDS r1, r1, #4 - BLE celt_pitch_xcorr_edsp_done -; Now compute each remaining sum one at a time. -celt_pitch_xcorr_edsp_process_remaining +celt_pitch_xcorr_edsp_process2 + ADDS r1, r1, #2 + BLT celt_pitch_xcorr_edsp_process1a + SUBS r12, r3, #4 + ; {r10, r11} = {sum0, sum1} = {0, 0} + MOV r10, #0 + MOV r11, #0 + LDR r8, [r5], #4 + BLE celt_pitch_xcorr_edsp_process2_loop_done + LDR r6, [r4], #4 + LDR r9, [r5], #4 +celt_pitch_xcorr_edsp_process2_loop4 + SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) + LDR r7, [r4], #4 + SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) + SUBS r12, r12, #4 ; j-=4 + SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) + LDR r8, [r5], #4 + SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) + LDRGT r6, [r4], #4 + SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) + SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) + SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) + LDRGT r9, [r5], #4 + SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) + BGT celt_pitch_xcorr_edsp_process2_loop4 +celt_pitch_xcorr_edsp_process2_loop_done + ADDS r12, r12, #2 + BLE celt_pitch_xcorr_edsp_process2_1 + LDR r6, [r4], #4 + ; Stall + SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) + LDR r9, [r5], #4 + SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) + SUB r12, r12, #2 + SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) + MOV r8, r9 + SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) +celt_pitch_xcorr_edsp_process2_1 + LDRH r6, [r4], #2 + ADDS r12, r12, #1 + ; Stall + SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) + LDRGTH r7, [r4], #2 + SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) + BLE celt_pitch_xcorr_edsp_process2_done + LDRH r9, [r5], #2 + SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1) + SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2) +celt_pitch_xcorr_edsp_process2_done + ; Restore _x + SUB r4, r4, r3, LSL #1 + ; Restore and advance _y + SUB r5, r5, r3, LSL #1 + ; maxcorr = max(maxcorr, sum0) + CMP r0, r10 + ADD r5, r5, #2 + MOVLT r0, r10 + SUB r1, r1, #2 + ; maxcorr = max(maxcorr, sum1) + CMP r0, r11 + ; xcorr[i] = sum + STR r10, [r2], #4 + MOVLT r0, r11 + STR r11, [r2], #4 +celt_pitch_xcorr_edsp_process1a + ADDS r1, r1, #1 + BLT celt_pitch_xcorr_edsp_done SUBS r12, r3, #4 ; r14 = sum = 0 MOV r14, #0 - BLT celt_pitch_xcorr_edsp_process_remaining_loop_done - LDRD r6, r7, [r4], #8 - LDRD r8, r9, [r5], #8 - ; Stall -celt_pitch_xcorr_edsp_process_remaining_loop4 + BLT celt_pitch_xcorr_edsp_process1a_loop_done + LDR r6, [r4], #4 + LDR r8, [r5], #4 + LDR r7, [r4], #4 + LDR r9, [r5], #4 +celt_pitch_xcorr_edsp_process1a_loop4 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) - SUBS r12, r12, #4 ; j-- + SUBS r12, r12, #4 ; j-=4 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) LDRGE r6, [r4], #4 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) @@ -562,8 +671,8 @@ celt_pitch_xcorr_edsp_process_remaining_loop4 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) LDRGE r7, [r4], #4 LDRGE r9, [r5], #4 - BGE celt_pitch_xcorr_edsp_process_remaining_loop4 -celt_pitch_xcorr_edsp_process_remaining_loop_done + BGE celt_pitch_xcorr_edsp_process1a_loop4 +celt_pitch_xcorr_edsp_process1a_loop_done ADDS r12, r12, #2 LDRGE r6, [r4], #4 LDRGE r8, [r5], #4 @@ -574,21 +683,12 @@ celt_pitch_xcorr_edsp_process_remaining_loop_done ADDS r12, r12, #1 LDRGEH r6, [r4], #2 LDRGEH r8, [r5], #2 - ; Restore _x - SUB r4, r4, r3, LSL #1 - ; Stall SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) - ; Restore and advance _y - SUB r5, r5, r3, LSL #1 ; maxcorr = max(maxcorr, sum) - ; Stall CMP r0, r14 - ADD r5, r5, #2 - MOVLT r0, r14 - SUBS r1, r1, #1 ; xcorr[i] = sum STR r14, [r2], #4 - BGT celt_pitch_xcorr_edsp_process_remaining + MOVLT r0, r14 celt_pitch_xcorr_edsp_done LDMFD sp!, {r4-r11, pc} ENDP diff --git a/celt/pitch.c b/celt/pitch.c index 2d63a5ac..d2b30544 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -253,6 +253,11 @@ void celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) { int i,j; + /*The EDSP version requires that max_pitch is at least 1, and that _x is + 32-bit aligned. + Since it's hard to put asserts in assembly, put them here.*/ + celt_assert(max_pitch>0); + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); #ifdef FIXED_POINT opus_val32 maxcorr=1; #endif diff --git a/celt/pitch.h b/celt/pitch.h index 4d82ee08..596cda8f 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -61,6 +61,7 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, { int j; opus_val16 y_0, y_1, y_2, y_3; + celt_assert(len>=3); y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ y_0=*y++; y_1=*y++;