mirror of
https://github.com/xiph/opus.git
synced 2025-05-14 15:38:32 +00:00
AVX version of celt_pitch_xcorr()
Not used by anything yet
This commit is contained in:
parent
02f352c75e
commit
31a8028e97
3 changed files with 108 additions and 0 deletions
|
@ -52,6 +52,9 @@ endif
|
|||
if HAVE_SSE4_1
|
||||
CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
|
||||
endif
|
||||
if HAVE_AVX
|
||||
CELT_SOURCES += $(CELT_SOURCES_AVX)
|
||||
endif
|
||||
endif
|
||||
|
||||
if CPU_ARM
|
||||
|
@ -392,6 +395,11 @@ SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
|
|||
$(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
|
||||
endif
|
||||
|
||||
if HAVE_AVX
|
||||
AVX_OBJ = $(CELT_SOURCES_AVX:.c=.lo)
|
||||
$(AVX_OBJ): CFLAGS += $(OPUS_X86_AVX_CFLAGS)
|
||||
endif
|
||||
|
||||
if HAVE_ARM_NEON_INTR
|
||||
ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
|
||||
$(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) \
|
||||
|
|
97
celt/x86/pitch_avx.c
Normal file
97
celt/x86/pitch_avx.c
Normal file
|
@ -0,0 +1,97 @@
|
|||
/* Copyright (c) 2023 Amazon */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "x86cpu.h"
|
||||
#include "pitch.h"
|
||||
|
||||
/* Like the "regular" xcorr_kernel(), but computes 8 results at a time.
   On return, sum[k] (k=0..7) holds the inner product of x[0..len-1] with
   y[k..k+len-1]. sum[] is overwritten, not accumulated into.
   For len>0 the kernel reads x[0..len-1] and y[0..len+6]. Only unaligned
   loads/stores are used, so x, y and sum need no particular alignment.
   len must not be negative: a negative len would make the tail code index
   past the end of the mask table below. */
static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
{
    /* Loading 8 ints starting at tail_mask+7-r (1<=r<=7) yields r all-ones
       lanes followed by 8-r zero lanes, i.e. a maskload mask that enables
       exactly the r samples left over after the main loop. */
    static const int tail_mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
    __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
    __m256 x0;
    int i;
    xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
    /* Compute 8 inner products using partial sums: xsumK accumulates, lane by
       lane, the products contributing to sum[K]. */
    for (i=0;i<len-7;i+=8)
    {
        x0 = _mm256_loadu_ps(x+i);
        xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i  ), xsum0);
        xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
        xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
        xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
        xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
        xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
        xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
        xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
    }
    /* Handle the last len-i (1..7) samples with masked loads so that nothing
       beyond x[len-1] / y[len+6] is touched; masked-off lanes load as zero and
       therefore add nothing to the partial sums. */
    if (i != len) {
        __m256i m;
        /* Go through const void* so the cast neither drops the const
           qualifier nor trips cast-alignment warnings; the load itself is
           an unaligned one. */
        m = _mm256_loadu_si256((const __m256i*)(const void*)(tail_mask + 7+i-len));
        x0 = _mm256_maskload_ps(x+i, m);
        xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i  , m), xsum0);
        xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
        xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
        xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
        xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
        xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
        xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
        xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
    }
    /* 8 horizontal adds. */
    /* Compute [0 4] [1 5] [2 6] [3 7]: fold the two 128-bit halves of each
       accumulator, packing sum K in the low half and sum K+4 in the high half. */
    xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
    xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
    xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
    xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
    /* Compute [0 1 4 5] [2 3 6 7] */
    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
    xsum1 = _mm256_hadd_ps(xsum2, xsum3);
    /* Compute [0 1 2 3 4 5 6 7] */
    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
    _mm256_storeu_ps(sum, xsum0);
}
|
||||
|
||||
/* Fills xcorr[0..max_pitch-1], one entry per lag: entry `lag` is produced from
   _x and _y+lag over len samples. Lags are processed in groups of 8 by
   xcorr_kernel_avx(); the final max_pitch%8 lags go through celt_inner_prod()
   one at a time (presumably the matching scalar inner product -- its
   definition is not visible here).
   max_pitch must be positive (asserted). arch is only forwarded to
   celt_inner_prod(). */
void celt_pitch_xcorr_avx(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
{
    int lag = 0;
    celt_assert(max_pitch>0);
    /* Keeps arch formally "used" even if celt_inner_prod() ends up ignoring it. */
    (void)arch;
    /* Full groups of 8 consecutive lags. */
    while (lag < max_pitch-7)
    {
        xcorr_kernel_avx(_x, _y+lag, xcorr+lag, len);
        lag += 8;
    }
    /* Remaining (fewer than 8) lags, one by one. */
    while (lag < max_pitch)
    {
        xcorr[lag] = celt_inner_prod(_x, _y+lag, len, arch);
        lag++;
    }
}
|
|
@ -33,6 +33,9 @@ CELT_SOURCES_SSE4_1 = \
|
|||
celt/x86/celt_lpc_sse4_1.c \
|
||||
celt/x86/pitch_sse4_1.c
|
||||
|
||||
CELT_SOURCES_AVX = \
|
||||
celt/x86/pitch_avx.c
|
||||
|
||||
CELT_SOURCES_ARM_RTCD = \
|
||||
celt/arm/armcpu.c \
|
||||
celt/arm/arm_celt_map.c
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue