mirror of
https://github.com/xiph/opus.git
synced 2025-05-25 12:49:12 +00:00
Doing a bit of unrolling to speed things up
This commit is contained in:
parent
3e7ab9ff87
commit
60d6eab63d
1 changed file with 34 additions and 1 deletion
|
@@ -502,7 +502,40 @@ static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows
|
||||||
colblocks = *idx++;
|
colblocks = *idx++;
|
||||||
y = &out[i];
|
y = &out[i];
|
||||||
vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
|
vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
|
||||||
for (j=0;j<colblocks;j++)
|
j=0;
|
||||||
|
#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
|
||||||
|
for (;j<colblocks-3;j+=4)
|
||||||
|
{
|
||||||
|
__m256i tmp;
|
||||||
|
__m256i vxj;
|
||||||
|
__m256i vw;
|
||||||
|
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
|
||||||
|
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
|
||||||
|
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
|
||||||
|
tmp = _mm256_madd_epi16(tmp, ones);
|
||||||
|
vy0 = _mm256_add_epi32(vy0, tmp);
|
||||||
|
w += 32;
|
||||||
|
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
|
||||||
|
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
|
||||||
|
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
|
||||||
|
tmp = _mm256_madd_epi16(tmp, ones);
|
||||||
|
vy0 = _mm256_add_epi32(vy0, tmp);
|
||||||
|
w += 32;
|
||||||
|
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
|
||||||
|
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
|
||||||
|
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
|
||||||
|
tmp = _mm256_madd_epi16(tmp, ones);
|
||||||
|
vy0 = _mm256_add_epi32(vy0, tmp);
|
||||||
|
w += 32;
|
||||||
|
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
|
||||||
|
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
|
||||||
|
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
|
||||||
|
tmp = _mm256_madd_epi16(tmp, ones);
|
||||||
|
vy0 = _mm256_add_epi32(vy0, tmp);
|
||||||
|
w += 32;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
for (;j<colblocks;j++)
|
||||||
{
|
{
|
||||||
__m256i tmp;
|
__m256i tmp;
|
||||||
__m256i vxj;
|
__m256i vxj;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue