Doing a bit of unrolling to speed things up

This commit is contained in:
Jean-Marc Valin 2021-07-06 17:33:33 -04:00
parent 3e7ab9ff87
commit 60d6eab63d

View file

@ -502,7 +502,40 @@ static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows
colblocks = *idx++; colblocks = *idx++;
y = &out[i]; y = &out[i];
vy0 = _mm256_loadu_si256((const __m256i *)&y[0]); vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
for (j=0;j<colblocks;j++) j=0;
#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
for (;j<colblocks-3;j+=4)
{
__m256i tmp;
__m256i vxj;
__m256i vw;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
}
#endif
for (;j<colblocks;j++)
{ {
__m256i tmp; __m256i tmp;
__m256i vxj; __m256i vxj;