diff --git a/dnn/vec_neon.h b/dnn/vec_neon.h index 3e1632ba..39432232 100644 --- a/dnn/vec_neon.h +++ b/dnn/vec_neon.h @@ -217,70 +217,70 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co #define SCALE_1 (1.f/128.f/127.f) #define MAX_INPUTS 2048 +#define MAX_OUTPUTS 8192 -static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x) +static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) +{ + return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b))); +} + +static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x) { int i, j; signed char x[MAX_INPUTS]; + int out[MAX_OUTPUTS]; (void)col_stride; - for (i=0;i