mirror of
https://github.com/xiph/opus.git
synced 2025-05-31 23:57:42 +00:00
Unroll the 3x3 convolution case
Gets us about 2x speedup on x86
This commit is contained in:
parent
d720955d61
commit
f512c9206b
1 changed file with 32 additions and 1 deletion
33
dnn/nnet.c
33
dnn/nnet.c
|
@ -394,6 +394,34 @@ void conv2d_float(float *out, const float *weights, int in_channels, int out_cha
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* 2-D convolution with the kernel size fixed at 3x3 (ktime == kheight == 3),
   written with the nine taps spelled out so the compiler can simplify the
   indexing.

   out          - output; channel c occupies out[c*hstride .. c*hstride + height - 1].
                  Each channel is cleared and then accumulated into.
   weights      - kernel taps, indexed as [out_channel][in_channel][t][h] with
                  t and h in 0..2.
   in_channels  - number of input channels.
   out_channels - number of output channels.
   in           - input, indexed as [t][in_channel][height + 2] with t in 0..2.
   height       - number of output samples produced per channel.
   hstride      - distance, in floats, between consecutive output channels. */
void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)
{
   enum { KSIZE = 3 };
   /* Each input row carries KSIZE-1 extra samples so that every output
      position has a full window available. */
   const int row_len = height + KSIZE - 1;
   /* Distance between the same channel in two consecutive t slices. */
   const int frame_len = in_channels*row_len;
   int oc;
   for (oc=0;oc<out_channels;oc++) {
      float *dst = &out[oc*hstride];
      int ic;
      OPUS_CLEAR(dst, height);
      for (ic=0;ic<in_channels;ic++) {
         /* The nine taps for this (output channel, input channel) pair. */
         const float *w = &weights[(oc*in_channels + ic)*KSIZE*KSIZE];
         /* Offsets of this input channel's row in each of the three t slices. */
         const int r0 = ic*row_len;
         const int r1 = frame_len + r0;
         const int r2 = 2*frame_len + r0;
         int x;
         for (x=0;x<height;x++) {
            /* Keep the nine products in a single left-to-right sum: the
               order of the additions determines the floating-point result. */
            dst[x] += w[0]*in[r0 + x]
                    + w[1]*in[r0 + x + 1]
                    + w[2]*in[r0 + x + 2]
                    + w[3]*in[r1 + x]
                    + w[4]*in[r1 + x + 1]
                    + w[5]*in[r1 + x + 2]
                    + w[6]*in[r2 + x]
                    + w[7]*in[r2 + x + 1]
                    + w[8]*in[r2 + x + 2];
         }
      }
   }
}
|
||||||
|
|
||||||
#define MAX_CONV2D_INPUTS 8192
|
#define MAX_CONV2D_INPUTS 8192
|
||||||
|
|
||||||
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
|
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
|
||||||
|
@ -409,7 +437,10 @@ void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float
|
||||||
OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
|
OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
|
||||||
OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
|
OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
|
||||||
bias = conv->bias;
|
bias = conv->bias;
|
||||||
conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
|
if (conv->kheight == 3 && conv->ktime == 3)
|
||||||
|
conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);
|
||||||
|
else
|
||||||
|
conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
|
||||||
if (bias != NULL) {
|
if (bias != NULL) {
|
||||||
for (i=0;i<conv->out_channels;i++) {
|
for (i=0;i<conv->out_channels;i++) {
|
||||||
int j;
|
int j;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue