diff --git a/dnn/nnet.c b/dnn/nnet.c index a82c04ab..e794e450 100644 --- a/dnn/nnet.c +++ b/dnn/nnet.c @@ -212,91 +212,3 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl OPUS_COPY(&mem[input_size*dilation*(ksize-1)-input_size], input, input_size); } } - - -/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ], - kernel [ out_channels x in_channels x ksize1 x ksize2 ], - storing the output as [ out_channels x len2 ]. - We assume that the output dimension along the ksize1 axis is 1, - i.e. processing one frame at a time. */ -static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride) -{ - int i; - int in_stride; - in_stride = height+kheight-1; - for (i=0;iin_channels*(height+conv->kheight-1); - celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS); - OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride); - OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride); - OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride); - bias = conv->bias; - if (conv->kheight == 3 && conv->ktime == 3) - conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride); - else - conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride); - if (bias != NULL) { - for (i=0;iout_channels;i++) { - int j; - for (j=0;jout_channels;i++) { - compute_activation(&out[i*hstride], &out[i*hstride], height, activation, arch); - } -} diff --git a/dnn/nnet.h b/dnn/nnet.h index f891fa3e..4a42beca 100644 --- a/dnn/nnet.h +++ b/dnn/nnet.h @@ -185,12 +185,11 @@ int gru_init(GRULayer *layer, const WeightArray *arrays, int activation, int reset_after); -void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch); - - void compute_linear_c(const LinearLayer *linear, float *out, const float *in); void compute_activation_c(float *output, const float *input, int N, int activation); +void compute_conv2d_c(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); + #if defined(OPUS_X86_MAY_HAVE_SSE2) #include "x86/dnn_x86.h" @@ -204,6 +203,9 @@ void compute_activation_c(float *output, const float *input, int N, int activati #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_c(output, input, N, activation)) #endif +#ifndef OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_c(conv, out, mem, in, height, hstride, activation)) +#endif #if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) #if defined(_MSC_VER) diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h index 4d577f8d..3c53e619 100644 --- a/dnn/nnet_arch.h +++ b/dnn/nnet_arch.h @@ -127,5 +127,93 @@ void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const flo } } +/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ], + kernel [ out_channels x in_channels x ksize1 x ksize2 ], + storing the output as [ out_channels x len2 ]. + We assume that the output dimension along the ksize1 axis is 1, + i.e. processing one frame at a time. */ +static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride) +{ + int i; + int in_stride; + in_stride = height+kheight-1; + for (i=0;iin_channels*(height+conv->kheight-1); + celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS); + OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride); + OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride); + OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride); + bias = conv->bias; + if (conv->kheight == 3 && conv->ktime == 3) + conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride); + else + conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride); + if (bias != NULL) { + for (i=0;iout_channels;i++) { + int j; + for (j=0;jout_channels;i++) { + RTCD_SUF(compute_activation_)(&out[i*hstride], &out[i*hstride], height, activation); + } +} #endif diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h index 94f95ce8..f2183327 100644 --- a/dnn/x86/dnn_x86.h +++ b/dnn/x86/dnn_x86.h @@ -34,16 +34,19 @@ #if defined(OPUS_X86_MAY_HAVE_SSE2) void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in); void compute_activation_sse2(float *output, const float *input, int N, int activation); +void compute_conv2d_sse2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); #endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in); void compute_activation_sse4_1(float *output, const float *input, int N, int activation); +void compute_conv2d_sse4_1(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); #endif #if defined(OPUS_X86_MAY_HAVE_AVX2) void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in); void compute_activation_avx2(float *output, const float *input, int N, int activation); +void compute_conv2d_avx2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); #endif @@ -53,6 +56,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in)) #define OVERRIDE_COMPUTE_ACTIVATION #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_avx2(output, input, N, activation)) +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_avx2(conv, out, mem, in, height, hstride, activation)) #elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) @@ -60,6 +65,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in)) #define OVERRIDE_COMPUTE_ACTIVATION #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse4_1(output, input, N, activation)) +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse4_1(conv, out, mem, in, height, hstride, activation)) #elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) @@ -67,6 +74,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in)) #define OVERRIDE_COMPUTE_ACTIVATION #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse2(output, input, N, activation)) +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse2(conv, out, mem, in, height, hstride, activation)) #elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) @@ -91,6 +100,20 @@ extern void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])( ((*DNN_COMPUTE_ACTIVATION_IMPL[(arch) & OPUS_ARCHMASK])(output, input, N, activation)) +extern void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])( + const Conv2dLayer *conv, + float *out, + float *mem, + const float *in, + int height, + int hstride, + int activation + ); +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) \ + ((*DNN_COMPUTE_CONV2D_IMPL[(arch) & OPUS_ARCHMASK])(conv, out, mem, in, height, hstride, activation)) + + #endif diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c index f39ae372..d673e134 100644 --- a/dnn/x86/x86_dnn_map.c +++ b/dnn/x86/x86_dnn_map.c @@ -61,6 +61,22 @@ void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])( MAY_HAVE_AVX2(compute_activation) /* avx */ }; +void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])( + const Conv2dLayer *conv, + float *out, + float *mem, + const float *in, + int height, + int hstride, + int activation +) = { + compute_conv2d_c, /* non-sse */ + compute_conv2d_c, + MAY_HAVE_SSE2(compute_conv2d), + MAY_HAVE_SSE4_1(compute_conv2d), /* sse4.1 */ + MAY_HAVE_AVX2(compute_conv2d) /* avx */ +}; + #endif