mirror of https://github.com/xiph/opus.git
WIP: 8-bit SIMD for GRU B
parent e695355ba5
commit 40b309d92b
5 changed files with 58 additions and 7 deletions
@@ -31,6 +31,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include <unistd.h>
 #include "kiss_fft.h"
 #include "common.h"
 #include <math.h>

@@ -141,6 +142,7 @@ int main(int argc, char **argv) {
 int encode = 0;
 int decode = 0;
 int quantize = 0;
+srand(getpid());
 st = lpcnet_encoder_create();
 if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
 if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
@@ -140,6 +140,7 @@ void compute_mdense(const MDenseLayer *layer, float *output, const float *input)
 compute_activation(output, output, N, layer->activation);
 }
 
+#if 0
 void compute_gru(const GRULayer *gru, float *state, const float *input)
 {
 int i;

@@ -201,6 +202,7 @@ void compute_gru(const GRULayer *gru, float *state, const float *input)
 for (i=0;i<N;i++)
 state[i] = h[i];
 }
+#endif
 
 void compute_gru2(const GRULayer *gru, float *state, const float *input)
 {

@@ -224,7 +226,11 @@ void compute_gru2(const GRULayer *gru, float *state, const float *input)
 /* Compute update gate. */
 for (i=0;i<3*N;i++)
 zrh[i] = gru->bias[i];
+#if 1
+sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
+#else
 sgemv_accum(zrh, gru->input_weights, 3*N, M, stride, input);
+#endif
 for (i=0;i<3*N;i++)
 recur[i] = gru->bias[3*N + i];
 sgemv_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
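Note on compute_gru2 above: this WIP only routes the input-weight product through the new 8-bit kernel; the recurrent-weight product still goes through the float sgemv_accum. Both calls take the same argument list (only the weight pointer type differs), so the #if 1 can flip between the two paths. A rough sketch of the two signatures as seen from the call sites (parameter names are illustrative; the float prototype is assumed from the existing vec.h and is not part of this diff):

/* existing float kernel (assumed signature) */
void sgemv_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x);
/* new 8-bit kernel, added in dnn/vec.h below */
void sgemv_accum8x4(float *out, const qweight *weights, int rows, int cols, int col_stride, const float *x);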
@@ -56,7 +56,7 @@ typedef struct {
 
 typedef struct {
 const float *bias;
-const float *input_weights;
+const qweight *input_weights;
 const float *recurrent_weights;
 int nb_inputs;
 int nb_neurons;
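The struct change above only swaps the element type of the GRU input weights. Under DOT_PROD, qweight is expected to resolve to a signed 8-bit type; a minimal sketch of the typedef this relies on (assumed to already exist in dnn/vec.h, not part of this diff):

#ifdef DOT_PROD
typedef signed char qweight;   /* 8-bit weights for the dot-product kernels */
#else
typedef float qweight;         /* plain float weights otherwise */
#endif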
@@ -39,7 +39,10 @@ max_rnn_neurons = 1
 max_conv_inputs = 1
 max_mdense_tmp = 1
 
-def printVector(f, vector, name, dtype='float'):
+def printVector(f, vector, name, dtype='float', dotp=False):
+    if dotp:
+        vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+        vector = vector.transpose((2, 0, 3, 1))
     v = np.reshape(vector, (-1));
     #print('static const float ', name, '[', len(v), '] = \n', file=f)
     f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))

@@ -127,7 +130,12 @@ def dump_gru_layer(self, f, hf):
     name = self.name
     print("printing layer " + name + " of type " + self.__class__.__name__)
     weights = self.get_weights()
+    f.write('#ifdef DOT_PROD\n')
+    qweight = np.clip((128*weights[0]).astype('int'), -128, 127)
+    printVector(f, qweight, name + '_weights', dotp=True, dtype='qweight')
+    f.write('#else /*DOT_PROD*/\n')
     printVector(f, weights[0], name + '_weights')
+    f.write('#endif /*DOT_PROD*/\n')
     printVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
     if hasattr(self, 'activation'):
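The dump script quantizes the GRU input weights with a scale of 128 (np.clip((128*w).astype('int'), -128, 127)), while the C kernel below quantizes activations with a scale of 127 and removes the combined factor with SCALE_1 = 1/(128*127). A small self-contained C example of that round trip (values and variable names are illustrative only):

#include <math.h>
#include <stdio.h>

/* Illustrative only: mirrors the scaling convention of the DOT_PROD path.
 * Weights get a factor of 128 (dump_lpcnet.py), activations a factor of 127
 * (sgemv_accum8x4), and the accumulator is rescaled by SCALE_1 = 1/(128*127). */
int main(void)
{
   float w = 0.31f, x = -0.62f;      /* example weight and activation */
   int qw, qx;
   float approx;
   qw = (int)(128*w);                /* truncation, as .astype('int') does */
   if (qw < -128) qw = -128;
   if (qw >  127) qw =  127;         /* np.clip(..., -128, 127) */
   qx = (int)floor(.5 + 127*x);      /* rounding, as in sgemv_accum8x4 */
   approx = qw*qx*(1.f/128.f/127.f); /* SCALE_1 */
   printf("float product: %g, 8-bit approximation: %g\n", w*x, approx);
   return 0;
}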
dnn/vec.h (45 changes)
@@ -41,10 +41,11 @@
 #include "vec_neon.h"
 #else
 
+#define MAX_INPUTS (2048)
 
 #define NO_OPTIMIZATIONS
 
-//#define DOT_PROD
+#define DOT_PROD
 //#define USE_SU_BIAS
 
 #ifdef DOT_PROD

@@ -193,13 +194,47 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
 }
 
 #ifdef DOT_PROD
 
-#define MAX_INPUTS (2048)
 
 
 #define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
 
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+   int i, j;
+   signed char x[MAX_INPUTS];
+   (void)col_stride;
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+   for (i=0;i<rows;i+=8)
+   {
+      for (j=0;j<cols;j+=4)
+      {
+         float * restrict y;
+         float xj0, xj1, xj2, xj3;
+         xj0 = x[j+0];
+         xj1 = x[j+1];
+         xj2 = x[j+2];
+         xj3 = x[j+3];
+         y = &out[i];
+         y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+         y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+         y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+         y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+         y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+         y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+         y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+         y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+         w += 32;
+      }
+   }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#else
+#define sgemv_accum sgemv_accum8x4
+#endif
+
+#ifdef DOT_PROD
 
 
 #ifdef USE_SU_BIAS
 static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
 {
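For reference, the dotp=True reshape/transpose in dump_lpcnet.py and the w += 32 walk in sgemv_accum8x4 describe the same 8x4 blocked weight layout: the matrix is traversed in blocks of 8 output rows by 4 input columns, and each block is stored as 8 consecutive groups of 4 weights. A hedged sketch of that packing, starting from a plain row-major rows x cols matrix (pack_8x4 is a hypothetical helper, not part of the patch; rows and cols are assumed to be multiples of 8 and 4):

/* Hypothetical packing helper, shown only to illustrate the layout.
 * dense[r*cols + c] holds the weight for output r and input c; blocked
 * receives the order that sgemv_accum8x4 consumes linearly. */
static void pack_8x4(signed char *blocked, const signed char *dense, int rows, int cols)
{
   int i, j, k, l, n = 0;
   for (i=0;i<rows;i+=8)        /* block of 8 output rows */
   {
      for (j=0;j<cols;j+=4)     /* block of 4 input columns */
      {
         for (k=0;k<8;k++)      /* the 8 rows inside the block... */
         {
            for (l=0;l<4;l++)   /* ...each contributing 4 weights */
               blocked[n++] = dense[(i+k)*cols + (j+l)];
         }
      }
   }
}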