/*
 * forward - iterated max-of-four accumulation, one result per work-item.
 *
 * Each work-item keeps a private array of 8 accumulators, all starting at 0.
 * On every iteration it looks up four accumulator slots through the
 * s_to_sm1 table (entries 4*lid .. 4*lid+3), adds _beta0.._beta3[gid] to
 * them respectively, and stores max4() of the four sums in slot lid.
 * The iteration count is get_global_size(0) / get_local_size(0).
 *
 * _dResult : output; element gid receives the final value of slot lid.
 * _beta0.._beta3 : inputs; only element gid of each is read.
 *
 * Work-items whose local id is 8 or more return without writing a result:
 * the accumulator array has only 8 slots, so indexing it with a larger
 * local id would be out of bounds.
 *
 * s_to_sm1 and max4 are not defined in this kernel and must be provided
 * elsewhere in the program source.
 *
 * NOTE(review): the accumulators are private, so a work-item only ever
 * updates its own slot and sees 0 in every other slot. If the slots were
 * meant to be shared across the work-group this needs __local storage and
 * barriers - confirm the intended algorithm.
 * NOTE(review): the beta index (gid) does not depend on the iteration
 * counter, so every iteration adds the same four values - confirm.
 */
__kernel void forward(__global int *_dResult,
                      __global int *_beta0,
                      __global int *_beta1,
                      __global int *_beta2,
                      __global int *_beta3)
{
    enum { NUM_STATES = 8 };

    /*
     * NOTE(review): this table is never read; the code below indexes
     * s_to_sm1 instead. Its size (32) and value range (0..7) match how
     * s_to_sm1 is used - confirm whether the two are meant to be the same
     * table, then drop whichever is redundant.
     */
    const int sm_lut[32] = {0, 1, 6, 7, 2, 3, 4, 5,
                            5, 4, 3, 2, 7, 6, 1, 0,
                            1, 0, 7, 6, 3, 2, 5, 4,
                            4, 5, 2, 3, 6, 7, 0, 1
                           };

    const int gid = get_global_id(0);
    const int lid = get_local_id(0);

    /* Guard the fixed-size accumulator array and the 4*lid+3 table index. */
    if (lid >= NUM_STATES)
        return;

    const int steps = get_global_size(0) / get_local_size(0);
    const int base = lid << 2;

    int alpha_t[NUM_STATES];
    for (int s = 0; s < NUM_STATES; s++)
        alpha_t[s] = 0;

    /* Nothing below changes these between iterations, so read them once. */
    const int src0 = s_to_sm1[base];
    const int src1 = s_to_sm1[base + 1];
    const int src2 = s_to_sm1[base + 2];
    const int src3 = s_to_sm1[base + 3];

    const int b0 = _beta0[gid];
    const int b1 = _beta1[gid];
    const int b2 = _beta2[gid];
    const int b3 = _beta3[gid];

    for (int t = 0; t < steps; t++)
    {
        const int c0 = alpha_t[src0] + b0;
        const int c1 = alpha_t[src1] + b1;
        const int c2 = alpha_t[src2] + b2;
        const int c3 = alpha_t[src3] + b3;

        /* All four reads are done before slot lid is overwritten. */
        alpha_t[lid] = max4(c0, c1, c2, c3);
    }

    _dResult[gid] = alpha_t[lid];
}