I need synchronization (e.g., all work-groups are done before the next iteration) due to the innermost loop (ie. z) using alpha_t from the previous loop to compute the sum values in subsequent loops.
I used a global worksize of 6016 (= 752 * 8) and local worksize of 8 for my kernel. The data size of _beta0 - _beta3 is 6016 and they were computed from another kernel. However I am unable to get the correct results using the kernel below:
Code:
__kernel void forward(__global int *_dResult,
__global int *_beta0,
__global int *_beta1,
__global int *_beta2,
__global int *_beta3)
{
const int sm_lut[32] = {0, 1, 6, 7, 2, 3, 4, 5,
5, 4, 3, 2, 7, 6, 1, 0,
1, 0, 7, 6, 3, 2, 5, 4,
4, 5, 2, 3, 6, 7, 0, 1
};
volatile int alpha_t[8];
volatile int new_alpha_t[8];
int sum[4] = {0};
int gid = get_global_id(0);
int lid = get_local_id(0);
int idx = lid << 2;
int size = get_global_size(0) / get_local_size(0);
for (int i = 0; i < 8; i++)
new_alpha_t[i] = alpha_t[i] = 0;
for (int t = 0; t < size; t++)
{
sum[0] = alpha_t[s_to_sm1[idx]] + _beta0[gid];
sum[1] = alpha_t[s_to_sm1[idx + 1]] + _beta1[gid];
sum[2] = alpha_t[s_to_sm1[idx + 2]] + _beta2[gid];
sum[3] = alpha_t[s_to_sm1[idx + 3]] + _beta3[gid];
new_alpha_t[lid] = max4(sum[0], sum[1], sum[2], sum[3]);
alpha_t[lid] = new_alpha_t[lid];
}
_dResult[gid] = alpha_t[lid];
}
Could u pls let me know how i could compute the (6016) results through multiple kernel executions?
Thanks for your help.