Hi dear OpenCL Community,

for my OpenCL understanding i want to port some simple code from Java to OpenCL.
The OpenCL Code works for small sizes but when i increase the vector size then my
Kernel crash.
Here is the Java Part:

Code :
FloatBuffer a = FloatBuffer.wrap(new float[]{1, 1, 1, 1});
FloatBuffer b = FloatBuffer.wrap(new float[]{2.2f, 2, 3, 4, 5});
 
for (int i = 0; i < a.capacity(); i++) {
    float v= 0;
    for (int j = 0; j < b.capacity(); j++) {
        v+= a.get(i) + b.get(j);
    }
    System.out.println(v);
}

And the OpenCL equivalent, i call the kernel with globalsize[a.capacity(), b.capacity()]
and the output has the same size like a:

Code :
 
__kernel void kernel(__global const float* a, __global const float* b, __global float* output, __local float* block)
{
    int gri = get_group_id(0);
    int grj = get_group_id(1);
 
    int ti = get_local_id(0);
    int tj = get_local_id(1);
 
    int gi = get_global_size(0);
    int gj = get_global_size(1);
 
    int ni = get_local_size(0);
    int nj = get_local_size(1);
 
    int gti = gri * ni + ti;
 
    int nb = gj / nj;
 
    float va = a[gti];
    float sum = 0;
 
    for (int jb = 0; jb < nb; jb++){
 
        block[tj] = b[jb * nj + tj];
        barrier(CLK_LOCAL_MEM_FENCE);
 
        for(int k = 0;k < nj; k++){
            sum += va + block[k];
        }
 
        barrier(CLK_LOCAL_MEM_FENCE);
    }
 
    output[gti] = sum;
};

Thanks in advance