when i execute the following kernel without barriers everything works fine.
if i use barriers, the program hangs at clEnqueueWriteBuffer after kernel execution and then after 30 seconds the display driver restarts.

is it possible that the workers wait for each other while accessing the output_mat vector and its a memory conflict or something when i read/write a vector on host from/to the device? i have no more ideas where to problem could be!

please help! thanks!

Code :
__kernel void
            Wave2DEuler(   __global       float * v1,
                           __global       float * v2,
                           __global       float * v3,                    
                           __global float * Floats,
                           __global int * Ints,
                           __global int * Act
                             ) 
		{
            int global_id_x = get_global_id(0);
            int global_id_y = get_global_id(1);
 
            __global float * input_matTm0;                
            __global float * input_matTm1;
            __global float * output_mat;
 
            float tau = Floats[0];
            float c = Floats[1];
            float h = Floats[2];
            float damp = Floats[3];
            int width = Ints[0];
            int height = Ints[1];
            int border = Ints[2];
            int iterations = Ints[3];
            int log_type = Ints[4];
			int first_input = 1;
            int left_pos;
            int right_pos;
            int cur_pos;
            int top_pos;
            int bot_pos;
            int padding = 1;
            int i = 0;
            long field_index = 0;
            int actIndex = 0;
			int iteration_step = 5;
			float val = 0;
			int pos = 0;
 
			while(i < iterations) {
					if (first_input == 1)
					{
						input_matTm0 = v3;
						input_matTm1 = v2;
						output_mat = v1;
					}
					else if (first_input == 2)
					{
					   input_matTm0 = v1;
					   input_matTm1 = v3;
					   output_mat = v2; 
					}
					else if (first_input == 3)
					{
						input_matTm0 = v2;
						input_matTm1 = v1;
						output_mat = v3; 
					}
					first_input = first_input + 1;
					if(first_input == 4) {
					   first_input = 1;
					}
 
                cur_pos = global_id_x + 0 + (global_id_y + 0) * width;
                left_pos = cur_pos - 1;
                right_pos = cur_pos + 1;
                top_pos = global_id_x + 0 + (global_id_y + 0 - 1) * width;
                bot_pos = global_id_x + 0 + (global_id_y + 0 + 1) * width;
 
                if(global_id_x > padding - 1 && global_id_x < width - padding && global_id_y > padding - 1 && global_id_y < height - padding) {
					output_mat[cur_pos] = input_matTm0[right_pos] - 4 * input_matTm0[cur_pos] + input_matTm0[left_pos] + input_matTm0[top_pos] + input_matTm0[bot_pos] + 2 * input_matTm0[cur_pos] - input_matTm1[cur_pos];
 
					if(border == 1) {
						//barrier(CLK_GLOBAL_MEM_FENCE);
						if(global_id_x == 1) {
							output_mat[left_pos] = output_mat[cur_pos];
						}
						if(global_id_x == width - 2) {
							output_mat[right_pos] = output_mat[cur_pos];
						}
						if(global_id_y == 1) {
							output_mat[top_pos] = output_mat[cur_pos];
						}
						if(global_id_y == height - 2) {
							output_mat[bot_pos] = output_mat[cur_pos];
						}
					}
 
					if(damp != 0) {
						//barrier(CLK_GLOBAL_MEM_FENCE);
						output_mat[cur_pos] = output_mat[cur_pos] * damp;
					}	
                } 
 
				while(Act[actIndex] != -1) {
					if(i > Act[actIndex]) {
						output_mat[width * Act[actIndex + 1] + Act[actIndex + 2]] = tau * tau;
					}
					actIndex = actIndex + 3;
				}
 
				//barrier(CLK_GLOBAL_MEM_FENCE);
 
                i++;
            }
		}