Hi all, I'm looking at the n-body example that ships with Snow Leopard (specifically the one in QC 4.0), and I'm trying to understand why the barrier command is used in the kernel (full code below).
I understand that it synchronizes the work-items within the same work-group, but what benefit does that provide in this case?


Code:
/*
 * Advance one body by one time step.
 *
 * Each work-item owns the body at get_global_id(0). The work-group walks
 * over all bodies one tile at a time (tile size = get_local_size(0)):
 * every work-item copies one position of the tile into sharedPos, then
 * every work-item accumulates the interaction of its own body with all
 * positions in the tile.
 *
 * oldPos, oldVel  input positions / velocities; pos.w is carried through
 *                 unchanged as the body's mass
 * newPos, newVel  output positions / velocities
 * deltaTime       integration step
 * numBodies       number of valid entries in the four arrays
 * damping         factor applied to the updated velocity
 * softening       squared and passed on to bodyBodyInteraction()
 * sharedPos       __local scratch, at least get_local_size(0) elements
 *
 * The work-group size must be a multiple of the number of interactions
 * per inner-loop iteration selected by LOOP_UNROLL (1, 2, 4 or 8);
 * otherwise the inner loop reads past the loaded tile.
 */
__kernel void integrateNBodySystem(__global const float4 *oldPos, __global const float4 *oldVel,
                                   __global float4 *newPos, __global float4 *newVel, float deltaTime,
                                   const int numBodies, const float damping, const float softening,
                                   __local float4 *sharedPos)
{
	const float4	zero = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
	const int	index = get_global_id(0);
	const int	tidx = get_local_id(0);
	const int	blockDimX = get_local_size(0);
	const int	active = index < numBodies;
	const float	softeningSq = softening * softening;
	float4		pos = zero;
	float4		acc = zero;
	int		i, j;

	/*
	 * Work-items past the end of the arrays must not touch global memory,
	 * but they may NOT return early either: every work-item of the group
	 * has to reach both barriers below.
	 */
	if (active)
		pos = oldPos[index];

	for (i = 0; i < numBodies; i += blockDimX) {
		const int src = i + tidx;

		/*
		 * Pad a partial last tile with an all-zero body instead of
		 * reading beyond oldPos.
		 * NOTE(review): this assumes bodyBodyInteraction() scales its
		 * result by the first argument's w (mass), so a zero-mass body
		 * contributes nothing -- confirm against its definition.
		 */
		if (src < numBodies)
			sharedPos[tidx] = oldPos[src];
		else
			sharedPos[tidx] = zero;

		/*
		 * Each work-item wrote only sharedPos[tidx] but is about to read
		 * every slot; wait until the whole tile has been written.
		 */
		barrier(CLK_LOCAL_MEM_FENCE);

		for (j = 0; j < blockDimX; ) {
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
#if LOOP_UNROLL >= 1
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
#endif
#if LOOP_UNROLL >= 2
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
#endif
#if LOOP_UNROLL >= 4
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
			acc += bodyBodyInteraction(sharedPos[j++], pos, softeningSq);
#endif
		}

		/*
		 * The next iteration overwrites sharedPos[tidx]; wait until all
		 * work-items have finished reading the current tile first.
		 */
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	if (active) {
		const float	mass = pos.w;
		float4		vel = oldVel[index];

		vel += acc * deltaTime;
		vel *= damping;

		pos += vel * deltaTime;
		/* The vector update above also changed w; restore the mass. */
		pos.w = mass;

		newPos[index] = pos;
		newVel[index] = vel;
	}
}