I am trying to write a function that performs a few vector operations, which are followed by a reduction operation.

However, I am confused how I go about incorporate the parallel reduction example(using work groups). Or if the reduction operation is correct in this case?

Kernel code is a hack, as I tried to incorporate a reduction example into an existing vector routine.
Code :
__kernel void vSubRed( __global float * a, __global float * b, __global float * c,__local float * temp,  __local float * local_sums, __global float * partial_sums, const unsigned int count)
{
   int i = get_global_id(0);
   int num_work_items = get_local_size(0);
   int local_id = get_local_id(0);
   float accum = 0.0f;
   float sum = 0.0f;
   float8 partial_sum_vec = 0.0f;
   int jstart = 0;
   int jend = 0;
 
   if(i < count)
   {
       a[i] = b[i] - a[i];
       c[i] = a[i];
       temp[i] = a[i] * a[i];
 
       jstart = (group_id * num_work_items + local_id) * count;
       jend = jstart + count;
 
       for(int j = jstart; j < jend; j+8)
       {
            // Not sure how to assign the values from the vector temp to the float vector
           // there is probably a more elegant way than below
            partial_sum_vec.s0 = temp[j];
            partial_sum_vec.s1 = temp[j+1];
            partial_sum_vec.s2 = temp[j+2];
            partial_sum_vec.s3 = temp[j+3];
            partial_sum_vec.s4 = temp[j+4];
            partial_sum_vec.s5 = temp[j+5];
            partial_sum_vec.s6 = temp[j+6];
            partial_sum_vec.s7 = temp[j+7];
 
           // Accumulate in parallel the values into a float8
           accum += partial_sum_vec.s0 + partial_sum_vec.s1 + partial_sum_vec.s2 + partial_sum_vec.s3 + partial_sum_vec.s4 +
                         partial_sum_vec.s5 + partial_sum_vec.s6 + partial_sum_vec.s7;
       }
 
       local_sums[local_id] = accum;
       barrier(CLK_LOCAL_MEM_FENCE);
 
       if(local_id == 0)
       {
            for(int k = 0; k < num_work_items; k++)
            {
                sum += local_sums[k];
            }
           partial_sums[i] = sum;
       }
   }
}

Thanks in advance
David