Parallel Reduction combined with other Operations

I am trying to write a function that performs a few vector operations, which are followed by a reduction operation.

However, I am confused how I go about incorporate the parallel reduction example(using work groups). Or if the reduction operation is correct in this case?

Kernel code is a hack, as I tried to incorporate a reduction example into an existing vector routine.

Code :

__kernel void vSubRed( __global float * a, __global float * b, __global float * c,__local float * temp, __local float * local_sums, __global float * partial_sums, const unsigned int count)
{
int i = get_global_id(0);
int num_work_items = get_local_size(0);
int local_id = get_local_id(0);
float accum = 0.0f;
float sum = 0.0f;
float8 partial_sum_vec = 0.0f;
int jstart = 0;
int jend = 0;
if(i < count)
{
a[i] = b[i] - a[i];
c[i] = a[i];
temp[i] = a[i] * a[i];
jstart = (group_id * num_work_items + local_id) * count;
jend = jstart + count;
for(int j = jstart; j < jend; j+8)
{
// Not sure how to assign the values from the vector temp to the float vector
// there is probably a more elegant way than below
partial_sum_vec.s0 = temp[j];
partial_sum_vec.s1 = temp[j+1];
partial_sum_vec.s2 = temp[j+2];
partial_sum_vec.s3 = temp[j+3];
partial_sum_vec.s4 = temp[j+4];
partial_sum_vec.s5 = temp[j+5];
partial_sum_vec.s6 = temp[j+6];
partial_sum_vec.s7 = temp[j+7];
// Accumulate in parallel the values into a float8
accum += partial_sum_vec.s0 + partial_sum_vec.s1 + partial_sum_vec.s2 + partial_sum_vec.s3 + partial_sum_vec.s4 +
partial_sum_vec.s5 + partial_sum_vec.s6 + partial_sum_vec.s7;
}
local_sums[local_id] = accum;
barrier(CLK_LOCAL_MEM_FENCE);
if(local_id == 0)
{
for(int k = 0; k < num_work_items; k++)
{
sum += local_sums[k];
}
partial_sums[i] = sum;
}
}
}

Thanks in advance

David