I recently wrote this kernel in open cl,which is working well and returning correct results.

Code :
spmv_csr_scalar_kernel(const int num_rows,
                       const int * ptr,
                       const int * indices,
                       const float * data,
                       const float * x,
                       float * y )
{
    int row = get_global_id(0);
    if(row < num_rows)
    {
        float dot = 0;
        int row_start = ptr[row];
        int row_end = ptr[row+1];
        for (int jj = row_start; jj < row_end; jj++)
        {  
            dot += data[jj] * x[indices[jj]];
        }
        y[row] += dot;
    }
}

It is multiplication of a sparse matrix in compressed row storage with a column vector.It returns correct result.Uses one work item for each for loop (from row_start to row_end).

I want to convert the above code to using two work items per single for loop.But I am getting incorrect answers.here is what I could come write.

Code :
__kernel void mykernel(__global int* colvector,
                       __global int* val,
                       __global int* result,
                       __global int* index,
                       __global int* rowptr,
                       __global int* sync )
{
    __global int vals[8]={0,0,0,0,0,0,0,0};
    for(int i=0;i<4;i++)
    {
        result[i]=0;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
 
    int thread_id=get_global_id(0);
    int warp_id=thread_id/2;
    int lane=(thread_id)&1;
    int row=warp_id;
 
    if(row<4)
    {
        int row_start = rowptr[row];
        int row_end = rowptr[row+1];
        vals[thread_id]=0;
 
        for (int i = row_start+lane; i<row_end; i+=2)
        {
            vals[thread_id]+=val[i]*colvector[index[i]];
        }
        if(lane==0)
        {
        vals[thread_id]+=vals[thread_id+1];
        }
        if(lane==0)
        {
            result[row] += vals[thread_id];
        }
    }               
}

Can anybody help me plzzzzz?My deadline is tomorrow and its returning incorrect results.