Hi,

I found several examples of kernel code for sparse matrix-vector multiplication (with the matrix stored in CSR format), but I don't get the expected result, and I don't understand why.

Here is my kernel code:
/*
 * CSR sparse matrix-vector product, accumulated into the output:
 *     vec_out[row] += sum over i in [row_ptr[row], row_ptr[row+1]) of
 *                     values[i] * vec_in[col_idx[i]]
 *
 * Work distribution: 32 consecutive work-items cooperate on one row
 * (row = get_global_id(0) / 32). Each of them sums every 32nd non-zero of
 * the row, then the 32 partial sums are combined by a tree reduction in
 * local memory.
 *
 * Launch requirements implied by the indexing below:
 *   - the global size in dimension 0 must be at least num_rows * 32,
 *     otherwise the trailing rows are never processed;
 *   - the local size must be a multiple of 32, so that the 32 work-items
 *     of a row sit in the same work-group and red_mem[local_id + stride]
 *     stays inside the row's own 32-slot segment;
 *   - red_mem must provide one float per work-item of the work-group.
 *
 * values   non-zero coefficients of the matrix
 * col_idx  column index of each entry of values
 * row_ptr  num_rows + 1 offsets into values/col_idx delimiting each row
 * vec_in   input vector
 * vec_out  output vector; the result is ADDED to its current content, so
 *          it has to be zero-filled beforehand to obtain a plain product
 * num_rows number of matrix rows
 * red_mem  local scratch buffer used for the reduction
 */
__kernel void spmv(__global float *values, __global int *col_idx, __global int *row_ptr, __global float* vec_in, __global float* vec_out, const int num_rows, __local float *red_mem)
{
    const int items_per_row = 32;
    const int local_id = get_local_id(0);
    const int local_row_offset = local_id % items_per_row;
    const int row = get_global_id(0) / items_per_row;

    /* Padding work-items (row >= num_rows) must NOT return early: every
       work-item of a work-group has to reach the barriers below. They just
       contribute a zero partial sum and never touch global memory. */
    const bool row_valid = row < num_rows;

    float sum = 0.0f;
    if (row_valid)
    {
        const int s = row_ptr[row];
        const int e = row_ptr[row + 1];
        for (int i = s + local_row_offset; i < e; i += items_per_row)
        {
            sum += values[i] * vec_in[col_idx[i]];
        }
    }

    red_mem[local_id] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Tree reduction over the 32 partial sums of the row (16, 8, 4, 2, 1).
       A barrier after every step makes the values written in one step
       visible before the next step reads them. */
    for (int stride = items_per_row / 2; stride > 0; stride >>= 1)
    {
        if (local_row_offset < stride)
        {
            red_mem[local_id] += red_mem[local_id + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* The first work-item of each row holds the complete row sum. */
    if (row_valid && local_row_offset == 0)
    {
        vec_out[row] += red_mem[local_id];
    }
}

And here is how I launch the kernel (blockSize is initialized to 32):

cl::Event ndrEvent;
int sRes = wRes *hRes;
if ((sRes%blockSize)!=0)
sRes = sRes +blockSize-(sRes%blockSize);

cl::NDRange globalSize(sRes,1); //sRes is the size of the output vector
cl::NDRange localSize(blockSize,1);

//I set the kernel's arguments

err = queue.enqueueNDRangeKernel (
kernel,
cl::NullRange,
globalSize,
localSize,
NULL,
&ndrEvent);

I'm kind of desperate, so if someone could help me, that would be great!

Thanks in advance