flyingtabmow

06-17-2010, 03:34 AM

I'm in the process of writing a 2D (non-separable) convolution routine. I'm trying a couple of approaches towards optimizing the routine, one of which is to use images/textures. I've written a test kernel (only for multiple of 8 sized convolution filters at the moment, but I'm just trying to get baseline measurements) and it runs great on the CPU device. Changing to the GPU however yields a totally different result however (and literally all I'm changing is the C to a G in the context creation call). Here is the code for the kernel:

const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |

CLK_ADDRESS_CLAMP_TO_EDGE |

CLK_FILTER_NEAREST;

kernel void filter(read_only image2d_t image,

constant float *filter,

int filt_width, int filt_height,

write_only image2d_t output) {

int2 ic = (int2)(get_global_id(0), get_global_id(1));

int2 fc;

int filt_indx = 0;

filt_width /= 4;

float4 sum = 0;

if (filt_width%2 == 0) {

int half_filt_width = filt_width/2;

int half_filt_height = filt_height/2;

float4 imval1, imval2, fval;

for (fc.y = -half_filt_height; fc.y < half_filt_height; fc.y++) {

fc.x = -half_filt_width;

imval2 = read_imagef(image, sampler, ic + fc);

for (fc.x++; fc.x <= half_filt_width; fc.x++) {

imval1 = imval2;

imval2 = read_imagef(image, sampler, ic + fc);

fval = vload4(filt_indx++, filter);

sum.x += fval.x*imval1.x;

sum.x += fval.y*imval1.y;

sum.x += fval.z*imval1.z;

sum.x += fval.w*imval1.w;

sum.y += fval.x*imval1.y;

sum.y += fval.y*imval1.z;

sum.y += fval.z*imval1.w;

sum.y += fval.w*imval2.x;

sum.z += fval.x*imval1.z;

sum.z += fval.y*imval1.w;

sum.z += fval.z*imval2.x;

sum.z += fval.w*imval2.y;

sum.w += fval.x*imval1.w;

sum.w += fval.y*imval2.x;

sum.w += fval.z*imval2.y;

sum.w += fval.w*imval2.z;

}

}

}

else {

}

write_imagef(output, ic, sum);

}

The code I'm running it with is pretty straightforward (I load a JPEG file, construct the convolution filter, set up the image for the output, enqueue the kernel, read the output back out). For instance, here is how the images are constructed (I'm using the C++ bindings):

cl::Image2D im1Buf(context, CL_MEM_COPY_HOST_PTR,

cl::ImageFormat(CL_RGBA, CL_FLOAT),

image1.width()/nelems, image1.height(),

image1.width()*sizeof(float),

&image1.data()[0]);

cl::Image2D output(context, CL_MEM_READ_WRITE,

cl::ImageFormat(CL_RGBA, CL_FLOAT),

image1.width()/nelems, image1.height(),

image1.width()*sizeof(float));

As I said, this works great on the CPU, and flies like the wind on the GPU, but doesn't get the correct result. Any ideas what might be going on?

const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |

CLK_ADDRESS_CLAMP_TO_EDGE |

CLK_FILTER_NEAREST;

kernel void filter(read_only image2d_t image,

constant float *filter,

int filt_width, int filt_height,

write_only image2d_t output) {

int2 ic = (int2)(get_global_id(0), get_global_id(1));

int2 fc;

int filt_indx = 0;

filt_width /= 4;

float4 sum = 0;

if (filt_width%2 == 0) {

int half_filt_width = filt_width/2;

int half_filt_height = filt_height/2;

float4 imval1, imval2, fval;

for (fc.y = -half_filt_height; fc.y < half_filt_height; fc.y++) {

fc.x = -half_filt_width;

imval2 = read_imagef(image, sampler, ic + fc);

for (fc.x++; fc.x <= half_filt_width; fc.x++) {

imval1 = imval2;

imval2 = read_imagef(image, sampler, ic + fc);

fval = vload4(filt_indx++, filter);

sum.x += fval.x*imval1.x;

sum.x += fval.y*imval1.y;

sum.x += fval.z*imval1.z;

sum.x += fval.w*imval1.w;

sum.y += fval.x*imval1.y;

sum.y += fval.y*imval1.z;

sum.y += fval.z*imval1.w;

sum.y += fval.w*imval2.x;

sum.z += fval.x*imval1.z;

sum.z += fval.y*imval1.w;

sum.z += fval.z*imval2.x;

sum.z += fval.w*imval2.y;

sum.w += fval.x*imval1.w;

sum.w += fval.y*imval2.x;

sum.w += fval.z*imval2.y;

sum.w += fval.w*imval2.z;

}

}

}

else {

}

write_imagef(output, ic, sum);

}

The code I'm running it with is pretty straightforward (I load a JPEG file, construct the convolution filter, set up the image for the output, enqueue the kernel, read the output back out). For instance, here is how the images are constructed (I'm using the C++ bindings):

cl::Image2D im1Buf(context, CL_MEM_COPY_HOST_PTR,

cl::ImageFormat(CL_RGBA, CL_FLOAT),

image1.width()/nelems, image1.height(),

image1.width()*sizeof(float),

&image1.data()[0]);

cl::Image2D output(context, CL_MEM_READ_WRITE,

cl::ImageFormat(CL_RGBA, CL_FLOAT),

image1.width()/nelems, image1.height(),

image1.width()*sizeof(float));

As I said, this works great on the CPU, and flies like the wind on the GPU, but doesn't get the correct result. Any ideas what might be going on?