Hello Everyone,

I've written OpenCL code for a fractal-encoding function (fixed partitioning). I get the same encoding results from the C-only code and the OpenCL-based code (Intel Core i5-3210M 2.5GHz, HD Graphics 4000). My OpenCL code executes 5 times faster on a better GPU, an Nvidia GeForce GTX 660. I would like feedback on my code: is there a possibility of optimizing it to get better results on the Intel integrated GPU? Please help and review the code. The steps of execution are as follows:


Host side:
1) Read 256x256 gray scale 8bit image
2) Divide the image into range blocks (4x4), rMobj, and domain blocks (range size * 2), dMobj, and store them in 1D arrays (pools of ranges and domains).
3) Average each domain block (each 4-pixel group of the domain block is averaged) and store the result in the 1D array dMobj. The averaged domain block becomes 4x4 in size.
4) Apply transformations, i.e. rotate and flip the domain blocks at different angles (0: dMobj, 90: d90Mobj, 180, 270, ...).

Kernel side :
Compare (using mean squared error) each range block with each domain block and its transformations, find the best-matching domain block, and store it. Here I've used a tolerance parameter: if MSE < tolerance, that domain block is saved and further domain blocks are not searched (this saves a lot of time). The best-matching domain block is returned to the host, which writes it to the file.

rCount is the number of range blocks.
dCount is the number of domain blocks.
rSize is the range size (4, for 4x4 blocks).
Curr_domain is a temporary memory block used in the for loop.
iType is the index (0-7) of the transformation applied to the domain blocks.

Kernel code *****************************************

/*
 * calculateRms - for one range block, find the best-matching domain block.
 *
 * Each work-item (global id 0) handles range block rMobj[id]. It walks the
 * dCount domain blocks and, for each, the eight pre-transformed variants in
 * the order dMobj, d90Mobj, d180Mobj, d270Mobj, ddiaMobj, ddia90Mobj,
 * ddia180Mobj, ddia270Mobj. For every candidate it computes the least-squares
 * scale s and offset o and the resulting mean squared error d, and keeps the
 * candidate with the smallest d. The winner is written to mappingMobj[id].
 *
 * Parameters:
 *   rMobj        range blocks; intensity[] of each is read for rSize*rSize pixels
 *   dMobj ...    eight arrays of domain blocks, one per transformation;
 *   ddia270Mobj  intensity[], x and y of each entry are read
 *   mappingMobj  output, one entry per range block (s, o, trans, dX, dY)
 *   rCount       number of range blocks; work-items with id >= rCount do nothing
 *   dCount       number of domain blocks per transformation array
 *   rSize        edge length of a block in pixels
 *   Curr_domain  unused; kept only so the host-side argument list stays valid
 */
__kernel void calculateRms( __global struct rangeBlock* rMobj, __global struct domainBlock* dMobj, __global struct domainBlock* d90Mobj,
__global struct domainBlock* d180Mobj, __global struct domainBlock* d270Mobj, __global struct domainBlock* ddiaMobj,
__global struct domainBlock* ddia90Mobj, __global struct domainBlock* ddia180Mobj, __global struct domainBlock* ddia270Mobj,
__global struct rdmapping* mappingMobj, int rCount , int dCount ,int rSize, __global struct domainBlock* Curr_domain)
{
    const float tolerance = 10.0f;
    const int range_idx = (int)get_global_id(0);

    /* The NDRange may be padded past rCount; such work-items must not touch memory. */
    if (range_idx >= rCount)
    {
        return;
    }

    /* The kernel argument is no longer used as a scratch pointer. */
    (void)Curr_domain;

    /* Transformation index -> domain array, in the same order as the original if-chain. */
    __global struct domainBlock* variants[8] = {
        dMobj, d90Mobj, d180Mobj, d270Mobj,
        ddiaMobj, ddia90Mobj, ddia180Mobj, ddia270Mobj
    };

    const int iArea = rSize * rSize;

    /*
     * Sums over the range block do not depend on the candidate domain, so they
     * are computed once. Integer sums are exact, so visiting the pixels in
     * linear order gives the same values as the original i/j nesting.
     */
    long suma = 0, sumaa = 0;
    for (int p = 0; p < iArea; p++)
    {
        long a = (int)(rMobj[range_idx].intensity[p]);
        suma += a;
        sumaa += a * a;
    }

    /* Defined result even when dCount == 0 (no candidate ever wins). */
    struct rdmapping best;
    best.s = 0;
    best.o = 0;
    best.trans = 0;
    best.dX = 0;
    best.dY = 0;
    float best_err = 9999999999.0f;

    for (int k = 0; k < dCount; k++)
    {
        float d = 0;

        for (int iType = 0; iType < 8; iType++)
        {
            __global struct domainBlock* cand = variants[iType];

            long sumbb = 0, sumab = 0, sumb = 0;
            for (int p = 0; p < iArea; p++)
            {
                long a = (int)(rMobj[range_idx].intensity[p]);
                long b = (int)(cand[k].intensity[p]);
                sumbb += b * b;
                sumab += a * b;
                sumb += b;
            }

            /*
             * The arithmetic below is kept expression-for-expression (including
             * the double casts) so results match the reference implementation.
             * NOTE(review): double needs fp64 support on the device - confirm
             * for every target, or switch to float and re-validate the output.
             */
            float s, o;
            const long denom = iArea * sumbb - sumb * sumb;
            if (denom == 0)
                s = 0;  /* flat domain block: scale is undefined, use offset only */
            else
                s = ((double)(iArea * sumab - suma * sumb)) / ((double)denom);

            o = ((double)(suma - s * sumb)) / ((double)iArea);
            d = ((double)(sumaa + s * (s * sumbb - 2 * sumab + 2 * o * sumb) + o * (o * iArea - 2 * suma))) / ((double)iArea);

            if (d < best_err)
            {
                best.trans = iType;
                best.o = o;
                best.s = s;
                best_err = d;
                best.dX = cand[k].x;
                best.dY = cand[k].y;
            }
        }

        /*
         * Early exit. d is the error of the last transformation only, exactly
         * as in the original code.
         * NOTE(review): testing best_err instead would stop as soon as any
         * candidate is within tolerance, but would change which block is chosen.
         */
        if (d < tolerance)
        {
            break;
        }
    }

    mappingMobj[range_idx].s = best.s;
    mappingMobj[range_idx].o = best.o;
    mappingMobj[range_idx].trans = best.trans;
    mappingMobj[range_idx].dX = best.dX;
    mappingMobj[range_idx].dY = best.dY;
}