Hello,

i have bayer demosaic algorithms for C++ and optimized assembler version. Now i want to develop a openCL implemenation to compare performances. I expect a much higher performance but was really surprised how slow my kernel was. (My first stupid implementation)

I am new to openCL and so i want to ask for hints, how i can improve the performance.

First my system: i5 with 4x3GHz 8 GB RAM, Radeon HD6870, W7 64 bit, with AMD APP SDKv2.

Performance measure over 100 trys, input image size 2336x1752 output should be a RGB Image. I know that RGBA image has a better memory alginment, but i need RGB to.

Values are bayer demosaic operations per second
C++ Single: 26
ASM Single: 120
C++ openMP: 89
ASM OpenMP: 440
openCL: 120

And here my openCL kernel and thank you for your help.

Code :
__kernel void convert_3x3_Bayer8_to_RGB8_GR_NoBorder( __global uchar * oDestination, __global uchar * iSource, int iWidth, int iHeight)
{
  int x = get_global_id(0);
  int y = get_global_id(1);
 
  // 4 pixels per call, 2 pixels of 2 lines
  int sourcePixelIndex = (2 * y) * iWidth + 2 * x;
 
  int destWidth = 2 * get_global_size(0);
  int destY     = 2 * y;
  int destX     = 2 * x;
 
  uchar4 line_0;
  uchar4 line_1;
  uchar4 line_2;
  uchar4 line_3;
 
  // vector access .x .y .z. w
  line_0.x = iSource[sourcePixelIndex];
  line_0.y = iSource[sourcePixelIndex+1];
  line_0.x = iSource[sourcePixelIndex+2];
  line_0.w = iSource[sourcePixelIndex+3];
 
  sourcePixelIndex += iWidth;
  line_1.x = iSource[sourcePixelIndex];
  line_1.y = iSource[sourcePixelIndex+1];
  line_1.z = iSource[sourcePixelIndex+2];
  line_1.w = iSource[sourcePixelIndex+3];
 
  sourcePixelIndex += iWidth;
  line_2.x = iSource[sourcePixelIndex];
  line_2.y = iSource[sourcePixelIndex+1];
  line_2.z = iSource[sourcePixelIndex+2];
  line_2.w = iSource[sourcePixelIndex+3];
 
  sourcePixelIndex += iWidth;
  line_3.x = iSource[sourcePixelIndex];
  line_3.y = iSource[sourcePixelIndex+1];
  line_3.z = iSource[sourcePixelIndex+2];
  line_3.w = iSource[sourcePixelIndex+3];
 
  // first pixel first line
  ushort red_00   = hadd(line_0.y, line_2.y);
  ushort green_00 = line_1.y;
  ushort blue_00  = hadd(line_1.x, line_1.z);
 
  // second pixel first line
  ushort red_01    = (line_0.y + line_0.w + line_2.y + line_2.w) / 4;
  ushort green_01  = (line_0.z + line_2.z + line_1.y + line_1.w) / 4;
  ushort blue_01   = line_1.z;
 
  // first pixel second line
  ushort red_10    = line_2.y;
  ushort green_10  = (line_1.y + line_3.y + line_2.x + line_2.z) / 4;
  ushort blue_10   = (line_1.x + line_1.z + line_3.x + line_3.z) / 4;
 
  // second pixel second line
  ushort red_11    = hadd(line_2.y, line_2.w);
  ushort green_11  = line_2.z;
  ushort blue_11   = hadd(line_1.z, line_3.z);
 
  // first pixel first line
  int destPixelIndex = ( destY * destWidth + destX) * 3;
  oDestination[destPixelIndex]    = red_00;
  oDestination[destPixelIndex+1]  = green_00;
  oDestination[destPixelIndex+2]  = blue_00;
 
  // second pixel first line
  oDestination[destPixelIndex+3]  = red_01;
  oDestination[destPixelIndex+4]  = green_01;
  oDestination[destPixelIndex+5]  = blue_01;
 
  // first pixel second line
  destPixelIndex += destWidth * 3;
  oDestination[destPixelIndex]    = red_10;
  oDestination[destPixelIndex+1]  = green_10;
  oDestination[destPixelIndex+2]  = blue_10;
 
  // second pixel second line
  oDestination[destPixelIndex+3]  = red_11;
  oDestination[destPixelIndex+4]  = green_11;
  oDestination[destPixelIndex+5]  = blue_11;
}