Hi guys!

I'm just curious about the work size used in the oclBoxFilter example that NVIDIA provides.

In the function "BoxFilterGPU", "szLocalWorkSize[0]" is set so that it includes a redundant part (the apron), as shown below:

szLocalWorkSize[0] = MASK_RADIUS_ALIGNED + ROWS_OUTPUT_WIDTH + r;

The apron only needs to be as wide as the radius of the kernel, but it seems to be larger than that (MASK_RADIUS_ALIGNED is 16, which is larger than the radius of the kernel).

I wonder why they made the work size bigger than it needs to be.

I have attached part of the example code below, but I think you will have to open your own copy of the example to be sure.

Thanks!


// Radius of the box-filter mask (number of taps on each side of the centre).
#define MASK_RADIUS 8
// Offset added to the row kernel's local work size and subtracted from each
// work-item's x position. It is larger than MASK_RADIUS (16 vs 8).
// NOTE(review): presumably padded so that loads start on an aligned address -- confirm.
#define MASK_RADIUS_ALIGNED 16
// Full mask width: MASK_RADIUS taps on each side plus the centre tap.
#define MASK_LENGTH (2 * MASK_RADIUS + 1)
// Number of output pixels along x that each row-kernel work-group is stepped by
// (the group ID is multiplied by this to get the group's starting x position).
#define ROWS_OUTPUT_WIDTH 128
// NOTE(review): the COLUMNS_* constants are not referenced in the visible excerpt;
// presumably they size the column-pass work-groups -- confirm against the full sample.
#define COLUMNS_BLOCKDIMX 16
#define COLUMNS_BLOCKDIMY 16
#define COLUMNS_OUTPUT_HEIGHT 128
// Normalisation factor: reciprocal of the mask length.
float fScale = 1.0f/MASK_LENGTH;

...

void BoxFilterGPU(unsigned int* uiInputImage, unsigned int* uiOutputImage, unsigned int uiWidth, unsigned int uiHeight, int r, float fScale)
{
// Copy input data from host to device
ciErrNum = clEnqueueWriteBuffer(cqCommandQue, cmDevBufIn, CL_TRUE, 0, szBuffBytes, uiInputImage, 0, NULL, NULL);
shrCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);

// Set global and local work sizes for row kernel
szLocalWorkSize[0] = MASK_RADIUS_ALIGNED + ROWS_OUTPUT_WIDTH + r;
szLocalWorkSize[1] = 1;
szGlobalWorkSize[0] = iDivUp(uiWidth, ROWS_OUTPUT_WIDTH) * szLocalWorkSize[0];
szGlobalWorkSize[1] = uiHeight;

......

__kernel void BoxRows( __global const uchar4* uc4Source, __global unsigned int* uiDest,
__local uchar4* uc4LocalData,
unsigned int uiWidth, unsigned int uiHeight, float fScale)
{
// Compute x and y pixel coordinates from group ID and local ID indexes
int globalPosX = ((int)get_group_id(0) * ROWS_OUTPUT_WIDTH) + (int)get_local_id(0) - MASK_RADIUS_ALIGNED;
int globalPosY = get_group_id(1);
int iGlobalOffset = globalPosY * uiWidth + globalPosX;

......