I also have another question.

I want to make sure I understand the definition of size_t global_size[] and size_t local_size[] in two dimensions, i.e. the global and local work sizes.

I have to read back from a buffer an array of size QxN (produced by my first kernel), and an array of size Qxk from my second kernel. So I suppose I have to define global_size[] = {Q,N} and local_size[] = {16,16}; I think the local size is similar to the block size in CUDA, so I chose 16x16 as the most appropriate value.
For my second kernel I define global_size[] = {Q,k} and local_size[] = {16,16}.
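
To show what I mean more concretely, this is roughly how I picture the relationship between the two sizes. The round_up helper below is just something I made up for the example, and I am not sure this padding is actually the right approach:

/* sketch: pad each global dimension up to a multiple of the local one,
   since (as far as I understand) OpenCL 1.x requires global % local == 0
   whenever a local size is given explicitly */
size_t round_up(size_t x, size_t multiple) {
    size_t r = x % multiple;
    return (r == 0) ? x : x + (multiple - r);
}

size_t local_size[2]  = {16, 16};                      /* like a 16x16 CUDA block */
size_t global_size[2] = {round_up(Q, local_size[0]),   /* one work-item per query  */
                         round_up(N, local_size[1])};  /* one work-item per object */

/* the kernel would then need a guard such as:
   if (get_global_id(0) >= Q || get_global_id(1) >= N) return; */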

So, when I run my program I pass three arguments: the sizes N, Q and k. If I use
{N=16, Q=16, k=16} or {N=64, Q=64, k=16} everything works, but for {N=128, Q=128, k=16} (and some other combinations) I get error -54, CL_INVALID_WORK_GROUP_SIZE. So I think there is something I haven't understood well, and I would be grateful if someone could help me sort this out. I have read many blogs and sites, but I am posting here again because I want an answer to my specific problem.
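
From what I have read, error -54 can also be returned when the 16x16 = 256 work-items per group exceed what the compiled kernel or the device supports, so I was thinking of printing the limits before enqueueing, roughly like below (just a sketch, I have not verified that this is actually my problem, and it assumes the usual 3 work-item dimensions):

/* sketch: query the limits that CL_INVALID_WORK_GROUP_SIZE can refer to */
size_t max_device_wg, max_kernel_wg, max_item_sizes[3];

clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                sizeof(max_device_wg), &max_device_wg, NULL);
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                sizeof(max_item_sizes), max_item_sizes, NULL);
clGetKernelWorkGroupInfo(kernel_createDist, device, CL_KERNEL_WORK_GROUP_SIZE,
                         sizeof(max_kernel_wg), &max_kernel_wg, NULL);

printf("device max work-group size : %zu\n", max_device_wg);
printf("kernel max work-group size : %zu\n", max_kernel_wg);
printf("max work-item sizes        : %zu x %zu x %zu\n",
       max_item_sizes[0], max_item_sizes[1], max_item_sizes[2]);

/* with local_size = {16,16} I need 16*16 = 256 <= both limits above,
   and (I think) each global_size dimension to be a multiple of 16 */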

Thank you in advance!

Here is the code:

//======================= GPU ==========================//

/* Create device and context */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

if(err < 0) {
perror("Couldn't create a context");
exit(1);
}

/* Build program */
program1 = build_program(context, device, PROGRAM_FILE);

/* Create data buffer */
int numQuery = numQueries;
int D=numDim;

double *data = training_set.data;
double *Query = query_set.data;

size_t global_size[] = {numQuery,numObjects};
size_t local_size[] = {BLK_SIZE,BLK_SIZE};
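/* (as far as I understand, each global_size dimension above has to be an exact
   multiple of the matching local_size dimension, and BLK_SIZE*BLK_SIZE must not
   exceed the kernel's CL_KERNEL_WORK_GROUP_SIZE -- is this where my -54 comes from?) */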


Matrix d_N;
d_N.width = D; //num Dim
d_N.height = numObjects; //num objects

size_t sizeN = D * numObjects * sizeof(double);


d_N.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeN, data, &err);

Matrix d_Q;
d_Q.width = D; //num Dim
d_Q.height = numQuery; //num queries

size_t sizeQ = numQuery * D * sizeof(double);

d_Q.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeQ, Query, &err);

//result
Matrix d_result_Dist;
d_result_Dist.width = numObjects;//num objects
d_result_Dist.height = numQuery; //num queries

size_t sizeDist = numObjects * numQuery * sizeof(double);

d_result_Dist.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist, Dist, &err); // maybe the number of groups needs to be passed in here somehow

if(err < 0) {
perror("Couldn't create a buffer");
exit(1);
};

/* Create a command queue */
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE , &err);
if(err < 0){
perror("Couldn't create a command queue 1");
printf("error %d \n ", err);
exit(1);
};

/* Create the first kernel */
kernel_createDist = clCreateKernel(program1, "kernel_createDist", &err);
if(err < 0) {
perror("Couldn't create a kernel 1");
printf("error %d \n ", err);
exit(1);
};


/* Create kernel arguments (the reference example also passes a __local buffer, but only because its kernel has a __local parameter) */
err = clSetKernelArg(kernel_createDist, 0, sizeof(d_N.elements), (void*)&d_N.elements);
err |= clSetKernelArg(kernel_createDist, 1, sizeof(d_Q.elements), (void*)&d_Q.elements);
err |= clSetKernelArg(kernel_createDist, 2, sizeof(d_result_Dist.elements), (void*)&d_result_Dist.elements);
err |= clSetKernelArg(kernel_createDist, 3, sizeof(d_N.width), (void*)&d_N.width);
err |= clSetKernelArg(kernel_createDist, 4, sizeof(d_result_Dist.width), (void*)&d_result_Dist.width);

if(err < 0) {
perror("Couldn't create a kernel argument 1");
printf("error %d \n ", err);
exit(1);
}
// wait for the queue to complete
clFinish(queue);

/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel_createDist, 2, NULL, global_size, local_size, 0, NULL, &event);

if(err < 0) {
perror("Couldn't enqueue the kernel 1 %d");
printf("error %d \n ", err);
exit(1);
}
// make sure the kernel has finished
clWaitForEvents(1 , &event);

// compute GPU execution time
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

float executionTimeInMilliseconds1 = (end - start) * 1.0e-6f;
printf("[OPENCL] Time elapsed for GPU first kernel: %f s\n", executionTimeInMilliseconds1);

/* Read the kernel's output */
err = clEnqueueReadBuffer(queue, d_result_Dist.elements, CL_TRUE, 0, sizeDist, Dist, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't enqueue the kernel 1 %d");
printf("error %d \n ", err);
exit(1);
}


clReleaseMemObject(d_N.elements);
clReleaseMemObject(d_Q.elements);
clReleaseMemObject(d_result_Dist.elements);


clReleaseKernel(kernel_createDist);
clReleaseCommandQueue(queue);
clReleaseProgram(program1);
clReleaseContext(context);

//~ printf("----Dist gpu---\n");
//~ printMat(Dist,numQuery,numObjects);
//~

//============================= SECOND KERNEL =================================//
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context 2");
printf("error %d \n ", err);
exit(1);
}

program2 = build_program(context, device, PROGRAM_FILE);

size_t global_size2[] = {numQuery,k};
size_t local_size2[] = {BLK_SIZE,BLK_SIZE};

//input
Matrix d_D;
d_D.width = numObjects;//num objects
d_D.height = numQuery; //num queries

size_t sizeDist1 = numObjects * numQuery * sizeof(double);

d_D.elements = clCreateBuffer(context,CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist1, Dist, &err);



//result data
Matrix d_NNidx;
d_NNidx.width = k;
d_NNidx.height = numQuery; //num queries

size_t sizeId = k * numQuery * sizeof(double);

d_NNidx.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeId, NNidx, &err);

//result data
Matrix d_result;
d_result.width = k;
d_result.height = numQuery; //num queries

size_t sizeRe = k * numQuery * sizeof(double);

d_result.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeRe, NNdist, &err);

/* Create a command queue */
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if(err < 0) {
perror("Couldn't create a command queue 2");
printf("error %d \n ", err);
exit(1);
};

kernel_ParallelSorting = clCreateKernel(program2, "kernel_ParallelSorting", &err);

if(err < 0) {
perror("Couldn't create a kernel 2");
printf("error %d \n ", err);
exit(1);
};

/* Create kernel arguments */
err = clSetKernelArg(kernel_ParallelSorting, 0, sizeof(d_D.elements), (void*)&d_D.elements);
err |= clSetKernelArg(kernel_ParallelSorting, 1, sizeof(d_result.elements), (void*)&d_result.elements);
err |= clSetKernelArg(kernel_ParallelSorting, 2, sizeof(d_NNidx.elements), (void*)&d_NNidx.elements);
err |= clSetKernelArg(kernel_ParallelSorting, 3, sizeof(d_D.width), (void*)&d_D.width);
err |= clSetKernelArg(kernel_ParallelSorting, 4, sizeof(d_result.width), (void*)&d_result.width);


if(err < 0) {
perror("Couldn't create a kernel argument 2");
printf("error num %d\n",err);
exit(1);
}

clFinish(queue);

/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel_ParallelSorting, 2, NULL, global_size2, local_size2, 0, NULL, &event);

if(err < 0) {
perror("Couldn't enqueue the kernel 2");
printf("error %d \n ", err);
exit(1);
}

clWaitForEvents(1 , &event);

clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

float executionTimeInMilliseconds2 = (end - start) * 1.0e-6f;
printf("[OPENCL] Time elapsed for GPU second kernel: %f ms\n", executionTimeInMilliseconds2);


//read buffer 1
err = clEnqueueReadBuffer(queue, d_result.elements , CL_TRUE, 0, sizeRe, NNdist, 0, NULL, NULL);

if(err < 0) {
perror("Couldn't read the buffer 2 for d_kDist");
printf("error %d \n ", err);
exit(1);
}


// read buffer 2
err = clEnqueueReadBuffer(queue, d_NNidx.elements, CL_TRUE, 0, sizeId, NNidx, 0, NULL, NULL);

if(err < 0) {
perror("Couldn't read the buffer 2 for d_kDist");
exit(1);
}


printf("+++++++++++++++++++++ knns GPU +++++++++++++++++++++++++\n");
printMat(NNdist,numQueries,k);



clReleaseMemObject(d_D.elements);
clReleaseMemObject(d_result.elements);
clReleaseMemObject(d_NNidx.elements);

clReleaseKernel(kernel_ParallelSorting);
clReleaseCommandQueue(queue);
clReleaseProgram(program2);

clReleaseContext(context);