I am fairly new to OpenCL and tried to write this small test program that adds two matrices element-wise (don't be confused by the kernel function name, `MatMul`).

The code runs successfully; however, the output matrix is full of zeros instead of the expected value 3 in every element.

Could you please help me find the mistake in my code?


#include <stdlib.h>
#include <stdio.h>

#include <CL/cl.h>

/*
 * OpenCL C source of the element-wise addition kernel, split into one
 * string per source line. The fragments are concatenated by
 * clCreateProgramWithSource, so each one ends in '\n' to keep the
 * device compiler's diagnostics line-accurate.
 *
 * The kernel keeps its historical name "MatMul" (it actually adds) because
 * the host looks it up by that name. The __kernel qualifier is required:
 * without it the function is not an entry point and clCreateKernel fails.
 *
 * Each work-item handles exactly one element, so the host must launch
 * exactly as many work-items as there are elements in the buffers.
 */
const char *OpenCLSource[] = {
    "__kernel void MatMul(__global const int* matAA, __global const int* matBB, __global int* matCC)\n",
    "{\n",
    "    unsigned int i = get_global_id(0);\n",
    "    matCC[i] = matAA[i] + matBB[i];\n",
    "}\n"
};

// Main function
// ************************************************** *******************
int main(int argc, char *argv[])
int clerror = CL_SUCCESS;

cl_mem matAA;
cl_mem matBB;
cl_mem matCC;

const int dsize = 16 * 16;

int matA[dsize];
int matB[dsize];
int matC[dsize];

for(int i = 0; i < dsize; i++)
//matA[i] = matB[i] = rand();
matA[i] = 1;
matB[i] = 2;

// Query platform ID
cl_platform_id platform;
clGetPlatformIDs (1, &platform, NULL);

// Setup context properties
cl_context_properties props[3];
props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
props[1] = (cl_context_properties)platform;
props[2] = (cl_context_properties)0;

// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, NULL);

// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

// Create a command-queue on the first GPU device
cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);

// Allocate GPU memory for source vectors AND initialize from CPU memory
matAA = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * dsize, matA, NULL);
matBB = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * dsize, matB, NULL);
matCC = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * dsize, NULL, NULL);

// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 18, OpenCLSource, NULL, NULL);

// Build the program (OpenCL JIT compilation)
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel matMulKernel = clCreateKernel(OpenCLProgram, "MatMul", NULL);

size_t global_work_size[1];
size_t local_work_size[1];

global_work_size[0] = dsize;
local_work_size[0] = dsize;

// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(matMulKernel, 0, sizeof(cl_mem), (void*)&matA);
clSetKernelArg(matMulKernel, 1, sizeof(cl_mem), (void*)&matB);
clSetKernelArg(matMulKernel, 2, sizeof(cl_mem), (void*)&matC);

// Launch the Kernel on the GPU
clEnqueueNDRangeKernel(GPUCommandQueue, matMulKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);

// Copy the output in GPU memory back to CPU memory
clEnqueueReadBuffer(GPUCommandQueue, matCC, CL_TRUE, 0, global_work_size[0], matC, 0, NULL, NULL);

// Print out the results
for (int i = 0; i < 10; i++)
printf("%d\n", matC[i]);

// Cleanup


return 0;