Hi, i am being able to compile this code but it doesn't give me any output and goes in some indefinite loop (or something else)... i dont understand whats happening. I'm a newbie to OpenCL and this is my first program.. All i'm trying to do is add two vectors. I get no errors during the compilation. If i am not wrong, i should see the 'End' that i've printed after the gpu portion completes right?

This is the machine detail that i'm running the code on:
Linux gpu02.cluster 2.6.18-92.1.22.el5 #1 SMP Tue Dec 16 11:57:43 EST 2008 x86_64 x86_64 x86_64 GNU/Linux


Code :
#include <stdio.h>
#include <CL/cl.h>
 
#define SIZE 10
 
int va[SIZE];
int vb[SIZE];
int vc[SIZE];
 
char* load_program_source(const char*);
 
int Init(){
        int i;
        srand(20);
        for(i=0;i<SIZE;i++){
                va[i]=rand()%10;
                vb[i]=rand()%10;
        }
}
 
char* load_program_source(const char *filename)
{
        int fileSize = 0;
        FILE *pFile = fopen(filename, "r");
        rewind(pFile);
        fseek(pFile, 0, SEEK_END);
        fileSize = ftell(pFile);
        rewind(pFile);
 
        char *data = (char*) calloc(sizeof(char), fileSize+1);
        fread(data, 1, fileSize, pFile);
        data[fileSize]='\0';
        fclose(pFile);
        return data;
}
 
int main(){
        if(!Init()){
                printf("Unable to initialize data");
                return 1;
        }
 
        cl_context GPUContext = clCreateContextFromType(0,CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
        if(!GPUContext){
                printf("Error: Failed to create context");
                return 1;
        }
 
        //Get the list of GPU devices associated with this context
        size_t ParmDataBytes;
        clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
        cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
        clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes,GPUDevices,NULL);
 
        //Create a command queue on first gpu device
        cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0],0,NULL);
        if(!GPUCommandQueue){
                printf("Error: Failed to create a command queue");
                return 1;
        }
 
        //Allocate memory
        cl_mem GPUva = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*SIZE, va, NULL);
        cl_mem GPUvb = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*SIZE, vb, NULL);
        cl_mem GPUvc = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int)*SIZE, NULL, NULL);
 
        //Create OCL program reading the source code from the file
        char * OclSource = load_program_source("vectoradd.cl");
        cl_program  OpenCLProgram = clCreateProgramWithSource(GPUContext,1,(const char**)&OclSource,NULL,NULL);
 
        //Build the program
        clBuildProgram(OpenCLProgram,0,NULL,NULL,NULL,NULL);
 
        //obtain the handle for the kernel
        cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram,"VectorAdd",NULL);
 
       //associate GPU memory with the kernel
        clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUvc);
        clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUvb);
        clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUva);
 
        //Launch the kernel in the GPU
        size_t WorkSize[1] = {SIZE};
        clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd,1,NULL,WorkSize,NULL,0,NULL,NULL);
 
        //copy the result back to the main memory
        clEnqueueReadBuffer(GPUCommandQueue, GPUvc, CL_TRUE,0,sizeof(int) * SIZE, vc, 0, NULL, NULL);
 
        //cleanup
        free(GPUDevices);
        clReleaseKernel(OpenCLVectorAdd);
        clReleaseProgram(OpenCLProgram);
        clReleaseCommandQueue(GPUCommandQueue);
        clReleaseContext(GPUContext);
        clReleaseMemObject(GPUva);
        clReleaseMemObject(GPUvb);
        clReleaseMemObject(GPUvc);
 
        printf("End");
        return 0;
}

this is my kernel code:

Code :
__kernel void VectorAdd(__global int* vc, __global int* vb, __global int* va){
        int i;
        for(i=0;i<100;i++){
                vc[i]=vb[i]+va[i];
        }
}

could anyone please help me with it?

Thanks a lot