I have recently started working with OpenCL, but with my first program I ran into some problems.

This is the code:

Code :
#include <oclUtils.h>
#include "Timer.h"
#define NUM 512
int main()
   cl_platform_id platform;
   cl_uint arrDimension = NUM;
   float *arr1 = new float[NUM];
   float *arr2 = new float[NUM];
   float *arr3 = new float[NUM];
   Timer t1;
   cl_int err = oclGetPlatformID(&platform);
   if(err != CL_SUCCESS)
       printf("O eroare la citirea platformei= %s\n",oclErrorString(err));
   cl_device_id gpuDevice;
   err = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&gpuDevice,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la conectarea la dispozitivul de calcul= %s\n",oclErrorString(err));
   cl_context gpuContext;
   gpuContext = clCreateContext(0,1,&gpuDevice,NULL,NULL,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea contextului= %s\n",oclErrorString(err));
   cl_command_queue gpuCommandQueue;
   gpuCommandQueue = clCreateCommandQueue(gpuContext,gpuDevice,0,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea cozii de comenzi= %s\n",oclErrorString(err));
   cl_program program;
   const char *kernelStr = "__kernel void add(__global float* a, __global float* b, __global float* c)\
                         unsigned int i = get_global_id(0);\
                         c[i] = a[i] + b[i];\
   size_t kernelLength = strlen(kernelStr);
   program = clCreateProgramWithSource(gpuContext,1,&kernelStr,&kernelLength,&err);
   err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la compilarea kernelului= %s\n",oclErrorString(err));
   cl_kernel kernel = clCreateKernel(program,"add",&err);
for(int i=0;i<NUM;i++)
arr1[i] = i;
arr2[i] = i;
   cl_mem buf_a = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*NUM, arr1,&err);
   cl_mem buf_b = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*NUM, arr2,&err);
   cl_mem buf_c = clCreateBuffer(gpuContext,CL_MEM_WRITE_ONLY,sizeof(cl_float)*NUM,arr3,&err);
   err = clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&buf_a);
   err = clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&buf_b);
   err = clSetKernelArg(kernel,2,sizeof(cl_mem),(void *)&buf_c);
   err = clEnqueueNDRangeKernel(gpuCommandQueue,kernel,1,NULL,&arrDimension,0,0,0,0);
   clEnqueueReadBuffer(gpuCommandQueue,buf_c,CL_TRUE,0, NUM*sizeof(cl_float),arr3,0,0,0);
	//for(int i=0;i<NUM;i++)
	//arr3[i]=arr1[i] +arr2[i];
printf("Time: %f\n",t1.getElapsedTimeInMicroSec());
  // for(int i=0;i<NUM;i++)
  // {
  // }

The thing is that the CPU processing time is less than the GPU processing time. Where am I going wrong?

Help would be gratefully appreciated.