I am using an NVIDIA GPU with OpenCL. When I run the program with Ctrl+F5 (Start Without Debugging), the GPU takes more time than the CPU, but when I run the program with debugging, the CPU takes more time than the GPU. The results I get are:
Start without debugging -> CPU time = 6127 ms, GPU time = 6240 ms
Start with debugging -> CPU time = 18354 ms, GPU time = 9125 ms

What is the reason for this difference?
I am using Visual Studio 2010.
The code is below. What is going wrong? Thanks.


// Hello.cpp : Defines the entry point for the console application.
//

//#include <stdafx.h>
#include<stdio.h>
#include<stdlib.h>
#include<conio.h>
#include<time.h>
#include "CL/cl.h"
#define DATA_SIZE 100000
const char *KernelSource =
"kernel void hello(global float *input , global float *output)\n"\
"{\n"\
" size_t id =get_global_id(0);\n"\
"output[id] =input[id]*input[id];\n"\
"} "
"\n"\
"\n";
//float start_time,end_time;

int main(void)
{
double start_time,end_time;
start_time=clock();
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem input,output;
size_t global;
float inputData[100000];
for(int j=0;j<100000;j++)
{
inputData[j]=(float)j;
}

float results[DATA_SIZE];//={0};

// int i;

//retrieve a list of platform variable
if(clGetPlatformIDs(1,&platform_id,&num_of_platfor ms)!=CL_SUCCESS)
{
printf("Unable to get platform_id\n");
return 1;
}

//try to get supported GPU DEvice
if(clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_CPU,1 ,&device_id,
&num_of_devices)!=CL_SUCCESS)
{
printf("unable to get device_id\n");
return 1;
}

//context properties list -must be terminated with 0
properties[0]=CL_CONTEXT_PLATFORM;
properties[1]=(cl_context_properties) platform_id;
properties[2]=0;

//create a context with the GPU device
context=clCreateContext(properties,1,&device_id,NU LL,NULL,&err);

//create command queue using the context and device
command_queue=clCreateCommandQueue(context,device_ id,0,&err);

//create a program from the kernel source code
program=clCreateProgramWithSource(context,1,(const char**)
&KernelSource,NULL,&err);

//compile the program
err=clBuildProgram(program,0,NULL,NULL,NULL,NULL);
if((err!=CL_SUCCESS))
{
printf("build error \n",err);
size_t len;
char buffer[4096];
//get the build log
clGetProgramBuildInfo(program,device_id,CL_PROGRAM _BUILD_LOG,sizeof(buffer),buffer,&len);
printf("----build Log---\n%s\n",buffer);
exit(1);

// return 1;
}

//specify which kernel from the program to execute
kernel=clCreateKernel(program,"hello",&err);

//create buffers for the input and output
input=clCreateBuffer(context,CL_MEM_READ_ONLY,size of(float)*DATA_SIZE,NULL,NULL);

output=clCreateBuffer(context,CL_MEM_WRITE_ONLY,si zeof(float)*DATA_SIZE,NULL,NULL);

//load data into the input buffer

clEnqueueWriteBuffer(command_queue,input,CL_TRUE,0 ,
sizeof(float)*DATA_SIZE,inputData,0,NULL,NULL);

//set the argument list for the kernel command
clSetKernelArg(kernel,0,sizeof(cl_mem),&input);
clSetKernelArg(kernel,1,sizeof(cl_mem),&output);
global=DATA_SIZE;

//enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue,kernel,1,NULL ,&global,NULL,0,NULL,NULL);
clFinish(command_queue);

//copy the results from out of the buffer
clEnqueueReadBuffer(command_queue,output,CL_TRUE,0 ,sizeof(float)*DATA_SIZE,results,0,
NULL,NULL);

//print the results
printf("output:");
for(int i=0;i<DATA_SIZE;i++)
{
printf("%f\n",results[i]);
//printf("no. of times loop run %d\n",count);
}

//cleanup-release OpenCL resources

clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
end_time=clock();
printf("execution time is%f",end_time-start_time);
_getch();
return 0;

}