deNorma

12-17-2009, 07:08 PM

Greetings,

I have a problem when trying to find the maximum value in a work-group. Say every work-item has a value 'thread_c', and I want to find the largest one within the work-group and then use that maximum for later calculations in every work-item of that work-group. I got this working in CUDA and have now simply 'copied' it to OpenCL, but I don't know why it has a problem... The computed max value is wrong. I have copied my code (CUDA and OpenCL) below, and any help will be appreciated!

CUDA code:

-----------------

/*

Block-wide maximum of thread_c, broadcast back to every thread.

every thread has its own value thread_c

all thread_c are bigger than zero

(the scan below starts from 0.0, so it relies on this)

*/

// One shared-memory slot per thread for the per-thread inputs.
// NOTE(review): assumes blockDim.x == BLOCK_SIZE -- confirm against the launch.
__shared__ float array[BLOCK_SIZE];

array[threadIdx.x]=thread_c;

// Shared buffer thread 0 fills with the block maximum (one copy per thread).
__shared__ float s_c[BLOCK_SIZE];

// Barrier: every thread's write to array[] must land before thread 0 reads it.
__syncthreads();

// Only thread 0 performs the sequential scan; the other threads wait at the
// barrier that follows this branch.
if (threadIdx.x==0)

{

float temp_c_max;

// NOTE(review): 0.0 is a double literal; 0.0f would avoid the implicit conversion.
temp_c_max=0.0;

// Linear scan over all BLOCK_SIZE published values, keeping the largest.
for (int i=0; i<BLOCK_SIZE; i++)

{

if (array[i]>temp_c_max) temp_c_max=array[i];

}

// Write the maximum into every slot of s_c.
for (int i=0; i<BLOCK_SIZE; i++)

{

s_c[i]=temp_c_max;

}

}

// Barrier: s_c[] must be completely written before any thread reads its slot.
__syncthreads();

// Each thread picks up the block maximum from its own slot.

float c;

c=s_c[threadIdx.x];

// From here on every thread uses c (the block maximum) in its own calculation.

-------------------------

OpenCL code:

-----------------------

// Index of this work-item within its work-group along dimension 0.
localIdX=get_local_id(0);

/*

Work-group-wide maximum of thread_c, broadcast back to every work-item.

every work-item has its own value thread_c

NOTE(review): the scan below starts from 0.0, so it presumably relies on all
thread_c being bigger than zero -- confirm.

*/

// One local-memory slot per work-item for the per-work-item inputs.
// NOTE(review): assumes the local work size equals SIZEofWORKGROUP; slots that
// no work-item writes would be read uninitialized by the scan -- TODO confirm.
// NOTE(review): OpenCL C expects __local variables to be declared at kernel
// function scope -- confirm this declaration is not nested in an inner block.
__local float array[SIZEofWORKGROUP];

array[localIdX]=thread_c;

// Local buffer work-item 0 fills with the group maximum (one copy per work-item).
__local float s_c[SIZEofWORKGROUP];

// Barrier: every work-item's write to array[] must land before work-item 0 reads it.
// Only __local data is shared in this fragment; the global fence flag is extra here.
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);

// Only work-item 0 performs the sequential scan; the others wait at the
// barrier that follows this branch.

if (localIdX==0)

{

float temp_c_max;

temp_c_max=0.0;

// Linear scan over all SIZEofWORKGROUP published values, keeping the largest.
// NOTE(review): i is not declared in this fragment -- presumably declared
// earlier in the kernel; verify its type and scope.
for (i=0; i<SIZEofWORKGROUP; i++)

{

if (array[i]>temp_c_max) temp_c_max=array[i];

}

// Write the maximum into every slot of s_c.
for (i=0; i<SIZEofWORKGROUP; i++)

{

s_c[i]=temp_c_max;

}

}

// Barrier: s_c[] must be completely written before any work-item reads its slot.
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);

// Each work-item picks up the group maximum from its own slot.

float c;

c=s_c[localIdX];

//

// From here on every work-item uses c (the group maximum) in its own calculation.

------------------------------

I am sure everything before these two code snippets is correct, and each thread_c is also correct; only this max c is wrong. Thanks in advance.

N

I have a problem when trying to find the maximum value in a work-group. Say every work-item has a value 'thread_c', and I want to find the largest one within the work-group and then use that maximum for later calculations in every work-item of that work-group. I got this working in CUDA and have now simply 'copied' it to OpenCL, but I don't know why it has a problem... The computed max value is wrong. I have copied my code (CUDA and OpenCL) below, and any help will be appreciated!

CUDA code:

-----------------

/*

Block-wide maximum of thread_c, broadcast back to every thread.

every thread has its own value thread_c

all thread_c are bigger than zero

(the scan below starts from 0.0, so it relies on this)

*/

// One shared-memory slot per thread for the per-thread inputs.
// NOTE(review): assumes blockDim.x == BLOCK_SIZE -- confirm against the launch.
__shared__ float array[BLOCK_SIZE];

array[threadIdx.x]=thread_c;

// Shared buffer thread 0 fills with the block maximum (one copy per thread).
__shared__ float s_c[BLOCK_SIZE];

// Barrier: every thread's write to array[] must land before thread 0 reads it.
__syncthreads();

// Only thread 0 performs the sequential scan; the other threads wait at the
// barrier that follows this branch.
if (threadIdx.x==0)

{

float temp_c_max;

// NOTE(review): 0.0 is a double literal; 0.0f would avoid the implicit conversion.
temp_c_max=0.0;

// Linear scan over all BLOCK_SIZE published values, keeping the largest.
for (int i=0; i<BLOCK_SIZE; i++)

{

if (array[i]>temp_c_max) temp_c_max=array[i];

}

// Write the maximum into every slot of s_c.
for (int i=0; i<BLOCK_SIZE; i++)

{

s_c[i]=temp_c_max;

}

}

// Barrier: s_c[] must be completely written before any thread reads its slot.
__syncthreads();

// Each thread picks up the block maximum from its own slot.

float c;

c=s_c[threadIdx.x];

// From here on every thread uses c (the block maximum) in its own calculation.

-------------------------

OpenCL code:

-----------------------

// Index of this work-item within its work-group along dimension 0.
localIdX=get_local_id(0);

/*

Work-group-wide maximum of thread_c, broadcast back to every work-item.

every work-item has its own value thread_c

NOTE(review): the scan below starts from 0.0, so it presumably relies on all
thread_c being bigger than zero -- confirm.

*/

// One local-memory slot per work-item for the per-work-item inputs.
// NOTE(review): assumes the local work size equals SIZEofWORKGROUP; slots that
// no work-item writes would be read uninitialized by the scan -- TODO confirm.
// NOTE(review): OpenCL C expects __local variables to be declared at kernel
// function scope -- confirm this declaration is not nested in an inner block.
__local float array[SIZEofWORKGROUP];

array[localIdX]=thread_c;

// Local buffer work-item 0 fills with the group maximum (one copy per work-item).
__local float s_c[SIZEofWORKGROUP];

// Barrier: every work-item's write to array[] must land before work-item 0 reads it.
// Only __local data is shared in this fragment; the global fence flag is extra here.
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);

// Only work-item 0 performs the sequential scan; the others wait at the
// barrier that follows this branch.

if (localIdX==0)

{

float temp_c_max;

temp_c_max=0.0;

// Linear scan over all SIZEofWORKGROUP published values, keeping the largest.
// NOTE(review): i is not declared in this fragment -- presumably declared
// earlier in the kernel; verify its type and scope.
for (i=0; i<SIZEofWORKGROUP; i++)

{

if (array[i]>temp_c_max) temp_c_max=array[i];

}

// Write the maximum into every slot of s_c.
for (i=0; i<SIZEofWORKGROUP; i++)

{

s_c[i]=temp_c_max;

}

}

// Barrier: s_c[] must be completely written before any work-item reads its slot.
barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);

// Each work-item picks up the group maximum from its own slot.

float c;

c=s_c[localIdX];

//

// From here on every work-item uses c (the group maximum) in its own calculation.

------------------------------

I am sure everything before these two code snippets is correct, and each thread_c is also correct; only this max c is wrong. Thanks in advance.

N