PDA

View Full Version : OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)



yile
06-15-2009, 07:29 AM
/*

File: compute_sumints.c

Abstract: source for compute_sum kernel and initialization and runtime
code for summing integers in and OpenCL kernel

Version: 1.0

Disclaimer: IMPORTANT: This Apple software is supplied to you by
Apple Inc. ("Apple") in consideration of your agreement to the
following terms, and your use, installation, modification or
redistribution of this Apple software constitutes acceptance of these
terms. If you do not agree with these terms, please do not use,
install, modify or redistribute this Apple software.

In consideration of your agreement to abide by the following terms, and
subject to these terms, Apple grants you a personal, non-exclusive
license, under Apple's copyrights in this original Apple software (the
"Apple Software"), to use, reproduce, modify and redistribute the Apple
Software, with or without modifications, in source and/or binary forms;
provided that if you redistribute the Apple Software in its entirety and
without modifications, you must retain this notice and the following
text and disclaimers in all such redistributions of the Apple Software.
Neither the name, trademarks, service marks or logos of Apple Inc.
may be used to endorse or promote products derived from the Apple
Software without specific prior written permission from Apple. Except
as expressly stated in this notice, no other rights or licenses, express
or implied, are granted by Apple herein, including but not limited to
any patent rights that may be infringed by your derivative works or by
other works in which the Apple Software may be incorporated.

The Apple Software is provided by Apple on an "AS IS" basis. APPLE
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.

IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

Copyright (C) 2008 Apple Inc. All Rights Reserved.

*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>

static cl_device device;
static cl_context context;

const char *sum_kernel_code =
"__kernel void compute_sum(__global int *a, int n, __local long *tmp_sum, __global long *sum)\n"
"{\n"
" int tid = get_local_thread_id(0);\n"
" int lsize = get_local_thread_size(0);\n"
" int i;\n"
"\n"
" tmp_sum[tid] = 0;\n"
" for (i=tid; i<n; i+=lsize)\n"
" tmp_sum[tid] += a[i];\n"
"\n"
" for (i=lsize/2; i>0; i/=2)\n"
" {\n"
" barrier(CL_GLOBAL_MEM_FENCE);\n"
" if (tid < i)\n"
" tmp_sum[tid] += tmp_sum[tid + i];\n"
" }\n"
"\n"
" if (tid == 0)\n"
" *sum = tmp_sum[0];\n"
"}\n";


static int
verify_sum(int *inptr, long long *outptr, int n)
{
long long r = 0;
int i;

for (i=0; i<n; i++)
{
r += inptr[i];
}

if (r != outptr[0])
{
printf("sum of ints test failed\n");
return -1;
}

printf("sum of ints test passed\n");
return 0;
}

int
compute_sumints(int num_elements, long long *compute_sum, float *compute_time)
{
cl_mem streams[2];
long long sum;
int *input_ptr;
cl_program program;
cl_kernel kernel;
void *values[4];
size_t sizes[4] = { sizeof(cl_mem), sizeof(int), 0, sizeof(cl_mem) };
size_t lengths[1];
unsigned int global_threads[1];
unsigned int local_threads[1];
int err;
unsigned int max_threadgroup_size;
int i;
cl_device_id device_id;
uint64_t t0, t1;
struct mach_timebase_info info;

mach_timebase_info(&info);

printf( "computing sum for %d randomly generated ints\n", num_elements );
input_ptr = malloc(sizeof(int) * num_elements);
for (i=0; i<num_elements; i++)
input_ptr[i] = (int)rand();

err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
if (err != CL_SUCCESS) {
printf( "clGetDeviceGroupInfo failed\n" );
return -1;
}

clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);

lengths[0] = strlen(sum_kernel_code);
program = clCreateProgramWithSource(device, 1, &sum_kernel_code, lengths);
if (!program)
{
printf("clCreateProgramWithSource failed\n");
return -1;
}

err = clBuildProgramExecutable(program, false, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("clBuildProgramExecutable failed\n");
return -1;
}

kernel = clCreateKernel(program, "compute_sum");
if (!kernel)
{
printf("clCreateKernel failed\n");
return -1;
}

streams[1] = clCreateArray(device, CL_MEM_ALLOC_GLOBAL_POOL, sizeof(long long), 1, NULL);
if (!streams[1])
{
printf("clCreateArray failed\n");
return -1;
}

// begin timing
t0 = mach_absolute_time();
streams[0] = clCreateArray(device, CL_MEM_ALLOC_GLOBAL_POOL | CL_MEM_COPY_HOST_PTR, sizeof(int), num_elements, input_ptr);
if (!streams[0])
{
printf("clCreateArray failed\n");
return -1;
}

sizes[0] = sizeof(cl_mem); values[0] = streams[0];
sizes[1] = sizeof(int); values[1] = (void *)&num_elements;
sizes[2] = max_threadgroup_size*sizeof(long long); values[2] = NULL;
sizes[3] = sizeof(cl_mem); values[3] = streams[1];
err = clSetKernelArgs(context, kernel, 4, NULL, values, sizes);
if (err != CL_SUCCESS)
{
printf("clSetKernelArgs failed\n");
return -1;
}

global_threads[0] = max_threadgroup_size;
local_threads[0] = max_threadgroup_size;
err = clExecuteKernel(context, kernel, NULL, global_threads, local_threads, 1, NULL, 0, NULL);
if (err != CL_SUCCESS)
{
printf("clExecuteKernel failed\n");
return -1;
}

err = clReadArray(context, streams[1], false, 0, sizeof(long long), (void *)&sum, NULL);
if (err != CL_SUCCESS)
{
printf("clReadArray failed\n");
return -1;
}
// end timing
t1 = mach_absolute_time();

{

if (compute_time) *compute_time = 1e-9 * (t1 - t0) * info.numer / info.denom;
}
err = verify_sum(input_ptr, &sum, num_elements);

if (compute_sum) *compute_sum = sum;

// cleanup
clReleaseMemObject(streams[0]);
clReleaseMemObject(streams[1]);
clReleaseKernel(kernel);
clReleaseProgram(program);
free(input_ptr);

return err;
}

int
init_compute()
{
cl_device_id compute_device_id[2];
unsigned int num_devices = 0;
int return_value = 0;

return_value = clGetComputeDevices(CL_DEVICE_TYPE_GPU, 2, compute_device_id, &num_devices);
if(return_value || 0 == num_devices) {
printf( "clGetComputeDevices failed (with %d devices available)\n", num_devices );
return -1;
}

device = clCreateDeviceGroup(1, &compute_device_id[0]);
if (!device)
{
printf("clCreateDeviceGroup failed\n");
return -1;
}

context = clCreateContext(0, device);
if (!context)
{
printf("clCreateContext failed\n");
return -1;
}

return 0;
}

void release_compute()
{
clReleaseContext(context);
clReleaseDeviceGroup(device);
}

#if 0
int
main(int argc, char *argv[])
{
if (init_compute())
return -1;

int r = compute_sumints(1024*1024, NULL, NULL);
release_compute();
return r;
}
#endif




Because this forum is not support attachment, so if u want the whole Sample code, pls PM with ur email :)

mitchde
08-04-2009, 11:03 AM
Thanks - nice to see some Mac OS X 10.6 devs here !
I pm éd you for some little demo sources.

mitchde
08-08-2009, 01:49 AM
Anyone other has an OpenCL Example (with Xcode project file) for me ?
Didnt get an answer from above (perhaps he is in holidays ;) )
Thanks

seventhkevin
08-12-2009, 03:07 PM
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>

const char * sProgramSource =
"__kernel void vectorAdd( \n" \
"__global const float * a, \n" \
"__global const float * b, \n" \
"__global float * c) \n" \
"{ \n" \
" // Vector element index \n" \
" int nIndex = get_global_id(0); \n" \
" c[nIndex] = a[nIndex] + b[nIndex]; \n" \
"} \n";

int main (int argc, const char * argv[])
{
const unsigned int cnBlockSize= 512;
const unsigned int cnBlocks =3;
size_t cnDimension = cnBlocks * cnBlockSize;
int err;
cl_device_id device_id;
size_t local;
size_t len;
char buffer[2048];

int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to get device ID\n");
exit(1);
}

err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, &len);
printf("CL_DEVICE_NAME: %s\n", buffer);
err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, &len);
printf("CL_DEVICE_VENDOR: %s\n", buffer);

// create OpenCL device & context
cl_context hContext;
hContext = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create context\n");
exit(1);
}

// create a command queue for our device
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue(hContext, device_id, 0, 0);

// create & compile program
cl_program hProgram;
hProgram = clCreateProgramWithSource(hContext, 1, (const char **) &sProgramSource, NULL, &err);
if (!hProgram || err != CL_SUCCESS)
{
printf("Error: Failed to Create program with source\n");
exit(1);
}

err = clBuildProgram(hProgram, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to build program executable\n");
clGetProgramBuildInfo(hProgram, device_id, CL_PROGRAM_BUILD_LOG,
sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}

// create kernel
cl_kernel hKernel;
hKernel = clCreateKernel(hProgram, "vectorAdd", &err);
if (!hKernel || err != CL_SUCCESS)
{
printf("Error: Failed to create kernel\n");
exit(1);
}

// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
float * pC1 = new float[cnDimension];

memset(pC, 0, cnDimension * sizeof(float));
memset(pC1, 0, cnDimension * sizeof(float));

// initialize host memory
int i;
for(i=0; i < cnDimension; i++)
{
pA[i] = pC[i] = pC1[i] = 0;
pB[i] = i;
// pA[i] = rand() % 10 + 1;
// pB[i] = rand() % 10 + 1;
}

// allocate device memory
cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
hDeviceMemA = clCreateBuffer(hContext,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemB = clCreateBuffer(hContext,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
hDeviceMemC = clCreateBuffer(hContext,
CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);

// setup parameter values
err = 0;
err = clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
err |= clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
err |= clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel args\n");
exit(1);
}

// Get the maximum work-group size for executing the kernel on the device
err = clGetKernelWorkGroupInfo(hKernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: clGetKernelWorkGroupInfo Failed\n");
exit(1);
}

// execute kernel
err = clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: clEnqueueNDRangeKernel Failed\n");
exit(1);
}
// copy results from device back to host
clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),
pC, 0, NULL, NULL);

// wait for command queue
clFinish(hCmdQueue);

bool valid = true;
for(i=0; i < cnDimension; i++)
{
pC1[i] = pA[i] + pB[i];
if (pC[i] != pC1[i])
{
printf("Error: %0.2f != %0.2f\n", pC[i], pC1[i]);
valid = false;
}
}

printf("Number of elements : %d\n", cnDimension);
printf("First Element: %0.2f\n", pC[0]);
printf("Last Element : %0.2f\n\n", pC[cnDimension-1]);
if (valid) {
printf("Test passed\n");
}
else {
printf("Test failed\n");
}


delete[] pA;
delete[] pB;
delete[] pC;
delete[] pC1;

clReleaseMemObject(hDeviceMemA);
clReleaseMemObject(hDeviceMemB);
clReleaseMemObject(hDeviceMemC);
clReleaseProgram(hProgram);
clReleaseKernel(hKernel);
clReleaseCommandQueue(hCmdQueue);
clReleaseContext(hContext);
return 0;
}

mitchde
08-14-2009, 11:40 PM
Thanks !



How about an thing like that (raytracing on GPU, with sample source code buts CUDA).

http://cg.alexandra.dk/2009/08/10/trier ... -tutorial/ (http://cg.alexandra.dk/2009/08/10/triers-cuda-ray-tracing-tutorial/)

mitchde
08-19-2009, 11:36 AM
Sorry,
i tried to compile both OpenCL .c examples with Xode (10.6, as command line projects).
Both cant be compiled.
Different errors - some definition errors (CL... not declared), some compile errors
float *pa = new float (xyz):

Can someone upload that examples as .xcodeproject files (zipped, really small!) which then will work (right settings for Librarys/ compiler + code fixes) ?

Thanks

yile
08-26-2009, 06:13 AM
hey guys, I'm sorry for reply so late...

I just send out the code to you, pls check ur inbox

mitchde
08-26-2009, 12:39 PM
THANKS.
I will look on it.

I got hat example to run with that

const char * sProgramSource =
"__kernel void vectorAdd( \n" \
"__global const float * a, \n" \
"__global const float * b, \n" \
"__global float * c) \n" \
"{ \n" \
" // Vector element index \n" \
" int loop; \n" \
" int nIndex = get_global_id(0); \n" \
" for (loop=1; loop< 5000; loop++) \n" \ *** changed by me to run longer on GPU ***
"{ \n" \


" c[nIndex] = a[nIndex] + b[nIndex]; \n" \

"} \n" \

"} \n";



http://www.macupdate.com/images/screens/uploaded/32266_scr.png


But it will run only on NVIDIAs - from 9600M GT up to GTX 285 - no problems.
9600M GT = 15 sec, GTX285 = 0,8 sec
CPU from 3,8 Sec i7 920@4 GHZ down to 100 sec C2Mobile 2 GHZ.

ATI Users (OS X, 10.6) reported that OS X complete freezes when they run the Bench.

I posted the code (V020) and xcodeproject here:
http://freenet-homepage.de/amichalak/OpenCL2_SRC.zip

Would be fine if i will get some help to fix that freeze problem with ATI.

So, OpenCL is not an "fire & forget" , i must do some extra coding for GPU differences ?
Any help would be fine !

Also, i dont know what to do with that CL.hpp - if i include it, i get > 400 compiler errors.

mitchde
08-27-2009, 03:20 AM
I changed some code of the source part (smaller loop for the vector adds).
Much error handling added.
Works now on ATI 4870 /OS X 10.6), but runs way slower.
ATI 4870 : 4 sec, Geforce 285 : 0,17 sec , Geforce 9600GT : 0,93 sec, Geforce 9600M : 5 sec, Geforce 9400M: 15 sec
Now V025.
Sourcecode same link as post before.

I thried also the OpenCL Example1 from the kind user out of china.
But i get lots of errors at compiling, even if i used you complete "pack" as Xcode project.
Normally should work.
Some cl OpenCL calls are definitly NOT found in the OpenCL Standard ( i checked that).
Also some CL_MAX... constants are not defined and cant be found also in the OpenCL documentation too.
And some cl OpenCL calls have to less values given with.
Question: Did you compiled that own, or do you only have the source and shared that.
If you get that compiled for OS X 10.6, please pm that small execute (zipped) to me.
Example2 gave much less errors like undfinded constants and undefined OpenCL calls,

Question:
I am a bit confused , because the bech works so good an all Nvidias and near not on any ATIs.

mitchde
08-27-2009, 03:34 AM
Examples of errors (OpenCL example1)
compute_sumints.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>
...
static cl_device device;
> no cl_device defined, i must use cl_device_id

err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
> clGetDeviceGroupInfo call doesnt exist, only the clGetDeviceInfo, CL_DEVICE_IDS doesnt exist in OpenCL

clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);
> clGetDeviceConfigInfo doesnt exist in OpenCL, CL_DEVICE_MAX_THREAD_GROUP_SIZE doesnt exist ,only ..._MAX_WORK_GROUP_....


and so on. Was easy to fix cl calls with to less values, but how fix complete unknow openCl calls ?

Thanks for any help.

MicahVillmow
08-27-2009, 11:41 AM
mitchde,
The performance difference comes mainly from the fact that you are not fully utilizing the ATI architecture. The ATI architecture is a 5-way VLIW, where as Nvidia is a scalar architecture. If you want to see performance on ATI hardware, you need to program using the vector types that OpenCL provides.

mitchde
08-27-2009, 10:15 PM
Thanks !

I now will try the float4 insted of float:

/*const char * sProgramSource =
"__kernel void vectorAdd( \n" \
"__global const float * a, \n" \
"__global const float * b, \n" \
"__global float * c) \n" \
"{\n" \
" // Vector element index \n" \
" int loop; \n" \
" int test1; \n" \
" int nIndex = get_global_id(0); \n" \
" for (loop=1; loop< 1000; loop++)\n" \
"{\n" \
" c[nIndex] = a[nIndex] + b[nIndex]; \n" \
" c[nIndex] = c[nIndex] * (a[nIndex] + b[nIndex]); \n" \
" c[nIndex] = c[nIndex] * (a[nIndex] / 2.0 ); \n" \
"}\n" \

"}\n";
*/

const char * sProgramSource =
"__kernel void vectorAdd( \n" \
"__global const float4 * a, \n" \
"__global const float4 * b, \n" \
"__global float4 * c) \n" \
"{\n" \
" // Vector element index \n" \
" int loop; \n" \
" int test1; \n" \
" int nIndex = get_global_id(0); \n" \
"
" c[nIndex] = a[nIndex] + b[nIndex]; \n" \

"}\n";

I changed also the OpenCL calls from float to float4.

hDeviceMemA = clCreateBuffer(hContext,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float4), pA, &err);

(for all 3 : pA, pB, pC)
and
err = clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float4),
pC, 0, NULL, NULL);

I will report what changed, at least it compiled ;)

mitchde
08-27-2009, 10:38 PM
Upps.
I have seen that AMD itself uses float or float2 , not float4.
i now will remove that loop and go back to very basic gpu source code for first steps in OpenCL :)

http://ati.amd.com/technology/streamcom ... pencl.html (http://ati.amd.com/technology/streamcomputing/intro_opencl.html) ) also uses float and not float4 in their own example.

http://ati.amd.com/technology/streamcomputing/images/code1.jpg
http://ati.amd.com/technology/streamcomputing/images/code2.jpg

Groovounet
08-28-2009, 07:40 AM
Does it actual use the GPU already?
I'm quite surprise, nVidia and ATI OpenCL drivers does seem really so I actually wonder how Apple could have OpenCL working

mitchde
08-28-2009, 10:53 AM
Yes indeed, the Example, i modded to an "bench" is running well and valid at least on Nvidia GPUs / Mac OS X Snow Leopard 10.6.

Here some resullts i got posted:

NEW V025 test results !

ATIs (no freezes of the bench anymore , ATI 4870 works now):
Number of OpenCL devices found: 3
OpenCL Device # 0 = Radeon HD 4870
Device 0 is an: GPU with max. 750 MHz and 4 units/cores // 4 cores are wrong !!! //
Now computing - please be patient....
time used: 4.126 seconds

Number of OpenCL devices found: 2
OpenCL Device # 0 = Radeon HD 4870
Device 0 is an: GPU with max. 750 MHz and 4 units/cores
time used: 3.997 seconds
At least with actual drivers and my benchmark ATI benches are useless.
Seems to be that either OpenCL isnt sooo universal (same code run on all GPUs optimized) or bugs in ATI OpenCL part. Maybe some OpenCL PRAGMA settings must set for ATI to get better performance.

NVIDIAs:
Number of OpenCL devices found: 2
OpenCL Device # 0 = GeForce GTX 285
Device 0 is an: GPU with max. 1584 MHz and 240 units/cores
time used: 0.231 seconds
OpenCL Device # 1 = Intel® Core™ i7 CPU 920 4,3GHz
time used: 1.296 seconds

by grue:
Number of OpenCL devices found: 3
OpenCL Device # 0 = GeForce 8800 GT
Device 0 is an: GPU with max. 1500 MHz and 112 units/cores
time used: 0.683 seconds
OpenCL Device # 1 = GeForce GTX 260
Device 1 is an: GPU with max. 1400 MHz and 216 units/cores
time used: 0.365 seconds
OpenCL Device # 2 = Intel® Xeon® CPU X5365 @ 3.00GHz
time used: 3.094 seconds

by moondark
Number of OpenCL devices found: 3
OpenCL Device # 0 = GeForce 9600M GT
Device 0 is an: GPU with max. 1250 MHz and 32 units/cores
time used: 2.798 seconds
OpenCL Device # 1 = GeForce 9400M
Device 1 is an: GPU with max. 1100 MHz and 16 units/cores
time used: 9.549 seconds
OpenCL Device # 2 = Intel® Core™2 Duo CPU P8600 @ 2.40GHz
time used: 15.800 seconds

by antic
Number of OpenCL devices found: 2
OpenCL Device # 0 = GeForce 9500 GT
Device 0 is an: GPU with max. 1350 MHz and 32 units/cores
time used: 3.053 seconds
OpenCL Device # 1 = Intel® Core™2 CPU 6600 @ 3.80GHz
time used: 15.188 seconds

by ricola
Number of OpenCL devices found: 2
OpenCL Device # 0 = GeForce 9400 GT
Device 0 is an: GPU with max. 1375 MHz and 16 units/cores
time used: 3.992 seconds
OpenCL Device # 1 = Intel® Core™2 CPU E7500 @ 3,66 GHz
time used: 12.048 seconds

Groovounet
08-28-2009, 11:13 AM
Niceeeeee!
Does it even support the connection with OpenGL?

mitchde
08-30-2009, 01:13 PM
I now tried an orig. Apple OpenCL Demo .

There is an OpenCL + OpenGL (GLUT) togehter, when you mean that.

Looks like :
CPU
http://www.insanelymac.com/forum/index.php?act=attach&type=post&id=54948

GPU
http://www.insanelymac.com/forum/index.php?act=attach&type=post&id=54949

iollmann
08-31-2009, 09:21 AM
static cl_device device;
> no cl_device defined, i must use cl_device_id

err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
> clGetDeviceGroupInfo call doesnt exist, only the clGetDeviceInfo, CL_DEVICE_IDS doesnt exist in OpenCL

clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);
> clGetDeviceConfigInfo doesnt exist in OpenCL, CL_DEVICE_MAX_THREAD_GROUP_SIZE doesnt exist ,only ..._MAX_WORK_GROUP_....

These are historical interfaces. I believe they are left over from the WWDC 2008 release. They've been removed from the standard and Apple's OpenCL implementation. Rough translations follow:

cl_device -> cl_device_id
clGetDeviceGroupInfo -> clGetContextInfo( CL_CONTEXT_DEVICES)
clGetDeviceConfigInfo(CL_DEVICE_MAX_THREAD_GROUP_S IZE) -> clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE)

The last one might also be translated as clGetKernelWorkGroupInfo, depending on what you are doing.

All and all, a benchmark that looks at array addition is pretty weak. You are mostly just benchmarking memory bandwidth. Even if you have all the data you need in some equivalent of a L1 cache, its still 3 LSU ops for each arithmetic instruction. You folks should work on some more real world examples.

mitchde
09-01-2009, 04:07 AM
"All and all, a benchmark that looks at array addition is pretty weak. You are mostly just benchmarking memory bandwidth. Even if you have all the data you need in some equivalent of a L1 cache, its still 3 LSU ops for each arithmetic instruction. You folks should work on some more real world examples."

You are absoulte right - but for starting+understanding OpenCL coding (not the OpenCL sourcepart !) an weak OpenCL source part is OK ;)

What do you think about that Apple OpenCL example, coding qJulia on GPU ?
I think that OpenCL source part is much more "real parallel gpu programming" than an simple vector add - also i7 CPU´s can do really fast.

I compiled that also and got around 30 FPS in the starting szenes , 10-60 fps in the animation with an fixed 800x800 window.
http://www.insanelymac.com/forum/index.php?act=attach&type=post&id=55138



//
// File: qjulia.c
//
// Abstract: This example shows how to use OpenCL to raytrace a 4d Quaternion Julia-Set
// Fractal and intermix the results of a compute kernel with OpenGL for rendering.
//
// Version: <1.0>
//
/
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef WIDTH
#define WIDTH (512)
#endif
#ifndef HEIGHT
#define HEIGHT (512)
#endif
#define ASPECT ((float)WIDTH / (float)HEIGHT)
#define SQR(x) ((x)*(x))
#define BOUNDING_RADIUS (2.0f)
#define BOUNDING_RADIUS_SQR (SQR(BOUNDING_RADIUS))
#define ESCAPE_THRESHOLD (BOUNDING_RADIUS * 1.5f)
#define DELTA (1e-5f)
#define ITERATIONS (10)
#define EPSILON (0.003f)
#define SHADOWS (0)

////////////////////////////////////////////////////////////////////////////////////////////////////

// Note that __float3_SPI is an unsupported vector type. It is not part of the
// OpenCL specification, and is not officially supported by any platform or vendor
// and it should not be used.

#define FLOAT3_TYPE __float3_SPI
#define FLOAT3_CONSTRUCTOR(x,y,z) ((__float3_SPI){(x),(y),(z)})

////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef FLOAT3_TYPE
#define FLOAT3_TYPE float4
#endif

#ifndef FLOAT3_CONSTRUCTOR(x,y,z)
#define FLOAT3_CONSTRUCTOR(x,y,z) ((float4){(x),(y),(z),(0.0f)})
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

#define float3 FLOAT3_TYPE
#define make_float3(x,y,z) FLOAT3_CONSTRUCTOR(x,y,z)

////////////////////////////////////////////////////////////////////////////////////////////////////

float3 cross3(float3 b, float3 c)
{
return make_float3(mad(b.y, c.z, -b.z * c.y),
mad(b.z, c.x, -b.x * c.z),
mad(b.x, c.y, -b.y * c.x));

}

float3 normalize3(float3 v)
{
return v * half_rsqrt(dot(v, v));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float4 qmult( float4 q1, float4 q2 )
{
float4 r;
float3 t;

float3 q1yzw = make_float3(q1.y, q1.z, q1.w);
float3 q2yzw = make_float3(q2.y, q2.z, q2.w);
float3 c = cross3( q1yzw, q2yzw );

t = q2yzw * q1.x + q1yzw * q2.x + c;
r.x = q1.x * q2.x - dot( q1yzw, q2yzw );
r.yzw = t.xyz;

return r;
}

float4 qsqr( float4 q )
{
float4 r;
float3 t;

float3 qyzw = make_float3(q.y, q.z, q.w);

t = 2.0f * q.x * qyzw;
r.x = q.x * q.x - dot( qyzw, qyzw );
r.yzw = t.xyz;

return r;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float3
EstimateNormalQJulia(
float3 p,
float4 c,
int iterations )
{
float4 qp = make_float4( p.x, p.y, p.z, 0.0f );
float4 gx1 = qp - make_float4( DELTA, 0.0f, 0.0f, 0.0f );
float4 gx2 = qp + make_float4( DELTA, 0.0f, 0.0f, 0.0f );
float4 gy1 = qp - make_float4( 0.0f, DELTA, 0.0f, 0.0f );
float4 gy2 = qp + make_float4( 0.0f, DELTA, 0.0f, 0.0f );
float4 gz1 = qp - make_float4( 0.0f, 0.0f, DELTA, 0.0f );
float4 gz2 = qp + make_float4( 0.0f, 0.0f, DELTA, 0.0f );

for ( int i = 0; i < iterations; i++ )
{
gx1 = qsqr( gx1 ) + c;
gx2 = qsqr( gx2 ) + c;
gy1 = qsqr( gy1 ) + c;
gy2 = qsqr( gy2 ) + c;
gz1 = qsqr( gz1 ) + c;
gz2 = qsqr( gz2 ) + c;
}

float nx = fast_length(gx2) - fast_length(gx1);
float ny = fast_length(gy2) - fast_length(gy1);
float nz = fast_length(gz2) - fast_length(gz1);

float3 normal = normalize3(make_float3( nx, ny, nz ));

return normal;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float4
IntersectQJulia(
float3 rO,
float3 rD,
float4 c,
float epsilon,
float escape)
{
float rd = 0.0f;
float dist = epsilon;
while ( dist >= epsilon && rd < escape)
{
float4 z = make_float4( rO.x, rO.y, rO.z, 0.0f );
float4 zp = make_float4( 1.0f, 0.0f, 0.0f, 0.0f );
float zd = 0.0f;
uint count = 0;
while(zd < escape && count < ITERATIONS)
{
zp = 2.0f * qmult(z, zp);
z = qsqr(z) + c;
zd = dot(z, z);
count++;
}

float normZ = fast_length( z );
dist = 0.5f * normZ * half_log( normZ ) / fast_length( zp );
rO += rD * dist;
rd = dot(rO, rO);
}

return make_float4(rO.x, rO.y, rO.z, dist);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float3
Phong(
float3 light,
float3 eye,
float3 pt,
float3 normal,
float3 base)
{
const float SpecularExponent = 10.0f;
const float Specularity = 0.45f;

float3 light_dir = normalize3( light - pt );
float3 eye_dir = normalize3( eye - pt );
float NdotL = dot( normal, light_dir );
float3 reflect_dir = light_dir - 2.0f * NdotL * normal;

base += fabs(normal) * 0.5f;
float3 diffuse = base * fmax(NdotL, 0.0f);
float3 specular = Specularity * half_powr( fmax( dot(eye_dir, reflect_dir), 0.0f), SpecularExponent );
return diffuse + specular;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float
IntersectSphere(
float3 rO,
float3 rD,
float radius )
{
float fB = 2.0f * dot( rO, rD );
float fB2 = fB * fB;
float fC = dot( rO, rO ) - radius;
float fT = (fB2 - 4.0f * fC);
if (fT <= 0.0f)
return 0.0f;
float fD = half_sqrt( fT );
float fT0 = ( -fB + fD ) * 0.5f;
float fT1 = ( -fB - fD ) * 0.5f;
fT = fmin(fT0, fT1);
return fT;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float4
RaytraceQJulia(
float3 rO,
float3 rD,
float4 mu,
float epsilon,
float3 eye,
float3 light,
float3 diffuse,
float radius,
bool shadows,
int iterations )
{
const float4 background = make_float4( 0.15f, 0.15f, 0.15f, 0.0f );
float4 color = background;

rD = normalize3( rD );
float t = IntersectSphere( rO, rD, radius );
if ( t <= 0.0f )
return color;

rO += rD * t;
float4 hit = IntersectQJulia( rO, rD, mu, epsilon, ESCAPE_THRESHOLD );
float dist = hit.w;
if (dist >= epsilon)
return color;

rO.xyz = hit.xyz;
float3 normal = EstimateNormalQJulia( rO, mu, iterations );

float3 rgb = Phong( light, rD, rO, normal, diffuse );
color.xyz = rgb.xyz;
color.w = 1.0f;

if (SHADOWS)
{
float3 light_dir = normalize3( light - rO );
rO += normal * epsilon * 2.0f;
hit = IntersectQJulia( rO, light_dir, mu, epsilon, ESCAPE_THRESHOLD );
dist = hit.w;
color.xyz *= (dist < epsilon) ? (0.4f) : (1.0f);
}

return color;
}

///////////////////////////////////////////////////////////////////////////////////////////

float4
QJulia(
float4 coord,
float4 mu,
float4 diffuse,
float epsilon,
float iterations,
int shadows,
uint width,
uint height)
{
float zoom = BOUNDING_RADIUS_SQR;
float radius = BOUNDING_RADIUS_SQR;

float2 size = make_float2((float)width, (float)height);
float scale = max(size.x, size.y);
float2 half = make_float2(0.5f, 0.5f);
float2 position = (coord.xy - half * size) / scale;
float2 frame = (position) * zoom;

float3 light = make_float3(1.5f, 0.5f, 4.0f);
float3 eye = make_float3(0.0f, 0.0f, 4.0f);
float3 ray = make_float3(frame.x, frame.y, 0.0f);
float3 base = make_float3(diffuse.x, diffuse.y, diffuse.z);

float3 rO = eye;
float3 rD = (ray - rO);

float4 color = RaytraceQJulia( rO, rD, mu, epsilon, eye, light, base, radius, shadows, iterations);

return color;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

__kernel void
QJuliaKernel(
__global uchar4 *result,
const float4 mu,
const float4 diffuse,
const float epsilon)
{
int tx = get_global_id(0);
int ty = get_global_id(1);
int sx = get_global_size(0);
int sy = get_global_size(1);
int index = ty * WIDTH + tx;
bool valid = (tx < WIDTH) && (ty < HEIGHT);

float4 coord = make_float4((float)tx, (float)ty, 0.0f, 0.0f);

if(valid)
{
float4 color = QJulia(coord, mu, diffuse, epsilon, ITERATIONS, SHADOWS, WIDTH, HEIGHT);
uchar4 output = convert_uchar4_sat_rte(color * 255.0f);
result[index] = output;
}
}

////////////////////////////////////////////////////////////////////////////////////////////////////

yile
09-03-2009, 01:50 AM
hey, guys, I found some examples!

1.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Hello_World_Example/index.html)

2.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Matrix_Transpose_Example/index.html)

3.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_NBody_Simulation_Example/index.html)

4.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Parallel_Prefix_Sum_Example/index.html)

5.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Parallel_Reduction_Example/index.html)

6.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Procedural_Geometric_Displacement_Example/index.html)

7.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Procedural_Grass_and_Terrain_Example/index.html)

8.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_Procedural_Noise_Example/index.html)

9.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/OpenCL_RayTraced_Quaternion_Julia-Set_Example/index.html)

10.
http://developer.apple.com/mac/library/ ... index.html (http://developer.apple.com/mac/library/samplecode/Trajectories/index.html)

mitchde
09-03-2009, 03:11 AM
Yes, that are excat that i could get compiled & run QJULIA & GALAXY & GRASS - at least on NVIDIA gpus.
My screenshoot above showing QJULIA (Raytraycing qJuila).

ATI Mac users (4850 / 4870) get freezes OS X or Errormessages for ProgrammBuild(OpenCL ATI part not even can compile the source for ATI, NV rocks!- not running on ATI 48XX !!!
I thought OpenCL is NOT vendor specific ?!
Why same Source (Host & Client) runs perfect on NVIDIA low end 9400M and not on highend ATI 4870 ?!
It cant be that highend ATI 4870 has less hw features(for OpenCL) than the mobile! lowend Nvidia 9400M - or ?

here Screenshoots for GALAXY & GRASS

http://www.insanelymac.com/forum/uploads/monthly_09_2009/post-110586-1251826803_thumb.jpg

http://www.insanelymac.com/forum/uploads/monthly_08_2009/post-110586-1251659249_thumb.jpg

mitchde
09-04-2009, 07:38 AM
Here some collected Mac OS X OpenCL Benchresults of the apps i told you:
http://www.barefeats.com/opencl.html

yile
09-05-2009, 11:17 PM
Here some collected Mac OS X OpenCL Benchresults of the apps i told you:
http://www.barefeats.com/opencl.html

good page, thank you :)

mitchde
09-06-2009, 12:45 AM
I got informations from ATI OpenCL OS X dev division (AMD Inc) that they aware of some major problems with ATI 48xx running OpenCL in 10.6.
They will fix that , so it may be possible to get that fixes within the 10.6.1 update.

Near none of the Apple OpenCL Demos can run.
And even if they do (not crash, not freeze) they get poor performance compared to the Nvidia GPUs, which really
run good in wide range from 9400M up to GTX 285 (8xxx also).

So at least at this time (Mac OS X 10.6.0), OpenCL and ATI 48xx didnt work.

I also share the compiled Displacement build (orig. Apple Demo Code)

httl://rapidshare.com/files/276023300/OpenCL_Displacement_Bench.zip

http://www.insanelymac.com/forum/index.php?act=attach&type=post&id=55579

Roho
08-31-2011, 03:15 AM
Niceeeeee!
Does it even support the connection with OpenGL?
yeap, i believe it does, though i hasn't tried yet..