Page 1 of 3 123 LastLast
Results 1 to 10 of 25

Thread: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

  1. #1
    Junior Member yile's Avatar
    Join Date
    Jun 2009
    Location
    Beijing, China
    Posts
    11

    OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Code :
    /*
     
     File: compute_sumints.c
     
     Abstract: source for compute_sum kernel and initialization and runtime 
               code for summing integers in an OpenCL kernel
     
     Version: 1.0
     
     Disclaimer: IMPORTANT:  This Apple software is supplied to you by 
     Apple Inc. ("Apple") in consideration of your agreement to the
     following terms, and your use, installation, modification or
     redistribution of this Apple software constitutes acceptance of these
     terms.  If you do not agree with these terms, please do not use,
     install, modify or redistribute this Apple software.
     
     In consideration of your agreement to abide by the following terms, and
     subject to these terms, Apple grants you a personal, non-exclusive
     license, under Apple's copyrights in this original Apple software (the
     "Apple Software"), to use, reproduce, modify and redistribute the Apple
     Software, with or without modifications, in source and/or binary forms;
     provided that if you redistribute the Apple Software in its entirety and
     without modifications, you must retain this notice and the following
     text and disclaimers in all such redistributions of the Apple Software. 
     Neither the name, trademarks, service marks or logos of Apple Inc. 
     may be used to endorse or promote products derived from the Apple
     Software without specific prior written permission from Apple.  Except
     as expressly stated in this notice, no other rights or licenses, express
     or implied, are granted by Apple herein, including but not limited to
     any patent rights that may be infringed by your derivative works or by
     other works in which the Apple Software may be incorporated.
     
     The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
     MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
     THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
     FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
     OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
     
     IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
     OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
     MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
     AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
     STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
     POSSIBILITY OF SUCH DAMAGE.
     
     Copyright (C) 2008 Apple Inc. All Rights Reserved.
     
     */
     
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <stdbool.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <OpenCL/opencl.h>
    #include <mach/mach_time.h>
     
    /*
     * Handles shared by the routines in this file: init_compute() assigns both,
     * compute_sumints() uses them, and release_compute() releases them.
     */
    static cl_device		device;
    static cl_context		context;
     
    /*
     * OpenCL C source for the compute_sum kernel, built at run time by
     * compute_sumints().
     *
     *   a       - input integers
     *   n       - number of elements of a to add up
     *   tmp_sum - scratch array, one slot per work-item (indexed by local id)
     *   sum     - receives the final total
     *
     * Each work-item first accumulates a strided partial sum (elements tid,
     * tid+lsize, ...) into tmp_sum[tid].  The partial sums are then folded
     * pairwise, halving the active range on every pass, and work-item 0 stores
     * tmp_sum[0] to *sum.  Because the fold starts at lsize/2 and halves with
     * integer division, it only visits every slot when lsize is a power of two.
     *
     * NOTE(review): get_local_thread_id/get_local_thread_size and the
     * CL_GLOBAL_MEM_FENCE barrier flag look like pre-release spellings; the
     * released OpenCL 1.0 names are get_local_id/get_local_size and
     * CLK_LOCAL_MEM_FENCE (tmp_sum is __local) - confirm against the target SDK.
     */
    const char *sum_kernel_code = 
    "__kernel void compute_sum(__global int *a, int n, __local long *tmp_sum, __global long *sum)\n"
    "{\n"
    "    int  tid = get_local_thread_id(0);\n"
    "    int  lsize = get_local_thread_size(0);\n"
    "    int  i;\n"
    "\n"
    "    tmp_sum[tid] = 0;\n"
    "    for (i=tid; i<n; i+=lsize)\n"
    "        tmp_sum[tid] += a[i];\n"
    "\n"
    "    for (i=lsize/2; i>0; i/=2)\n"
    "    {\n"
    "        barrier(CL_GLOBAL_MEM_FENCE);\n"
    "        if (tid < i)\n"
    "            tmp_sum[tid] += tmp_sum[tid + i];\n"
    "    }\n"
    "\n"
    "    if (tid == 0)\n"
    "        *sum = tmp_sum[0];\n"
    "}\n";
     
     
    /*
     * verify_sum: recompute the sum of inptr[0..n-1] on the host in 64-bit
     * arithmetic and compare it with the value stored at outptr[0].
     *
     * Prints a pass/fail line to stdout.
     * Returns 0 when the two sums agree, -1 otherwise.
     */
    static int
    verify_sum(int *inptr, long long *outptr, int n)
    {
        long long   expected = 0;
        const int   *cursor = inptr;
        int         remaining;

        /* Walk the input once; a non-positive n leaves expected at zero. */
        for (remaining = n; remaining > 0; remaining--)
            expected += *cursor++;

        if (expected == *outptr)
        {
            printf("sum of ints test passed\n");
            return 0;
        }

        printf("sum of ints test failed\n");
        return -1;
    }
     
    /*
     * compute_sumints: fill a host array with num_elements rand() values, sum
     * them with the compute_sum kernel, and check the device result against a
     * host-side sum via verify_sum().
     *
     *   num_elements - number of ints to generate and sum (must not be negative)
     *   compute_sum  - if non-NULL, receives the sum read back from the device
     *   compute_time - if non-NULL, receives the elapsed seconds between creating
     *                  the input array and the read-back call returning
     *
     * Returns the result of verify_sum() (0 on match, -1 on mismatch), or -1 if
     * any setup step fails.  Every exit goes through a single cleanup path, so
     * the host buffer and any OpenCL objects created so far are released on
     * failure as well as on success.
     */
    int
    compute_sumints(int num_elements, long long *compute_sum, float *compute_time)
    {
        cl_mem                      streams[2] = { NULL, NULL };
        long long                   sum = 0;
        int                         *input_ptr = NULL;
        cl_program                  program = NULL;
        cl_kernel                   kernel = NULL;
        void                        *values[4];
        size_t                      sizes[4];
        size_t                      lengths[1];
        unsigned int                global_threads[1];
        unsigned int                local_threads[1];
        int                         err;
        int                         result = -1;
        unsigned int                max_threadgroup_size = 0;
        int                         i;
        cl_device_id                device_id;
        uint64_t                    t0, t1;
        struct mach_timebase_info   info;

        mach_timebase_info(&info);

        printf( "computing sum for %d randomly generated ints\n", num_elements );
        if (num_elements < 0)
        {
            /* A negative count would wrap to a huge allocation size below. */
            printf("invalid element count %d\n", num_elements);
            goto cleanup;
        }

        input_ptr = malloc(sizeof *input_ptr * (size_t)num_elements);
        if (!input_ptr && num_elements > 0)
        {
            printf("malloc failed\n");
            goto cleanup;
        }
        for (i = 0; i < num_elements; i++)
            input_ptr[i] = (int)rand();

        err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
        if (err != CL_SUCCESS)
        {
            printf( "clGetDeviceGroupInfo failed\n" );
            goto cleanup;
        }

        /*
         * max_threadgroup_size starts at 0, so a query that writes nothing is
         * detected here instead of feeding an indeterminate value to the launch.
         */
        clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);
        if (max_threadgroup_size == 0)
        {
            printf("clGetDeviceConfigInfo failed\n");
            goto cleanup;
        }

        lengths[0] = strlen(sum_kernel_code);
        program = clCreateProgramWithSource(device, 1, &sum_kernel_code, lengths);
        if (!program)
        {
            printf("clCreateProgramWithSource failed\n");
            goto cleanup;
        }

        err = clBuildProgramExecutable(program, false, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("clBuildProgramExecutable failed\n");
            goto cleanup;
        }

        kernel = clCreateKernel(program, "compute_sum");
        if (!kernel)
        {
            printf("clCreateKernel failed\n");
            goto cleanup;
        }

        /* Output array: a single 64-bit slot for the total. */
        streams[1] = clCreateArray(device, CL_MEM_ALLOC_GLOBAL_POOL, sizeof(long long), 1, NULL);
        if (!streams[1])
        {
            printf("clCreateArray failed\n");
            goto cleanup;
        }

        /* begin timing: the measured span includes copying the input in */
        t0 = mach_absolute_time();
        streams[0] = clCreateArray(device, CL_MEM_ALLOC_GLOBAL_POOL | CL_MEM_COPY_HOST_PTR, sizeof(int), num_elements, input_ptr);
        if (!streams[0])
        {
            printf("clCreateArray failed\n");
            goto cleanup;
        }

        /* Kernel arguments, in order: a, n, tmp_sum (size only, no host data), sum. */
        sizes[0] = sizeof(cl_mem);  values[0] = streams[0];
        sizes[1] = sizeof(int);     values[1] = (void *)&num_elements;
        sizes[2] = max_threadgroup_size*sizeof(long long); values[2] = NULL;
        sizes[3] = sizeof(cl_mem);  values[3] = streams[1];
        err = clSetKernelArgs(context, kernel, 4, NULL, values, sizes);
        if (err != CL_SUCCESS)
        {
            printf("clSetKernelArgs failed\n");
            goto cleanup;
        }

        /* One thread group of the maximum size; the kernel strides over the input. */
        global_threads[0] = max_threadgroup_size;
        local_threads[0] = max_threadgroup_size;
        err = clExecuteKernel(context, kernel, NULL, global_threads, local_threads, 1, NULL, 0, NULL);
        if (err != CL_SUCCESS)
        {
            printf("clExecuteKernel failed\n");
            goto cleanup;
        }

        /*
         * NOTE(review): the third argument is passed as false; if it is a
         * "blocking" flag, sum may be read below before the transfer has
         * finished - confirm against the API revision this was written for.
         */
        err = clReadArray(context, streams[1], false, 0, sizeof(long long), (void *)&sum, NULL);
        if (err != CL_SUCCESS)
        {
            printf("clReadArray failed\n");
            goto cleanup;
        }
        /* end timing */
        t1 = mach_absolute_time();

        /* Scale the tick delta by the timebase fraction and by 1e-9 to get seconds. */
        if (compute_time)
            *compute_time = 1e-9 * (t1 - t0) * info.numer / info.denom;

        result = verify_sum(input_ptr, &sum, num_elements);

        if (compute_sum)
            *compute_sum = sum;

    cleanup:
        /* Handles that were never created are still NULL and are skipped. */
        if (streams[0])
            clReleaseMemObject(streams[0]);
        if (streams[1])
            clReleaseMemObject(streams[1]);
        if (kernel)
            clReleaseKernel(kernel);
        if (program)
            clReleaseProgram(program);
        free(input_ptr);

        return result;
    }
     
    /*
     * init_compute: look up a GPU compute device, wrap the first one found in a
     * device group, and create a context for it.  The results are stored in the
     * file-level `device` and `context` handles.
     *
     * Returns 0 on success and -1 on failure.  If the context cannot be created,
     * the device group made just before it is released again and `device` is
     * cleared, so a failed call leaves nothing behind.
     */
    int
    init_compute()
    {
        cl_device_id compute_device_id[2];
        unsigned int num_devices = 0;
        int return_value = 0;

        return_value = clGetComputeDevices(CL_DEVICE_TYPE_GPU, 2, compute_device_id, &num_devices);
        if (return_value || 0 == num_devices)
        {
            printf( "clGetComputeDevices failed (with %d devices available)\n", num_devices );
            return -1;
        }

        /* Only the first reported device is used. */
        device = clCreateDeviceGroup(1, &compute_device_id[0]);
        if (!device)
        {
            printf("clCreateDeviceGroup failed\n");
            return -1;
        }

        context = clCreateContext(0, device);
        if (!context)
        {
            printf("clCreateContext failed\n");
            /* Undo the device-group creation instead of leaking it. */
            clReleaseDeviceGroup(device);
            device = NULL;
            return -1;
        }

        return 0;
    }
     
    /*
     * release_compute: release the file-level handles set up by init_compute(),
     * the context first and then the device group it was created from.
     */
    void
    release_compute()
    {
        clReleaseContext(context);
        clReleaseDeviceGroup(device);
    }
     
    #if 0
    /*
     * Stand-alone driver, compiled out by the surrounding #if 0: initialise the
     * compute state, sum 1024*1024 random ints, release the state, and return
     * the result of the sum check.
     */
    int
    main(int argc, char *argv[])
    {
        int status;

        if (init_compute() != 0)
            return -1;

        status = compute_sumints(1024*1024, NULL, NULL);
        release_compute();
        return status;
    }
    #endif

    Because this forum does not support attachments, if you want the whole sample code, please PM me with your email address.
    cclv

  2. #2
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Thanks - nice to see some Mac OS X 10.6 devs here !
    I PM'd you for some small demo sources.

  3. #3
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Does anyone else have an OpenCL example (with an Xcode project file) for me?
    I didn't get an answer from the poster above (perhaps he is on holiday).
    Thanks

  4. #4

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Code :
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <stdbool.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <OpenCL/opencl.h>
    #include <mach/mach_time.h>
     
    /*
     * OpenCL C source for the vectorAdd kernel: each work-item takes its global
     * id as an index and stores a[index] + b[index] into c[index].  The pieces
     * below are adjacent string literals, which the compiler concatenates.
     */
    const char * sProgramSource =
        "__kernel void vectorAdd(              \n"
        "__global const float * a,             \n"
        "__global const float * b,             \n"
        "__global	float * c)                 \n"
        "{                                     \n"
        "	// Vector element index            \n"
        "	int nIndex = get_global_id(0);     \n"
        "	c[nIndex] = a[nIndex] + b[nIndex]; \n"
        "}                                     \n";
     
    int main (int argc, const char * argv[])
    {
    	const unsigned int cnBlockSize= 512;
    	const unsigned int cnBlocks =3;
    	size_t cnDimension = cnBlocks * cnBlockSize;
    	int err;
    	cl_device_id device_id;
    	size_t local;
    	size_t len;
    	char buffer[2048];
     
    	int gpu = 1;
    	err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    	if (err != CL_SUCCESS)
    	{
    		printf("Error: Failed to get device ID\n");
    		exit(1);
    	}
     
    	err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, &len);
    	printf("CL_DEVICE_NAME: %s\n", buffer);
    	err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, &len);
    	printf("CL_DEVICE_VENDOR: %s\n", buffer);
     
    	// create OpenCL device & context
    	cl_context hContext;
    	hContext = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    	if (err != CL_SUCCESS)
    	{
    		printf("Error: Failed to create context\n");
    		exit(1);
    	}
     
    	// create a command queue for our device
    	cl_command_queue hCmdQueue;
    	hCmdQueue = clCreateCommandQueue(hContext, device_id, 0, 0);
     
    	// create & compile program
    	cl_program hProgram;
    	hProgram = clCreateProgramWithSource(hContext, 1, (const char **) &sProgramSource, NULL, &err);
    	if (!hProgram || err != CL_SUCCESS)
    	{
    		printf("Error: Failed to Create program with source\n");
    		exit(1);
    	}
     
    	err = clBuildProgram(hProgram, 0, NULL, NULL, NULL, NULL);
    	if (err != CL_SUCCESS)
    	{
    		printf("Error: Failed to build program executable\n");
    		clGetProgramBuildInfo(hProgram, device_id, CL_PROGRAM_BUILD_LOG,
    							  sizeof(buffer), buffer, &len);
    		printf("%s\n", buffer);
    		exit(1);
    	}
     
    	// create kernel
    	cl_kernel hKernel;
    	hKernel = clCreateKernel(hProgram, "vectorAdd", &err);
    	if (!hKernel || err != CL_SUCCESS)
    	{
    		printf("Error: Failed to create kernel\n");
    		exit(1);
    	}
     
    	// allocate host vectors
    	float * pA = new float[cnDimension];
    	float * pB = new float[cnDimension];
    	float * pC = new float[cnDimension];
    	float * pC1 = new float[cnDimension];
     
    	memset(pC, 0, cnDimension * sizeof(float));
    	memset(pC1, 0, cnDimension * sizeof(float));
     
    	// initialize host memory
    	int i;
    	for(i=0; i < cnDimension; i++)
    	{
    		pA[i] = pC[i] = pC1[i] = 0;
    		pB[i] = i;
    //		pA[i] = rand() % 10 + 1;
    //		pB[i] = rand() % 10 + 1;
    	}
     
    	// allocate device memory
    	cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
    	hDeviceMemA = clCreateBuffer(hContext,
    								 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
    	hDeviceMemB = clCreateBuffer(hContext,
    								 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
    	hDeviceMemC = clCreateBuffer(hContext,
    								 CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);
     
    	// setup parameter values
    	err = 0;
    	err  = clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
    	err |= clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
    	err |= clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
    	if (err != CL_SUCCESS)
    	{
    		printf("Error: Failed to set kernel args\n");
    		exit(1);
    	}
     
    	// Get the maximum work-group size for executing the kernel on the device
    	err = clGetKernelWorkGroupInfo(hKernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    	if (err != CL_SUCCESS)
    	{
    		printf("Error: clGetKernelWorkGroupInfo Failed\n");
    		exit(1);
    	}
     
    	// execute kernel
    	err = clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);
    	if (err != CL_SUCCESS)
    	{
    		printf("Error: clEnqueueNDRangeKernel Failed\n");
    		exit(1);
    	}
    	// copy results from device back to host
    	clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),
    						pC, 0, NULL, NULL);
     
    	// wait for command queue
    	clFinish(hCmdQueue);
     
    	bool valid = true;
    	for(i=0; i < cnDimension; i++)
    	{
    		pC1[i] = pA[i] + pB[i];
    		if (pC[i] != pC1[i])
    		{
    			printf("Error: %0.2f != %0.2f\n", pC[i], pC1[i]);
    			valid = false;
    		}
    	}
     
    	printf("Number of elements : %d\n", cnDimension);
    	printf("First Element: %0.2f\n", pC[0]);
    	printf("Last Element : %0.2f\n\n", pC[cnDimension-1]);
    	if (valid) {
    		printf("Test passed\n");
    	}
    	else {
    		printf("Test failed\n");
    	}
     
     
    	delete[] pA;
    	delete[] pB; 
    	delete[] pC;
    	delete[] pC1;
     
    	clReleaseMemObject(hDeviceMemA); 
    	clReleaseMemObject(hDeviceMemB); 
    	clReleaseMemObject(hDeviceMemC);
    	clReleaseProgram(hProgram);
    	clReleaseKernel(hKernel);
    	clReleaseCommandQueue(hCmdQueue);
    	clReleaseContext(hContext);
        return 0;
    }

  5. #5
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Thanks !



    How about something like this (raytracing on the GPU, with sample source code, but in CUDA)?

    http://cg.alexandra.dk/2009/08/10/trier ... -tutorial/

  6. #6
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Sorry,
    I tried to compile both OpenCL .c examples with Xcode (10.6, as command-line projects).
    Neither of them can be compiled.
    There are different errors - some definition errors (CL... not declared), and some compile errors such as:
    float *pa = new float (xyz)

    Can someone upload these examples as Xcode project files (zipped, really small!) that will then work (with the right settings for libraries/compiler, plus code fixes)?

    Thanks

  7. #7
    Junior Member yile's Avatar
    Join Date
    Jun 2009
    Location
    Beijing, China
    Posts
    11

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Hey guys, I'm sorry for replying so late...

    I just sent the code to you; please check your inbox.
    cclv

  8. #8
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    THANKS.
    I will look on it.

    I got that example to run with this:
    Code :
    /*
     * vectorAdd kernel variant from this post.  The for loop repeats the same
     * element-wise addition 4999 times; per the poster, it was added only to
     * make the kernel run longer on the GPU.
     *
     * The forum's [b]...[/b] emphasis tags and the free-text annotation that
     * followed a line continuation have been removed: they were pasted into the
     * declaration and into the kernel string, so the snippet compiled neither
     * as C nor as OpenCL C.
     */
    const char * sProgramSource =
    "__kernel void vectorAdd(              \n"
    "__global const float * a,             \n"
    "__global const float * b,             \n"
    "__global   float * c)                 \n"
    "{                                     \n"
    "   // Vector element index            \n"
    "   int loop;            \n"
    "   int nIndex = get_global_id(0);     \n"
    "   for (loop=1; loop< 5000; loop++)   \n"
    "{                                     \n"
    "   c[nIndex] = a[nIndex] + b[nIndex]; \n"
    "}                                     \n"
    "}                                     \n";





    But it will run only on NVIDIAs - from 9600M GT up to GTX 285 - no problems.
    9600M GT = 15 sec, GTX285 = 0,8 sec
    CPU from 3,8 Sec i7 920@4 GHZ down to 100 sec C2Mobile 2 GHZ.

    ATI users (OS X 10.6) reported that OS X freezes completely when they run the benchmark.

    I posted the code (V020) and xcodeproject here:
    http://freenet-homepage.de/amichalak/OpenCL2_SRC.zip

    It would be great if I could get some help fixing that freeze problem with ATI.

    So OpenCL is not "fire & forget"; must I do some extra coding for GPU differences?
    Any help would be appreciated!

    Also, I don't know what to do with that CL.hpp - if I include it, I get more than 400 compiler errors.

  9. #9
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    I changed some of the source code (a smaller loop for the vector adds).
    A lot of error handling was added.
    It now works on the ATI 4870 (OS X 10.6), but runs much slower.
    ATI 4870 : 4 sec, Geforce 285 : 0,17 sec , Geforce 9600GT : 0,93 sec, Geforce 9600M : 5 sec, Geforce 9400M: 15 sec
    Now V025.
    Sourcecode same link as post before.

    I also tried the OpenCL Example1 from the kind user from China.
    But I get lots of errors when compiling, even when I used your complete "pack" as an Xcode project.
    Normally it should work.
    Some cl* OpenCL calls are definitely NOT found in the OpenCL standard (I checked that).
    Also, some CL_MAX... constants are not defined and cannot be found in the OpenCL documentation either.
    And some cl* OpenCL calls are given too few arguments.
    Question: did you compile it yourself, or do you only have the source and shared that?
    If you got it compiled for OS X 10.6, please PM that small executable (zipped) to me.
    Example2 gave far fewer errors such as undefined constants and undefined OpenCL calls.

    Question:
    I am a bit confused, because the benchmark works so well on all NVIDIA cards and hardly at all on any ATI cards.

  10. #10
    Junior Member
    Join Date
    Aug 2009
    Posts
    21

    Re: OpenCL Addition Example (Mac OS X 10.6 Snow *Leopard)

    Examples of errors (OpenCL example1)
    compute_sumints.c
    Code :
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <stdbool.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <OpenCL/opencl.h>
    #include <mach/mach_time.h>
    ...
    static cl_device		device;  
    > No cl_device type is defined; I must use cl_device_id instead.
     
    err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
    > The clGetDeviceGroupInfo call doesn't exist, only clGetDeviceInfo; CL_DEVICE_IDS doesn't exist in OpenCL either.
     
    clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);
    > clGetDeviceConfigInfo doesn't exist in OpenCL, and CL_DEVICE_MAX_THREAD_GROUP_SIZE doesn't exist either, only CL_DEVICE_MAX_WORK_GROUP_SIZE.
    And so on. It was easy to fix the cl calls that were given too few arguments, but how do I fix OpenCL calls that are completely unknown?

    Thanks for any help.

Page 1 of 3 123 LastLast

Similar Threads

  1. Image2D OpenCL & Snow Leopard
    By Letinono in forum Interoperability issues
    Replies: 6
    Last Post: 04-05-2012, 11:11 AM
  2. Replies: 3
    Last Post: 11-30-2009, 03:12 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •