Results 1 to 6 of 6

Thread: vectorization

  1. #1
    Junior Member
    Join Date
    Jan 2013
    Posts
    7

    vectorization

    Hello,
    I want to do vectorization using OpenCL, like this:
    float4 C = (float4)(a[0],a[1],a[2],a[3]) + (float4)(b[0],b[1],b[2],b[3]);
    Please help me work out how to do this in OpenCL.

  2. #2
    Senior Member
    Join Date
    Oct 2012
    Posts
    165

    Re: vectorization

    float4 c = (float4)(a[0]+b[0], a[1]+b[1], a[2]+b[2], a[3]+b[3]);

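    Or you can first load a and b into float4 variables (named differently from the arrays they are read from):

    float4 va = (float4)(a[0], a[1], a[2], a[3]);
    float4 vb = (float4)(b[0], b[1], b[2], b[3]);
    float4 c = va + vb;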

  3. #3
    Junior Member
    Join Date
    Jan 2013
    Posts
    7

    Re: vectorization

    Hey, thanks for the reply.
    Please check this code and correct me where it is wrong.
    --------------------------------------------------------------------------
    #include "stdafx.h"
    #include <iostream>
    #include "CL\cl.h"
    #include <stdio.h>
    using namespace std;

    // OpenCL C source for the vec_add kernel, kept as a string literal; the host
    // code below passes it to clCreateProgramWithSource at run time.
    const char *source =
    "__kernel void vec_add (__global int *a, \n"
    " __global const int *b, \n"
    " __global int *c) \n"
    "{ \n"

    // NOTE(review): the three locals below reuse the parameter names a, b and c
    // in the kernel's outermost block. OpenCL C is based on C99, where that is
    // normally rejected as a redefinition -- confirm against the build log.
    // The vectors are built from constants, so the __global buffers are never
    // read, and the sum is never stored back to the output buffer.
    "int16 a=(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); \n"
    "int16 b=(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); \n"
    "int16 c = a+b; \n"

    "} \n";
    int _tmain(int argc, _TCHAR* argv[])
    {
    int N = 16;
    // Get the first available platform
    // Example: AMD Accelerated Parallel Processing
    cl_platform_id platform;
    clGetPlatformIDs(1,&platform,NULL); // number of platforms available

    // Get the first GPU device the platform provides
    cl_device_id device;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device,NULL); // number of devices available

    // Create a context and command queue on that device
    cl_context context = clCreateContext(0, // optional (context properties)
    1, // number of devices
    &device, // pointer to device list
    NULL, NULL, // optional (callback function for reporting errors)
    NULL); // no error code returned

    cl_command_queue queue = clCreateCommandQueue(context, // valid context
    device, // device associated with context
    CL_QUEUE_PROFILING_ENABLE, // optional (command queue properties)
    NULL); // no error code returned

    // Create program object and load source code into program object
    cl_program program = clCreateProgramWithSource(context,
    1, // number of strings
    &source, // strings
    NULL, // string length or NULL terminated
    NULL); // no error code returned


    // Build program executable from program source
    clBuildProgram(program,
    1, // number of devices
    &device, // pointer to device list
    NULL, // optional (build options)
    NULL, NULL); // optional (callback function, argument)

    // Build program executable from program source

    // Create kernel object
    cl_kernel kernel = clCreateKernel(program, // program object
    "vec_add", // kernel name in program
    NULL); // no error code returned

    // Initialize arrays
    cl_float *a = (cl_float *) malloc(N*sizeof(cl_float));
    cl_float *b = (cl_float *) malloc(N*sizeof(cl_float));
    // int i;
    //for(i=0;i<N;i++){
    // a[i] = i;
    // b[i] = i;
    //}
    // A buffer object is a handle to a region of memory
    cl_mem a_buffer = clCreateBuffer(context,
    CL_MEM_READ_ONLY | // buffer object read only for kernel
    CL_MEM_COPY_HOST_PTR, // copy data from memory referenced
    // by host pointer
    N*sizeof(cl_float), // size in bytes of buffer object
    a, // host pointer
    NULL); // no error code returned
    cl_mem b_buffer = clCreateBuffer(context,
    CL_MEM_READ_ONLY |CL_MEM_COPY_HOST_PTR,
    N*sizeof(cl_float), b, NULL);

    cl_mem c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
    N*sizeof(cl_float), NULL, NULL);
    //size_t global_work_size = 4;

    // Set the kernel arguments
    clSetKernelArg(kernel, 0, sizeof(a_buffer), (void*) &a_buffer);
    clSetKernelArg(kernel, 1, sizeof(b_buffer), (void*) &b_buffer);
    clSetKernelArg(kernel, 2, sizeof(c_buffer), (void*) &c_buffer);
    cl_event timeEvent;
    //const size_t m=4;
    size_t szGlobalWorkSize[1];
    //size_t szLocalWorkSize[2];
    szGlobalWorkSize[0]=1;
    //szLocalWorkSize[0]=2;
    //szLocalWorkSize[1]=2;
    // Enqueue a command to execute the kernel on the GPU device
    cl_int error = clEnqueueNDRangeKernel(queue, kernel,
    1, NULL, // global work items dimensions and offset
    szGlobalWorkSize, // number of global work items
    NULL, // number of work items in a work group
    0, NULL, // don't wait on any events to complete
    &timeEvent); // no event object returned

    // Block until all commands in command-queue have completed
    clFinish(queue);
    // Read back the results
    cl_float *c = (cl_float *) malloc(N*sizeof(cl_float));
    clEnqueueReadBuffer(
    queue, // command queue in which read command will be queued
    c_buffer, // buffer object to read back
    CL_TRUE, // blocking read - doesn't return until buffer copied
    0, // offset in bytes in buffer object to read from
    N * sizeof(cl_float), // size in bytes of data being read
    c, // pointer to host memory where data is to be read into
    0, NULL, // don't wait on any even
    NULL); // no event object returned
    for(int i=0;i<N;i++)
    cout<<"\n"<<c[i];


    cl_ulong startBuf;
    // size_t a = 1000;
    cl_int x= clGetEventProfilingInfo ( timeEvent,
    CL_PROFILING_COMMAND_START ,
    sizeof(cl_ulong),
    &startBuf,
    NULL);

    cl_ulong endBuf;
    // size_t a = 1000;
    cl_int y= clGetEventProfilingInfo ( timeEvent,
    CL_PROFILING_COMMAND_END ,
    sizeof(cl_ulong),
    &endBuf,
    NULL);
    long diff = endBuf - startBuf;
    double values_in_second = (double)diff/(double)1000000000;

    cout<<"Total GPU time:"<<values_in_second<<"\n";
    free(a);
    free(b);
    free(c);
    clReleaseMemObject(a_buffer);
    clReleaseMemObject(b_buffer);
    clReleaseMemObject(c_buffer);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseContext(context);
    clReleaseCommandQueue(queue);
    getchar();
    return 0;
    }

  4. #4
    Senior Member
    Join Date
    Oct 2012
    Posts
    165

    Re: vectorization

    I think a, b and c are local variables that hide the __global pointer parameters of the same name; I don't know whether that is even allowed in OpenCL kernels.
    If you want to access the global buffers, it has to be something like:
    Code :
    // Kernel source that packs 16 ints from each input buffer into an int16
    // vector, adds the two vectors with a single vector operation, and writes
    // every lane of the sum back to the output buffer c.
    const char *source =
    "__kernel void vec_add (__global int *a, \n"
    " __global const int *b, \n"
    " __global int *c) \n"
    "{ \n"
    // The locals are named la/lb/lc so they do not clash with the parameters.
    "int16 la = (int16)(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], \n"
    "                   a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]); \n"
    "int16 lb = (int16)(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], \n"
    "                   b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]); \n"
    "int16 lc = la + lb; \n"
    // Components of a 16-wide vector are addressed as .s0 through .sF.
    "c[0] = lc.s0; c[1] = lc.s1; c[2] = lc.s2; c[3] = lc.s3; \n"
    "c[4] = lc.s4; c[5] = lc.s5; c[6] = lc.s6; c[7] = lc.s7; \n"
    "c[8] = lc.s8; c[9] = lc.s9; c[10] = lc.sA; c[11] = lc.sB; \n"
    "c[12] = lc.sC; c[13] = lc.sD; c[14] = lc.sE; c[15] = lc.sF; \n"
    "} \n";

    But this would be the same as using:
    Code :
    c[i] = a[i] + b[i]; // with i being your work-item ID
    in your kernel code.

  5. #5
    Junior Member
    Join Date
    Jan 2013
    Posts
    7

    Re: vectorization

    Thanks for replying.
    Actually, I want to know whether there is any way to do the vectorization I am attempting in this program.
    If you have any code for this, could you please share it? The code you gave me does not provide a proper vectorized solution, that is, vector addition.

  6. #6

    Re: vectorization

    You might find it worthwhile to look up the vload* and vstore* functions. In your case, replace * with 16.

Similar Threads

  1. AMD vs Intel: Auto-vectorization
    By Nick Wiggill in forum OpenCL
    Replies: 5
    Last Post: 05-02-2012, 09:45 AM

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •