Results 1 to 3 of 3

Thread: low float precision - only on parts of the output data

  1. #1
    Junior Member
    Join Date
    Feb 2010
    Posts
    7

    low float precision - only on parts of the output data

    hello,

    I have a problem with low-precision of calculations - I try to do a one-dimensional FFT transform on matrix(using DFT matrix). Unfortunately, the FFT and IFFT results compared with the input data vary considerably (above 0.1), and the strange is that the first line has the correct results. it does not matter whether the input data(matrix) has a size of 8x8 or more (256x256).
    in each case, the results differ significantly, much more than could be due to the float precision.
    I tried to build a kernel with option '-cl-opt-disable', unfortunately it does not improve accuracy. any ideas what I am doing wrong?

    below the sample input and output data (the first and third row of the output is correct, but differs in the first column) and the host and kernel code.

    input:
    0.840188 0.394383 0.783099 0.798440
    0.911647 0.197551 0.335223 0.768230
    0.277775 0.553970 0.477397 0.628871
    0.364784 0.513401 0.952230 0.916195

    output:
    0.840188 0.394383 0.783099 0.798440
    0.638216 0.355476 0.643726 0.842212
    0.277775 0.553970 0.477397 0.628871
    0.638216 0.355476 0.643726 0.842212

    fft1d.cl

    __kernel void mul(
    __global float* InA,
    __global float* OutRe,
    __global float* OutIm,
    int N,

    __global float* Wre,
    __global float* Wim,
    __global float* Mre,
    __global float* Mim,

    __global float* tempre,
    __global float* tempim

    )
    {

    int gid0 = get_global_id(0);
    int gid1 = get_global_id(1);
    int i,j,k;

    int size = N;

    float wynikre = 0;
    float wynikim = 0;

    if(gid0<N && gid1<N) {

    // fft1d
    for(j=0;j<N;j++) {

    wynikre += Wre[gid1*N+j]*InA[j*N+gid0]; //RE
    wynikim += Wim[gid1*N+j]*InA[j*N+gid0]; //IM
    }

    tempre[gid1*N+gid0] = wynikre;
    tempim[gid1*N+gid0] = wynikim;

    barrier(CLK_GLOBAL_MEM_FENCE);

    wynikre = 0;
    wynikim = 0;

    // ifft1d
    for(j=0;j<N;j++) {

    wynikre += Mre[gid1*N+j]*tempre[j*N+gid0]-Mim[gid1*N+j]*tempim[j*N+gid0] ;
    wynikim += Mre[gid1*N+j]*tempim[j*N+gid0]+Mim[gid1*N+j]*tempre[j*N+gid0] ;
    }

    OutRe[gid1*N+gid0] = wynikre;
    OutIm[gid1*N+gid0] = wynikim;

    barrier(CLK_GLOBAL_MEM_FENCE);

    }

    }
    fft.cpp

    #include <stdio.h>
    #include <stdlib.h>
    #include <malloc.h>
    #include <unistd.h>
    #include <time.h>
    #include <cctype>
    #include <vector>
    #include <math.h>
    #include <complex.h>

    #include "CL/cl.h"

    using namespace std;
    const char* cSourceFile = "fft1d.cl"; // dct kernel file

    cl_context cxGPUContext; // OpenCL context
    cl_command_queue cqCommandQue; // OpenCL command que
    cl_device_id* cdDevices; // OpenCL device list

    cl_program cpProgram; // OpenCL program
    cl_kernel ckKernel; // OpenCL kernel

    size_t GlobalWorkSize[2];
    size_t LocalWorkSize[2];

    size_t ParmDataBytes; // Byte size of context information
    cl_int ciErr1, ciErr2; // Error code var
    char* cPathAndName = NULL; // Var for paths
    char* cSourceCL = NULL; // Buffer to hold source for compilation


    char* load_source(const char* filename)
    {
    FILE* stream = NULL;
    size_t source_lenght;

    stream = fopen(filename, "rb");
    if(stream == 0)
    {
    return NULL;
    }

    fseek(stream, 0, SEEK_END);
    source_lenght = ftell(stream);
    fseek(stream, 0, SEEK_SET);
    size_t result =0;

    char* source_string = (char *)malloc(source_lenght + 1);
    result = fread(source_string, source_lenght, 1, stream);

    fclose(stream);

    source_string[source_lenght] = '\0';
    return source_string;
    }


    void showM(float* daneOut, int N) {

    int i=0;
    for(int i =0;i<N*N;i++) {
    if (i%N == 0) { printf("\n"); }
    printf("%f ", *(daneOut+i));
    }
    printf("\n");

    }

    int main(int argc, char **argv)
    {

    const char* optio = "-cl-opt-disable";

    int N = 4;
    int siz = sizeof(float)*N*N;

    GlobalWorkSize[0] = N;
    GlobalWorkSize[1] = N;
    LocalWorkSize[0] = 2;
    LocalWorkSize[1] = 2;

    float PI = 3.14159265;

    complex float a = cos(2*PI/N)-sin(2*PI/N)*I;
    complex float b;
    int k, w;

    ////////////////////////////////////////////////////////////////////////

    cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateContextFromType !!!\n\n");}

    ciErr1 = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
    cdDevices = (cl_device_id*)malloc(ParmDataBytes);
    ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, cdDevices, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clGetContextInfo !!!\n\n");}

    cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateCommandQueue !!!\n\n");}

    ////////////////////////////////////////////////////////////////////////

    float* daneInA;
    daneInA = (float*)malloc(siz);
    void * pointer_daneInA= (float *)daneInA;
    for(int j=0;j<N*N;j++) {
    *(daneInA+j)= (float) rand()/RAND_MAX;
    }

    float* daneOutRe;
    daneOutRe = (float*)malloc(siz);
    void * pointer_daneOutRe= (float *)daneOutRe;
    for(int j=0;j<N*N;j++) {
    *(daneOutRe+j)= 0;
    }

    float* daneOutIm;
    daneOutIm = (float*)malloc(siz);
    void * pointer_daneOutIm= (float *)daneOutIm;
    for(int j=0;j<N*N;j++) {
    *(daneOutIm+j)=0;
    }

    // Wre
    float* data_fft_Wre;
    data_fft_Wre = (float*)malloc(siz);
    void * pointer_fft_Wre= (float *)data_fft_Wre;
    for(k=0;k<N;k++) {
    for(w=0;w<N;w++) {
    b = k*w;
    *(data_fft_Wre+(k+w*N))=crealf( cpow( a,b ) );
    }
    }

    // Wim
    float* data_fft_Wim;
    data_fft_Wim = (float*)malloc(siz);
    void * pointer_fft_Wim= (float *)data_fft_Wim;
    for(k=0;k<N;k++) {
    for(w=0;w<N;w++) {
    b = k*w;
    *(data_fft_Wim+(k+w*N))=cimagf( cpow( a,b ) );
    }
    }

    // Mre
    float* data_fft_Mre;
    data_fft_Mre = (float*)malloc(siz);
    void * pointer_fft_Mre= (float *)data_fft_Mre;
    for(k=0;k<N;k++) {
    for(w=0;w<N;w++) {
    b = -k*w;
    *(data_fft_Mre+(k+w*N))=crealf( cpow( a,b ) )/N;
    }
    }

    // Mim
    float* data_fft_Mim;
    data_fft_Mim = (float*)malloc(siz);
    void * pointer_fft_Mim= (float *)data_fft_Mim;
    for(k=0;k<N;k++) {
    for(w=0;w<N;w++) {
    b = -k*w;
    *(data_fft_Mim+(k+w*N))=cimagf( cpow( a,b ) )/N;
    }
    }

    float* temp1;
    temp1 = (float*)malloc(siz);
    void * pointer_temp1= (float *)temp1;
    for(int j=0;j<N*N;j++) {
    *(temp1+j)=0;
    }

    float* temp2;
    temp2 = (float*)malloc(siz);
    void * pointer_temp2= (float *)temp2;
    for(int j=0;j<N*N;j++) {
    *(temp2+j)=0;
    }


    cl_mem Buffer_daneInA = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneIn !!!\n\n");}
    cl_mem Buffer_daneOutRe = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut!!!\n\n");}
    cl_mem Buffer_daneOutIm = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}

    cl_mem Buffer_fft_Wre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}
    cl_mem Buffer_fft_Wim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}
    cl_mem Buffer_fft_Mre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}
    cl_mem Buffer_fft_Mim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}

    cl_mem Buffer_temp1 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}
    cl_mem Buffer_temp2 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!\n\n");}

    ////////////////////////////////////////////////////////////////////////

    cSourceCL = load_source(cSourceFile);

    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, NULL, &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateProgramWithSource !!!\n\n");}

    ciErr1 = clBuildProgram(cpProgram, 0, NULL, optio, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clBuildProgram !!!\n\n");}

    ckKernel = clCreateKernel(cpProgram, "mul", &ciErr1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateKernel !!!\n\n");}

    ////////////////////////////////////////////////////////////////////////

    ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&Buffer_daneInA);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&Buffer_daneOutRe);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&Buffer_daneOutIm);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&N);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}

    ciErr1 = clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&Buffer_fft_Wre);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&Buffer_fft_Wre);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&Buffer_fft_Mre);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&Buffer_fft_Mim);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}

    ciErr1 = clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&Buffer_temp1);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ciErr1 = clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&Buffer_temp2);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!\n\n");}
    ////////////////////////////////////////////////////////////////////////

    ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_daneInA, CL_TRUE, 0, siz, pointer_daneInA, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!\n\n");}

    ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wre, CL_TRUE, 0, siz, pointer_fft_Wre, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!\n\n");}
    ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wim, CL_TRUE, 0, siz, pointer_fft_Wim, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!\n\n");}
    ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mre, CL_TRUE, 0, siz, pointer_fft_Mre, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!\n\n");}
    ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mim, CL_TRUE, 0, siz, pointer_fft_Mim, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!\n\n");}


    ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
    clFinish(cqCommandQue);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueNDRangeKernel !!!\n\n");}

    ciErr1 = clEnqueueReadBuffer(cqCommandQue, Buffer_daneOutRe, CL_TRUE, 0, siz, pointer_daneOutRe, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueReadBuffer !!!\n\n");}

    showM(daneInA, N);
    showM(daneOutRe, N);

    }


  2. #2
    Senior Member
    Join Date
    Jul 2009
    Location
    Northern Europe
    Posts
    311

    Re: low float precision - only on parts of the output data

    The first thing that comes to mind are data races. Are you sure that your barrier() in your kernel is not trying to cross work-groups? Your work-group size is 2x2, so that barrier will only synchronize 4 work-items. If that is what you intended then I don't have any other immediate thoughts.

  3. #3
    Junior Member
    Join Date
    Feb 2010
    Posts
    7

    Re: low float precision - only on parts of the output data

    i think about algoritm again and it seems to me that the synchronization in algorithm is not needed - they are indeed multiple reads of the same buffer, but always for a given pair global_id and local_id (in the sense of two-dimensional), there is only one record to the output buffer. what is more without the use of barriers results are the same ( incorrect as in the beggining).

    I tried to divide the two 'for' loops from one kernel into two separate ( fft and ifft) lets say ker1 and ker2.
    results from ker1 are correct but results from ker2 are not, and i have no clue why.

    row1 and row3 from ker2 are incorrect, also row1==row3. only row0 and row3 are correct. this occurs independent to local size - i tested it with local (1,1), (2,2) and (4,4). global size remains the same - (4,4).

    any ideas of how to solve it ?

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •