Results 1 to 7 of 7

Thread: Different result between AMD and Nvidea devices.

  1. #1
    Junior Member
    Join Date
    Apr 2014
    Posts
    5

    Question Different result between AMD and Nvidea devices.

    Hi all,
    My kernel works well on Nvidea GTX 560 GPU device but works error on AMD A7970 device. The source code list bellow.
    A7970's outputs is 0xd7 eb 6a d7 d7 eb 6a d7, but GTX560's is 0xd7 eb 6a d7 05 b5 30 ad.
    Where the problem appear. :confused:


    // kernel source code -------------------------------------------

    typedef union USHA1_type
    {
    unsigned int sha1uint[5];
    unsigned char sha1uchar[20];
    }USHA1_t;

    inline uint SWAP32(uint x)
    {
    x = rotate(x, 16U);
    return ((x & 0x00FF00FF) << 8) + ((x >> 8) & 0x00FF00FF);
    }

    //sha1 ==================
    #define K1 0x5A827999
    #define K2 0x6ED9EBA1
    #define K3 0x8F1BBCDC
    #define K4 0xCA62C1D6

    #define H1 0x67452301
    #define H2 0xEFCDAB89
    #define H3 0x98BADCFE
    #define H4 0x10325476
    #define H5 0xC3D2E1F0

    #define F1(x,y,z) (z ^ (x & (y ^ z)))
    #define F2(x,y,z) (x ^ y ^ z)
    #define F3(x,y,z) ((x & y) | (z & (x | y)))
    #define F4(x,y,z) (x ^ y ^ z)


    #define R(t) (temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ W[(t - 14) & 0x0F] ^ W[t & 0x0F], ( W[t & 0x0F] = rotate((int)temp,1) ) )

    #define P1(a,b,c,d,e,x) \
    { \
    e += rotate((int)a,5) + F1(b,c,d) + K1 + x; b = rotate((int)b,30);\
    }

    #define P2(a,b,c,d,e,x) \
    { \
    e += rotate((int)a,5) + F2(b,c,d) + K2 + x; b = rotate((int)b,30);\
    }
    #define P3(a,b,c,d,e,x) \
    { \
    e += rotate((int)a,5) + F3(b,c,d) + K3 + x; b = rotate((int)b,30);\
    }
    #define P4(a,b,c,d,e,x) \
    { \
    e += rotate((int)a,5) + F4(b,c,d) + K4 + x; b = rotate((int)b,30);\
    }

    //1-63 BYTES sha1
    inline void sha1_crypt(__private unsigned char *plain, unsigned int plainlen, __private unsigned int *digest)
    {
    int t;
    int stop, mmod;
    unsigned int i, ulen;
    unsigned int W[16] = {0};
    unsigned int temp, A,B,C,D,E;

    A = H1;
    B = H2;
    C = H3;
    D = H4;
    E = H5;

    for (t = 1; t < 15; t++)
    {
    W[t] = 0x00000000;
    }

    i = plainlen;

    stop = i / 4 ;
    for (t = 0 ; t < stop ; t++){
    W[t] = ((unsigned char) plain[t * 4]) << 24;
    W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
    W[t] |= ((unsigned char) plain[t * 4 + 2]) << 8;
    W[t] |= (unsigned char) plain[t * 4 + 3];
    }
    mmod = i % 4;
    if ( mmod == 3){
    W[t] = ((unsigned char) plain[t * 4]) << 24;
    W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
    W[t] |= ((unsigned char) plain[t * 4 + 2]) << 8;
    W[t] |= ((unsigned char) 0x80) ;
    } else if (mmod == 2) {
    W[t] = ((unsigned char) plain[t * 4]) << 24;
    W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
    W[t] |= 0x8000 ;
    } else if (mmod == 1) {
    W[t] = ((unsigned char) plain[t * 4]) << 24;
    W[t] |= 0x800000 ;
    } else /*if (mmod == 0)*/ {
    W[t] = 0x80000000 ;
    }
    ulen = (i * 8) & 0xFFFFFFFF;
    W[15] = ulen ;


    P1( A, B, C, D, E, W[0] );
    P1( E, A, B, C, D, W[1] );
    P1( D, E, A, B, C, W[2] );
    P1( C, D, E, A, B, W[3] );
    P1( B, C, D, E, A, W[4] );
    P1( A, B, C, D, E, W[5] );
    P1( E, A, B, C, D, W[6] );
    P1( D, E, A, B, C, W[7] );
    P1( C, D, E, A, B, W[8] );
    P1( B, C, D, E, A, W[9] );
    P1( A, B, C, D, E, W[10] );
    P1( E, A, B, C, D, W[11] );
    P1( D, E, A, B, C, W[12] );
    P1( C, D, E, A, B, W[13] );
    P1( B, C, D, E, A, W[14] );
    P1( A, B, C, D, E, W[15] );
    P1( E, A, B, C, D, R(16) );
    P1( D, E, A, B, C, R(17) );
    P1( C, D, E, A, B, R(18) );
    P1( B, C, D, E, A, R(19) );

    P2( A, B, C, D, E, R(20) );
    P2( E, A, B, C, D, R(21) );
    P2( D, E, A, B, C, R(22) );
    P2( C, D, E, A, B, R(23) );
    P2( B, C, D, E, A, R(24) );
    P2( A, B, C, D, E, R(25) );
    P2( E, A, B, C, D, R(26) );
    P2( D, E, A, B, C, R(27) );
    P2( C, D, E, A, B, R(28) );
    P2( B, C, D, E, A, R(29) );
    P2( A, B, C, D, E, R(30) );
    P2( E, A, B, C, D, R(31) );
    P2( D, E, A, B, C, R(32) );
    P2( C, D, E, A, B, R(33) );
    P2( B, C, D, E, A, R(34) );
    P2( A, B, C, D, E, R(35) );
    P2( E, A, B, C, D, R(36) );
    P2( D, E, A, B, C, R(37) );
    P2( C, D, E, A, B, R(38) );
    P2( B, C, D, E, A, R(39) );

    P3( A, B, C, D, E, R(40) );
    P3( E, A, B, C, D, R(41) );
    P3( D, E, A, B, C, R(42) );
    P3( C, D, E, A, B, R(43) );
    P3( B, C, D, E, A, R(44) );
    P3( A, B, C, D, E, R(45) );
    P3( E, A, B, C, D, R(46) );
    P3( D, E, A, B, C, R(47) );
    P3( C, D, E, A, B, R(48) );
    P3( B, C, D, E, A, R(49) );
    P3( A, B, C, D, E, R(50) );
    P3( E, A, B, C, D, R(51) );
    P3( D, E, A, B, C, R(52) );
    P3( C, D, E, A, B, R(53) );
    P3( B, C, D, E, A, R(54) );
    P3( A, B, C, D, E, R(55) );
    P3( E, A, B, C, D, R(56) );
    P3( D, E, A, B, C, R(57) );
    P3( C, D, E, A, B, R(58) );
    P3( B, C, D, E, A, R(59) );


    P4( A, B, C, D, E, R(60) );
    P4( E, A, B, C, D, R(61) );
    P4( D, E, A, B, C, R(62) );
    P4( C, D, E, A, B, R(63) );
    P4( B, C, D, E, A, R(64) );
    P4( A, B, C, D, E, R(65) );
    P4( E, A, B, C, D, R(66) );
    P4( D, E, A, B, C, R(67) );
    P4( C, D, E, A, B, R(68) );
    P4( B, C, D, E, A, R(69) );
    P4( A, B, C, D, E, R(70) );
    P4( E, A, B, C, D, R(71) );
    P4( D, E, A, B, C, R(72) );
    P4( C, D, E, A, B, R(73) );
    P4( B, C, D, E, A, R(74) );
    P4( A, B, C, D, E, R(75) );
    P4( E, A, B, C, D, R(76) );
    P4( D, E, A, B, C, R(77) );
    P4( C, D, E, A, B, R(78) );
    P4( B, C, D, E, A, R(79) );

    digest[0] = SWAP32(A + H1);
    digest[1] = SWAP32(B + H2);
    digest[2] = SWAP32(C + H3);
    digest[3] = SWAP32(D + H4);
    digest[4] = SWAP32(E + H5);
    }

    __kernel void test_sha1_kernel(__global unsigned int* gout)
    {
    unsigned int id = get_global_id(0);
    unsigned int i = 0;

    //two input data
    unsigned char InData1[16] = {0};
    unsigned char InData2[16] = {0};

    //two calout data
    USHA1_t sha1out1;
    USHA1_t sha1out2;

    //init data
    for(i = 0; i < 5; i ++)
    {
    sha1out1.sha1uint[i] = 0;
    sha1out2.sha1uint[i] = 0;
    }
    for(i = 0; i < 16; i ++)
    {
    InData1[i] = 0x03;
    InData2[i] = 0x38;
    }

    //two out temp
    unsigned char out1[4] = {0};
    unsigned char out2[4] = {0};
    for(i = 0; i < 4; i++)
    {
    out1[i] = 0;
    out2[i] = 0;
    }

    //cal 1
    unsigned int *psha1out1 = (unsigned int *)(sha1out1.sha1uint);
    sha1_crypt(InData1, 8, psha1out1);
    sha1_crypt(InData1, 8, psha1out1);

    //save output1
    for(i = 0; i < 4; i++)
    {
    out1[i] = sha1out1.sha1uchar[i];
    }

    //cal 2
    unsigned int *psha1out2 = (unsigned int *)(sha1out2.sha1uint);
    sha1_crypt(InData2, 8, psha1out2);

    //save output2
    for(i = 0; i < 4; i++)
    {
    out2[i] = sha1out2.sha1uchar[i];
    }

    //out to cpu
    if(id == 0)
    {
    gout[0] = (unsigned int)out1[0];
    gout[1] = (unsigned int)out1[1];
    gout[2] = (unsigned int)out1[2];
    gout[3] = (unsigned int)out1[3];

    gout[4] = (unsigned int)out2[0];
    gout[5] = (unsigned int)out2[1];
    gout[6] = (unsigned int)out2[2];
    gout[7] = (unsigned int)out2[3];
    }


    }

  2. #2
    Senior Member
    Join Date
    Oct 2012
    Posts
    105
    Try to cast plain to unsigned int instead of unsigned char, such as:

    W[t] = ((unsigned int) plain[t * 4]) << 24;

    and so on...

  3. #3
    Junior Member
    Join Date
    Apr 2014
    Posts
    5
    Quote Originally Posted by utnapishtim View Post
    Try to cast plain to unsigned int instead of unsigned char, such as:

    W[t] = ((unsigned int) plain[t * 4]) << 24;

    and so on...
    Thank you very much, I tried it.

    for (t = 0 ; t < stop ; t++){
    W[t] = ((unsigned int)plain[t * 4]) << 24;
    W[t] |= ((unsigned int)plain[t * 4 + 1]) << 16;
    W[t] |= ((unsigned int)plain[t * 4 + 2]) << 8;
    W[t] |= (unsigned int) plain[t * 4 + 3];
    }
    mmod = i % 4;
    if ( mmod == 3){
    W[t] = ((unsigned int) plain[t * 4]) << 24;
    W[t] |= ((unsigned int)plain[t * 4 + 1]) << 16;
    W[t] |= ((unsigned int)plain[t * 4 + 2]) << 8;
    W[t] |= ((unsigned int)0x80) ;
    } else if (mmod == 2) {
    W[t] = ((unsigned int) plain[t * 4]) << 24;
    W[t] |= ((unsigned int)plain[t * 4 + 1]) << 16;
    W[t] |= 0x8000 ;
    } else if (mmod == 1) {
    W[t] = ((unsigned int) plain[t * 4]) << 24;
    W[t] |= 0x800000 ;
    } else /*if (mmod == 0)*/ {
    W[t] = 0x80000000 ;
    }


    But no change happen.

  4. #4
    Junior Member
    Join Date
    Apr 2014
    Posts
    5
    I debug the kernel using codexl.

    On device A7970 the screenshot shows when it runs to the breakpoint "if(id == 0)" , the value of out2 is 0x05 but actually I gets 0xd7 in the cpu memory.

    1.jpg

  5. #5
    Senior Member
    Join Date
    Oct 2012
    Posts
    105
    I've checked on NVIDIA GPU, AMD GPU and Intel CPU and your kernel is fine.

    How do you get the result from the device buffer on the host side?

  6. #6
    Junior Member
    Join Date
    Apr 2014
    Posts
    5
    Quote Originally Posted by utnapishtim View Post
    I've checked on NVIDIA GPU, AMD GPU and Intel CPU and your kernel is fine.

    How do you get the result from the device buffer on the host side?
    Thanks, my host side code:

    #include "stdafx.h"

    #include <CL/cl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <iostream>
    #include <cstring>
    #include <string>
    #include <fstream>

    #define SUCCESS 0
    #define FAILURE 1
    #define EXPECTED_FAILURE 2

    #define GlobalThreadSize 256
    #define GroupSize 64

    #define OPENCLBUILDOPTIONS "-cl-opt-disable"
    //#define OPENCLBUILDOPTIONS NULL

    /* convert the kernel file into a string */
    int convertToString(const char *filename, std::string& s)
    {
    size_t size;
    char* str;
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));

    if(f.is_open())
    {
    size_t fileSize;
    f.seekg(0, std::fstream::end);
    size = fileSize = (size_t)f.tellg();
    f.seekg(0, std::fstream::beg);
    str = new char[size+1];
    if(!str)
    {
    f.close();
    return SUCCESS;
    }

    f.read(str, fileSize);
    f.close();
    str[size] = '\0';
    s = str;
    delete[] str;
    return SUCCESS;
    }
    std::cout<<"Error: failed to open file\n:"<<filename<<std::endl;
    return FAILURE;
    }

    int _tmain(int argc, char* argv[])
    {

    cl_int status = 0;//store the return status

    cl_uint numPlatforms;//store the number of platforms query by clGetPlatformIDs()

    cl_platform_id platform = NULL;//store the chosen platform

    //get platform
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Getting platforms!"<<std::endl;
    return FAILURE;
    }

    if (numPlatforms > 0)
    {
    cl_platform_id* platforms = (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id));
    status=clGetPlatformIDs(numPlatforms,platforms,NUL L);
    platform=platforms[0];
    free(platforms);
    }

    if (NULL == platform)
    {
    std::cout<<"Error: No available platform found!"<<std::endl;
    return FAILURE;
    }


    /* Query the context and get the available devices */
    cl_uint numDevice=0;
    status=clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL, 0,NULL,&numDevice);
    cl_device_id *devices=(cl_device_id*)malloc(numDevice*sizeof(cl _device_id));
    if (devices == 0)
    {
    std::cout << "No device available\n";
    return FAILURE;
    }
    clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,numDevi ce,devices,NULL);

    /* Create Context using the platform selected above */
    cl_context context=clCreateContext(NULL,numDevice,devices,NUL L,NULL,NULL);

    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Creating context failed!"<<std::endl;
    return FAILURE;
    }

    /*
    *The API clCreateCommandQueue creates a command-queue on a specific device.
    */
    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Creating command queue failed!"<<std::endl;
    return FAILURE;
    }

    //set input data
    cl_uint *output = (cl_uint *) malloc( sizeof(cl_uint) * GlobalThreadSize);

    //create output buffer
    cl_mem outputBuffer = clCreateBuffer(
    context,
    CL_MEM_WRITE_ONLY,
    sizeof(cl_uint) * GlobalThreadSize,
    NULL,
    &status);
    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Creating output buffer failed!"<<std::endl;
    return FAILURE;
    }

    //get the printf kernel
    const char* filename = "./sha1_Kernel.cl";
    std::string sourceStr;
    status = convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] ={strlen(source)};
    //
    //create program
    cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Creating program object failed!"<<std::endl;
    return FAILURE;
    }

    //build program with the command line option '-g' so we can debug kernel
    status = clBuildProgram(program,1, devices, OPENCLBUILDOPTIONS, NULL, NULL);

    char opencl_log[65536];
    clGetProgramBuildInfo(program, *devices, CL_PROGRAM_BUILD_LOG, sizeof(opencl_log), (void *) opencl_log, NULL);

    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Building program failed!"<<std::endl;
    return FAILURE;
    }

    //create printf kernel
    cl_kernel kernel = clCreateKernel(program, "test_sha1_kernel", &status);
    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Creating kernel failed!"<<std::endl;
    return FAILURE;
    }

    //set args
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);

    size_t global_threads[1];
    size_t local_threads[1];
    global_threads[0] = GlobalThreadSize;
    local_threads[0] = GroupSize;

    //execute the kernel
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL);
    if (status != CL_SUCCESS)
    {
    std::cout<<"Error: Enqueue kernel onto command queue failed!"<<std::endl;
    return FAILURE;
    }
    status = clFinish(commandQueue);

    memset(output, 0, sizeof(cl_uint) * GlobalThreadSize);

    status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint)*GlobalThreadSize, output, 0, NULL, NULL);

    for(int i = 0; i < 8; i++)
    printf("%02X ", output[i]);

    // Clean the resources.
    status = clReleaseKernel(kernel);//Release kernel.
    status = clReleaseMemObject(outputBuffer);
    status = clReleaseProgram(program);//Release program.
    status = clReleaseCommandQueue(commandQueue);//Release command queue.
    status = clReleaseContext(context);//Release context.

    if (devices != NULL)
    {
    free(devices);
    devices = NULL;
    }

    free(output);

    scanf_s("%c");
    return SUCCESS;
    }

  7. #7
    Junior Member
    Join Date
    Apr 2014
    Posts
    5
    If do not use build option "-cl-opt-disable", the result seems to be right.

    In my mind disable-opt will be slow but more stable, it's a strange case.

    But in the case of enable-opt, there are also many problems can not be understood. Development is really very tough on the AMD 7970 device.

    When using the AMD device development,whether there is a need for special attention?

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •