Hi, I try to porting function on the OpenCL kernel, but compiler gave me next error:
cvmsErrorCompilerFailure: LLVM compiler has failed to compile a function.
Here is my kernel code:
Code :
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
 
//##############################################################################
// Utils function
//##############################################################################
void mmemcpy(unsigned char *dst, const unsigned char *src, size_t len) {
    for (size_t i = 0; i < len; i++)
        dst[i] = src[i];
}
 
 
//##############################################################################
// Tree hashing constants definition
//##############################################################################
/* tweak word T[1]: bit field starting positions */
#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
 
#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
 
/* tweak word T[1]: flag bit definition(s) */
#define SKEIN_T1_FLAG_FIRST     (((unsigned long)  1 ) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_FINAL     (((unsigned long)  1 ) << SKEIN_T1_POS_FINAL)
#define SKEIN_T1_FLAG_BIT_PAD   (((unsigned long)  1 ) << SKEIN_T1_POS_BIT_PAD)
 
//##############################################################################
// Skein macros
//##############################################################################
#ifndef RotL_64
#define RotL_64(x, N)    (((x) << (N)) | ((x) >> (64-(N))))
#endif
 
//##############################################################################
// Skein block
//##############################################################################
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
#endif
 
#ifndef SKEIN_LOOP
#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
#endif
 
#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
#define KW_TWK_BASE     (0)
#define KW_KEY_BASE     (3)
#define ks              (kw + KW_KEY_BASE)                
#define ts              (kw + KW_TWK_BASE)
 
//##############################################################################
// Port settings
//##############################################################################
/* Platform settings */
#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
 
/* Define platfor here */
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
 
#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
/* here for big-endian */
#define SKEIN_NEED_SWAP (1)
#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
/* here for little-endian */
#define SKEIN_NEED_SWAP (0)
#if PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
#define Skein_Put64_LSB_First(dst08, src64, bCnt) mmemcpy(dst08, src64, bCnt)
#define Skein_Get64_LSB_First(dst64, src08, wCnt) mmemcpy(dst64, src08, 8*(wCnt))
#endif
#endif
 
//##############################################################################
// Skein.h
//##############################################################################
enum {   
    /* Skein_256 round rotation constants */
    R_256_0_0=14, R_256_0_1=16,
    R_256_1_0=52, R_256_1_1=57,
    R_256_2_0=23, R_256_2_1=40,
    R_256_3_0= 5, R_256_3_1=37,
    R_256_4_0=25, R_256_4_1=33,
    R_256_5_0=46, R_256_5_1=12,
    R_256_6_0=58, R_256_6_1=22,
    R_256_7_0=32, R_256_7_1=32,
 
    /* Skein_512 round rotation constants */
    R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
    R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
    R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
    R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
    R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
    R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
    R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
    R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
 
    /* Skein1024 round rotation constants */
    R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
    R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
    R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
    R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
    R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
    R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
    R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
    R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
};
 
#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
#endif
 
#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((unsigned long) (hi32)) << 32))
#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
 
//##############################################################################
// Skein structures
//##############################################################################
enum {
    SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
    SKEIN_FAIL            =      1,
    SKEIN_BAD_HASHLEN     =      2
};
 
#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */
 
#define  SKEIN_256_STATE_WORDS ( 4)
#define  SKEIN_512_STATE_WORDS ( 8)
#define  SKEIN1024_STATE_WORDS (16)
#define  SKEIN_MAX_STATE_WORDS (16)
 
#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
 
#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)
 
#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
 
#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
#define SKEIN_512_ROUNDS_TOTAL (72)
#define SKEIN1024_ROUNDS_TOTAL (80)
 
typedef struct {
    unsigned long  hashBitLen;                             /* size of hash result, in bits */
    unsigned long  bCnt;                                   /* current byte count in buffer b[] */
    unsigned long  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
} Skein_Ctxt_Hdr_t;
 
typedef struct {                                    /*  256-bit Skein hash context structure */
    Skein_Ctxt_Hdr_t h;                             /* common header context variables */
    unsigned long  X[SKEIN_256_STATE_WORDS];        /* chaining variables */
    unsigned char  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
} Skein_256_Ctxt_t;
 
//##############################################################################
// 256-bit Skein 
//##############################################################################
/*****************************  Skein_256 ******************************/
void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const unsigned *blkPtr, unsigned long blkCnt, unsigned long byteCntAdd) { /* do it in C */
 
    enum {
        WCNT = SKEIN_256_STATE_WORDS
    };
#undef  RCNT
#define RCNT (SKEIN_256_ROUNDS_TOTAL/8)
 
#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
#else
#define SKEIN_UNROLL_256 (0)
#endif
 
#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
#endif
    unsigned long  r;
    unsigned long  kw[WCNT+4+RCNT*2];           /* key schedule words : chaining vars + tweak + "rotation"*/
#else
    unsigned long  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
#endif
    unsigned long  X0, X1, X2, X3;                        /* local copy of context vars, for speed */
    unsigned long  w[WCNT];                           /* local copy of input block */
 
   /* never call with blkCnt == 0! */
    if (!(blkCnt != 0))
        return;
 
    ts[0] = ctx->h.T[0];
    ts[1] = ctx->h.T[1];
    do  {
        /* this implementation only supports 2**64 input bytes (no carry out here) */
        ts[0] += byteCntAdd;                    /* update processed length */
 
        /* precompute the key schedule for this block */
        ks[0] = ctx->X[0];     
        ks[1] = ctx->X[1];
        ks[2] = ctx->X[2];
        ks[3] = ctx->X[3];
        ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
 
        ts[2] = ts[0] ^ ts[1];
 
        Skein_Get64_LSB_First(w, blkPtr, WCNT);   /* get input block in little-endian format */
 
        X0 = w[0] + ks[0];                      /* do the first full key injection */
        X1 = w[1] + ks[1] + ts[0];
        X2 = w[2] + ks[2] + ts[1];
        X3 = w[3] + ks[3];
 
        blkPtr += SKEIN_256_BLOCK_BYTES;
 
        /* run the rounds */
 
// IN THIS MACRO WHEN I EXCHANGED X##p1 and X##p in RotL_64 for numbers 1, 2, THEN COMPILATION IS OK
#define Round256(p0, p1, p2, p3, ROT, rNum)                     \
X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
 
#if SKEIN_UNROLL_256 == 0                       
#define R256(p0, p1, p2, p3, ROT, rNum)  /* fully unrolled */   \
Round256(p0, p1, p2, p3, ROT, rNum)
 
#define I256(R)                                                 \
X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
X3   += ks[((R)+4) % 5] +     (R)+1;
#else                                       /* looping version */
#define R256(p0, p1, p2, p3, ROT, rNum)                         \
Round256(p0, p1, p2, p3, ROT, rNum)                             \
 
#define I256(R)                                                 \
X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
X3   += ks[r+(R)+3] +    r+(R)   ;                              \
ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
ts[r + (R)+2    ]   = ts[r+(R)-1];                              
 
        for (r = 1; r < 2*RCNT; r += 2*SKEIN_UNROLL_256)  /* loop thru it */
#endif  
        {    
#define R256_8_rounds(R)          \
R256(0,1,2,3,R_256_0,8*(R) + 1);  \
R256(0,3,2,1,R_256_1,8*(R) + 2);  \
R256(0,1,2,3,R_256_2,8*(R) + 3);  \
R256(0,3,2,1,R_256_3,8*(R) + 4);  \
I256(2*(R));                      \
R256(0,1,2,3,R_256_4,8*(R) + 5);  \
R256(0,3,2,1,R_256_5,8*(R) + 6);  \
R256(0,1,2,3,R_256_6,8*(R) + 7);  \
R256(0,3,2,1,R_256_7,8*(R) + 8);  \
I256(2*(R)+1);
 
            R256_8_rounds( 0);
 
#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
 
#if   R256_Unroll_R( 1)
            R256_8_rounds( 1);
#endif
#if   R256_Unroll_R( 2)
            R256_8_rounds( 2);
#endif
#if   R256_Unroll_R( 3)
            R256_8_rounds( 3);
#endif
#if   R256_Unroll_R( 4)
            R256_8_rounds( 4);
#endif
#if   R256_Unroll_R( 5)
            R256_8_rounds( 5);
#endif
#if   R256_Unroll_R( 6)
            R256_8_rounds( 6);
#endif
#if   R256_Unroll_R( 7)
            R256_8_rounds( 7);
#endif
#if   R256_Unroll_R( 8)
            R256_8_rounds( 8);
#endif
#if   R256_Unroll_R( 9)
            R256_8_rounds( 9);
#endif
#if   R256_Unroll_R(10)
            R256_8_rounds(10);
#endif
#if   R256_Unroll_R(11)
            R256_8_rounds(11);
#endif
#if   R256_Unroll_R(12)
            R256_8_rounds(12);
#endif
#if   R256_Unroll_R(13)
            R256_8_rounds(13);
#endif
#if   R256_Unroll_R(14)
            R256_8_rounds(14);
#endif
#if  (SKEIN_UNROLL_256 > 14)
#error  "need more unrolling in Skein_256_Process_Block"
#endif
        }
 
        // WHEN I COMMENT NEXT 4 LINES, THEN COMPILATION IS OK
        ctx->X[0] = X0 ^ w[0]; 
        ctx->X[1] = X1 ^ w[1];
        ctx->X[2] = X2 ^ w[2];
        ctx->X[3] = X3 ^ w[3];
 
        ts[1] &= ~SKEIN_T1_FLAG_FIRST;
    }
    while (--blkCnt);
    ctx->h.T[0] = ts[0];
    ctx->h.T[1] = ts[1];
}
 
__kernel void update( __global Skein_256_Ctxt_t *gctx) {
    int glId = get_global_id(0);
    int lId = get_local_id(0);
    int grId = get_group_id(0);
 
    // TODO now is one hashState per block, try one hs per thread to accelerate computing
    gctx += grId;
 
    Skein_256_Ctxt_t ctx;
    ctx.h.hashBitLen = 256;
    Skein_256_Process_Block(&ctx, &ctx.b, 1, SKEIN_256_BLOCK_BYTES);
    *gctx = ctx;
}

When I comment marked line (I wrote this line in capital letters) or change the values (not both) the compilation is ok. Can anyone help me?