Hi,

I am just playing around with Apple's OpenCL FFT code and added the following optimization:

Instead of calculating "(dir*2.0f*M_PI*j/64)" over and over again, I cached the result in a variable and reuse it in each subsequent step.

But now the code runs SLOWER than before!

What might be the reason?

original code:
/*
 * fft1 kernel excerpt as posted; each "..." line marks code left out of the post.
 *
 * The visible section handles steps k = 1..7. For each step it re-evaluates the
 * angle expression dir*2.0f*M_PI*j/64*k, builds the complex factor
 * w = (native_cos(ang), native_sin(ang)), and (for k = 1..6) multiplies a[k]
 * by w through complexMul.
 *
 * ang, w, a, j and complexMul are not declared in the visible lines.
 */
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S)
{
...
// Step 1. The expression is evaluated left to right, so the step index (*1)
// is applied last, after the division by 64.
ang = dir*2.0f*M_PI*j/64*1;
w = (float2)(native_cos(ang), native_sin(ang));
a[1] = complexMul(a[1], w);
// Step 2: same expression, only the trailing step index changes.
ang = dir*2.0f*M_PI*j/64*2;
w = (float2)(native_cos(ang), native_sin(ang));
a[2] = complexMul(a[2], w);
// Step 3.
ang = dir*2.0f*M_PI*j/64*3;
w = (float2)(native_cos(ang), native_sin(ang));
a[3] = complexMul(a[3], w);
// Step 4.
ang = dir*2.0f*M_PI*j/64*4;
w = (float2)(native_cos(ang), native_sin(ang));
a[4] = complexMul(a[4], w);
// Step 5.
ang = dir*2.0f*M_PI*j/64*5;
w = (float2)(native_cos(ang), native_sin(ang));
a[5] = complexMul(a[5], w);
// Step 6.
ang = dir*2.0f*M_PI*j/64*6;
w = (float2)(native_cos(ang), native_sin(ang));
a[6] = complexMul(a[6], w);
// Step 7: w is computed here; its use is not visible (presumably in the
// omitted code that follows).
ang = dir*2.0f*M_PI*j/64*7;
w = (float2)(native_cos(ang), native_sin(ang));
...
}

my optimization:
/*
 * fft1 kernel excerpt with the poster's change; each "..." line marks code
 * left out of the post.
 *
 * The part of the angle expression that does not depend on the step index,
 * dir*2.0f*M_PI*j/64, is evaluated once into cached_multiplicator. Steps
 * k = 1..7 then derive ang from that value, build
 * w = (native_cos(ang), native_sin(ang)), and (for k = 1..6) multiply a[k]
 * by w through complexMul.
 *
 * ang, w, a, j and complexMul are not declared in the visible lines.
 */
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S)
{
...
// Declared float, so the product below is stored as a float before any
// per-step scaling is applied.
float cached_multiplicator;
cached_multiplicator = dir*2.0f*M_PI*j/64;

// Step 1 uses the cached value directly (no multiplication by 1).
ang = cached_multiplicator;
w = (float2)(native_cos(ang), native_sin(ang));
a[1] = complexMul(a[1], w);
// Steps 2..7 scale the cached value by the integer step index.
ang = cached_multiplicator*2;
w = (float2)(native_cos(ang), native_sin(ang));
a[2] = complexMul(a[2], w);
// Step 3.
ang = cached_multiplicator*3;
w = (float2)(native_cos(ang), native_sin(ang));
a[3] = complexMul(a[3], w);
// Step 4.
ang = cached_multiplicator*4;
w = (float2)(native_cos(ang), native_sin(ang));
a[4] = complexMul(a[4], w);
// Step 5.
ang = cached_multiplicator*5;
w = (float2)(native_cos(ang), native_sin(ang));
a[5] = complexMul(a[5], w);
// Step 6.
ang = cached_multiplicator*6;
w = (float2)(native_cos(ang), native_sin(ang));
a[6] = complexMul(a[6], w);
// Step 7: w is computed here; its use is not visible (presumably in the
// omitted code that follows).
ang = cached_multiplicator*7;
w = (float2)(native_cos(ang), native_sin(ang));
...
}