lordcoste

04-28-2011, 05:27 PM

Hello again, I'm using OpenCL to find the nearest neighbour between two set of 3D points.

This time I'm using kd-tree for the model.

First I build the kd-tree and then I pass it to the GPU.

I'm representing the tree as an implicit data structure (array) so I don't need to use pointer (left and right child) during the search on the kd-tree.

I'm using this properties to represent the tree as an array:

root: 0

left: index*2+1

right: index*2+2

http://i.imgur.com/lQQ1Q.png

The main problem is that kd-tree nearest neighbour is a recursive algorithm, so I've converted it to a non-recursive form based on a stack.

Here my non-optimized kernel:

float distanza(float4 a, float4 b)

{

float dx, dy, dz;

dx = a.x-b.x;

dx *= dx;

dy = a.y-b.y;

dy *= dy;

dz = a.z-b.z;

dz *= dz;

return dx+dy+dz;

}

bool isNull(float4 p)

{

if(p.w==-1)

return true;

return false;

}

int findLeaf(__global float4 *model, float4 qPoint, int model_size, int cur)

{

int best_id = -1;

int asse;

while(cur <= model_size)

{

asse = model[cur].w;

if(qPoint[asse] < model[cur][asse])

{

if(cur*2+1>=model_size || isNull(model[cur*2+1]))

{

if(cur*2+2>=model_size || isNull(model[cur*2+2]))

{

best_id = cur;

break;

}

else

{

cur = cur*2+2;

}

}

else

{

cur = cur*2+1;

}

}

else

{

if(cur*2+2>=model_size || isNull(model[cur*2+2]))

{

if(cur*2+1>=model_size || isNull(model[cur*2+1]))

{

best_id = cur;

break;

}

else

{

cur = cur*2+1;

}

}

else

{

cur = cur*2+2;

}

}

}

return best_id;

}

__kernel void

nearest_neighbour(__global float4 *model,

__global float4 *dataset,

__global int *nearest,

const int model_size)

{

int g_dataset_id = get_global_id(0);

float4 qPoint = dataset[g_dataset_id];

int stack[7]; // 7 is enough for the number of points in my model

int top = 0;

stack[top] = -1;

int node = findLeaf(model, qPoint, model_size, 0);

int nn = node;

int lastChild, asse, otherSide;

float bestDist = distanza(qPoint, model[node]);

while(node != 0)

{

lastChild = node;

node = (node - 1) / 2;

if(stack[top] == node)

{

--top;

}

else

{

float parentDist = distanza(qPoint, model[node]);

if(parentDist < bestDist)

{

bestDist = parentDist;

nn = node;

}

asse = model[node].w;

float testDist = model[node][asse] - qPoint[asse];

testDist = testDist * testDist;

if(testDist < bestDist)

{

if (node*2+1 == lastChild)

{

otherSide = node*2+2;

}

else

{

otherSide = node*2+1;

}

if(otherSide < model_size && !isNull(model[otherSide]))

{

int testNode = findLeaf(model, qPoint, model_size, otherSide);

testDist = distanza(qPoint, model[testNode]);

if(testDist < bestDist)

{

bestDist = testDist;

nn = testNode;

}

++top;

stack[top] = node;

node = testNode;

}

}

}

}

nearest[g_dataset_id] = nn;

}

The code return the correct nearest neighbour (except for few points but this is not a problem right now). The real problem is that is slower on the GPU then on the CPU.

Results (model 100.000 points and 100.000 query points):

http://i.imgur.com/dEUcA.png

Also it crash on ATI 5850.

What can I do to make it faster? I was thinking about storing the model in local memory, but I don't know if I can store 100.000 float4 on it, also I'm not sure if it will improve performance.

Any suggestion?

Thanks

Stefano

This time I'm using kd-tree for the model.

First I build the kd-tree and then I pass it to the GPU.

I'm representing the tree as an implicit data structure (array) so I don't need to use pointer (left and right child) during the search on the kd-tree.

I'm using this properties to represent the tree as an array:

root: 0

left: index*2+1

right: index*2+2

http://i.imgur.com/lQQ1Q.png

The main problem is that kd-tree nearest neighbour is a recursive algorithm, so I've converted it to a non-recursive form based on a stack.

Here my non-optimized kernel:

float distanza(float4 a, float4 b)

{

float dx, dy, dz;

dx = a.x-b.x;

dx *= dx;

dy = a.y-b.y;

dy *= dy;

dz = a.z-b.z;

dz *= dz;

return dx+dy+dz;

}

bool isNull(float4 p)

{

if(p.w==-1)

return true;

return false;

}

int findLeaf(__global float4 *model, float4 qPoint, int model_size, int cur)

{

int best_id = -1;

int asse;

while(cur <= model_size)

{

asse = model[cur].w;

if(qPoint[asse] < model[cur][asse])

{

if(cur*2+1>=model_size || isNull(model[cur*2+1]))

{

if(cur*2+2>=model_size || isNull(model[cur*2+2]))

{

best_id = cur;

break;

}

else

{

cur = cur*2+2;

}

}

else

{

cur = cur*2+1;

}

}

else

{

if(cur*2+2>=model_size || isNull(model[cur*2+2]))

{

if(cur*2+1>=model_size || isNull(model[cur*2+1]))

{

best_id = cur;

break;

}

else

{

cur = cur*2+1;

}

}

else

{

cur = cur*2+2;

}

}

}

return best_id;

}

__kernel void

nearest_neighbour(__global float4 *model,

__global float4 *dataset,

__global int *nearest,

const int model_size)

{

int g_dataset_id = get_global_id(0);

float4 qPoint = dataset[g_dataset_id];

int stack[7]; // 7 is enough for the number of points in my model

int top = 0;

stack[top] = -1;

int node = findLeaf(model, qPoint, model_size, 0);

int nn = node;

int lastChild, asse, otherSide;

float bestDist = distanza(qPoint, model[node]);

while(node != 0)

{

lastChild = node;

node = (node - 1) / 2;

if(stack[top] == node)

{

--top;

}

else

{

float parentDist = distanza(qPoint, model[node]);

if(parentDist < bestDist)

{

bestDist = parentDist;

nn = node;

}

asse = model[node].w;

float testDist = model[node][asse] - qPoint[asse];

testDist = testDist * testDist;

if(testDist < bestDist)

{

if (node*2+1 == lastChild)

{

otherSide = node*2+2;

}

else

{

otherSide = node*2+1;

}

if(otherSide < model_size && !isNull(model[otherSide]))

{

int testNode = findLeaf(model, qPoint, model_size, otherSide);

testDist = distanza(qPoint, model[testNode]);

if(testDist < bestDist)

{

bestDist = testDist;

nn = testNode;

}

++top;

stack[top] = node;

node = testNode;

}

}

}

}

nearest[g_dataset_id] = nn;

}

The code return the correct nearest neighbour (except for few points but this is not a problem right now). The real problem is that is slower on the GPU then on the CPU.

Results (model 100.000 points and 100.000 query points):

http://i.imgur.com/dEUcA.png

Also it crash on ATI 5850.

What can I do to make it faster? I was thinking about storing the model in local memory, but I don't know if I can store 100.000 float4 on it, also I'm not sure if it will improve performance.

Any suggestion?

Thanks

Stefano