"""Launch a dummy OpenCL kernel over a rows x rows NDRange and count the
non-zero entries written back (every work-item writes 1, so the count
should equal rows * rows on a working platform/driver)."""
import sys
import struct
import pyopencl as cl
import numpy

# Work-group edge length; must match BLOCK_SIZE in the kernel source below.
block_size = 16
matrixLength = 3101104
# rows must be a multiple of block_size so the global size divides evenly
# into (block_size, block_size) work-groups (3344 = 209 * 16).
rows = 3344

# Host-side inputs (all zeros here; this script only exercises the launch path).
row2width = numpy.zeros(rows, numpy.int32)
row2startIdx = numpy.zeros(rows, numpy.int32)
matrix = numpy.zeros(matrixLength, numpy.int32)

pl = cl.get_platforms()
devs = pl[0].get_devices(cl.device_type.GPU)
# The launch uses a 2-D local size of block_size x block_size work-items,
# so the total (block_size**2), not the edge length, is what must fit
# within the device's MAX_WORK_GROUP_SIZE.
if block_size * block_size > devs[0].get_info(cl.device_info.MAX_WORK_GROUP_SIZE):
    print("Error: block_size is larger than MAX_WORK_GROUP_SIZE...")
    sys.exit(1)

ctx = cl.Context(devs)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

src = """
// Thread block size; must match the host-side block_size.
#define BLOCK_SIZE 16

__kernel void test(__global int* C, int CSize, __global int* A,
                   __global int* rowWidths, __global int* rowStartIdxs)
{
    int bi = get_group_id(0);
    int bj = get_group_id(1);
    int ti = get_local_id(0);
    int tj = get_local_id(1);
    int rowAIdx = bi * BLOCK_SIZE + ti;
    int rowBIdx = bj * BLOCK_SIZE + tj;
    int cOut = 1;

    // __local variables must be declared at kernel-function scope
    // (OpenCL C 1.x spec 6.5.2), not inside a loop body.
    __local int As[BLOCK_SIZE][BLOCK_SIZE];
    __local int Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Busy-work loop: repeatedly fill local memory and synchronize.
    // The barrier is executed uniformly by all work-items, as required.
    for (int x = 0; x < 1000; x++) {
        As[ti][tj] = 1;
        Bs[ti][tj] = 1;
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Each work-item marks its output cell.
    C[rowBIdx * CSize + rowAIdx] = cOut;
}
"""
prg = cl.Program(ctx, src).build()

# The host arrays are already int32 ndarrays, so they can be used as
# hostbuf directly -- no redundant numpy.array(...).astype(...) copies.
matrix_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=matrix)
row2width_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=row2width)
row2startIdx_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=row2startIdx)

o = numpy.zeros(rows * rows, numpy.int32)
o_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o)

# Modern PyOpenCL kernel-call signature: (queue, global_size, local_size, *args).
# Scalar kernel arguments are passed as numpy scalars rather than
# struct.pack()-ed bytes, which is the deprecated idiom.
prg.test(queue, (rows, rows), (block_size, block_size),
         o_buf, numpy.int32(rows), matrix_buf, row2width_buf, row2startIdx_buf)

# enqueue_copy supersedes the deprecated enqueue_read_buffer.
cl.enqueue_copy(queue, o, o_buf).wait()

nonzero_idx = numpy.nonzero(o)
print(len(nonzero_idx[0]))