// A[M][N] * B[N][P] = C[M][P]
/*
 * Integer matrix product C = A * B using only global memory.
 *
 * Each work-item of dimension 0 computes one full row of C.
 *
 * Mdim  number of rows of A and C
 * Ndim  number of columns of A / rows of B
 * Pdim  number of columns of B and C
 * A     input,  Mdim x Ndim, indexed row-major (row stride Ndim)
 * B     input,  Ndim x Pdim, indexed row-major (row stride Pdim)
 * C     output, Mdim x Pdim, indexed row-major (row stride Pdim)
 *
 * Work-items whose global id is >= Mdim (padding of the global size) do
 * nothing.
 */
kernel void mult_mem_global
(
const int Mdim,
const int Ndim,
const int Pdim,
global int *A,
global int *B,
global int *C
)
{
    int k, j, tmp;
    int i = get_global_id(0);

    /* Valid rows are 0 .. Mdim-1; row Mdim itself is already out of range. */
    if (i >= Mdim) return;

    for (j = 0; j < Pdim; j++) {
        tmp = 0;
        /*
         * The row stride of each matrix is its own column count:
         * Ndim for A, Pdim for B and C. Using Mdim/Ndim here is only
         * correct when the matrices happen to be square.
         */
        for (k = 0; k < Ndim; k++)
            tmp += A[i*Ndim + k] * B[k*Pdim + j];
        C[i*Pdim + j] = tmp;
    }
}
/*
 * Integer matrix product C = A * B, caching operands close to the work-item.
 *
 * Each work-item of dimension 0 computes one full row of C. It first copies
 * its row of A into a private array; then, for every column j of B, the
 * work-group cooperatively copies that column into local_B_column and each
 * work-item computes the dot product from the cached data.
 *
 * Mdim            number of rows of A and C
 * Ndim            number of columns of A / rows of B
 * Pdim            number of columns of B and C
 * A               input,  Mdim x Ndim, indexed row-major (row stride Ndim)
 * B               input,  Ndim x Pdim, indexed row-major (row stride Pdim)
 * C               output, Mdim x Pdim, indexed row-major (row stride Pdim)
 * local_B_column  work-group scratch buffer supplied by the host; it is
 *                 indexed 0 .. Ndim-1, so it must hold at least Ndim ints
 *
 * Work-items whose global id is >= Mdim (padding of the global size) write
 * nothing to C but still take part in loading B and in every barrier.
 */
kernel void mult_mem_private_local
(
const int Mdim,
const int Ndim,
const int Pdim,
global int *A,
global int *B,
global int *C,
local int *local_B_column
)
{
    /* Capacity of the private row cache; must be a compile-time constant. */
    enum { PRIVATE_A_CAPACITY = 1000 };

    int k, j, tmp;
    int i = get_global_id(0);
    int iloc = get_local_id(0);
    int nloc = get_local_size(0);
    int private_A_line[PRIVATE_A_CAPACITY];

    /*
     * Do not return early for out-of-range rows: every work-item of the
     * work-group has to reach the barriers below, so the row check only
     * guards the accesses to A and C.
     */
    const int row_in_range = (i < Mdim);

    /* Cache at most PRIVATE_A_CAPACITY elements; the rest stays in global. */
    const int ncached = (Ndim < PRIVATE_A_CAPACITY) ? Ndim : PRIVATE_A_CAPACITY;

    /* private memory */
    if (row_in_range) {
        for (k = 0; k < ncached; k++)
            private_A_line[k] = A[i*Ndim + k];
    }

    for (j = 0; j < Pdim; j++) {
        /* Work-items of the group share the copy of column j of B. */
        for (k = iloc; k < Ndim; k += nloc)
            local_B_column[k] = B[k*Pdim + j];

        /* Column j must be completely loaded before anyone reads it. */
        barrier(CLK_LOCAL_MEM_FENCE);

        if (row_in_range) {
            tmp = 0;
            for (k = 0; k < ncached; k++)
                tmp += private_A_line[k] * local_B_column[k];
            /* Elements beyond the private cache are read from A directly. */
            for (; k < Ndim; k++)
                tmp += A[i*Ndim + k] * local_B_column[k];
            C[i*Pdim + j] = tmp;
        }

        /*
         * Everyone must be done reading column j before the next iteration
         * overwrites local_B_column with column j+1.
         */
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}