I want to implement a complete pipeline mechanism in an OpenCL kernel (AMD GPU). For example, consider a simple producer/consumer problem. On the kernel side:
/*
 * Producer stage: each work-item copies one float from src to dst.
 *
 * src - input buffer, only read
 * dst - output buffer; element i receives src[i]
 *
 * The index is the dimension-0 global id and is not bounds-checked, so every
 * global id in the launch must be a valid index into both buffers.
 */
__kernel void produceer(__global const float *src, __global float *dst)
{
    /* get_global_id() returns size_t; keep the full width for the index. */
    const size_t gid = get_global_id(0);

    dst[gid] = src[gid];
}
/*
 * Consumer stage: each work-item copies one float from src to dst.
 *
 * src - input buffer, only read
 * dst - output buffer; element i receives src[i]
 *
 * No bounds check is performed: the dimension-0 global id of every work-item
 * in the launch must be a valid index into both buffers.
 */
__kernel void consumer(__global const float *src, __global float *dst)
{
    /* Work-item ids are size_t; storing them in an int would narrow them. */
    const size_t gid = get_global_id(0);

    dst[gid] = src[gid];
}
And on the host side:
for (int tr = 0; tr < 10; tr++)
{
host_input[0] = (tr+1) * 10;
ret = clEnqueueWriteBuffer(queue, inputBuff, CL_TRUE, 0, n*sizeof(cl_float), host_input, 0, NULL, &ev_writ);
ret = clSetKernelArg(proc, 0, sizeof(cl_mem), (void *)&inputBuff);
ret = clSetKernelArg(proc, 1, sizeof(cl_mem), (void *)&pipeBuff);
ret = clEnqueueNDRangeKernel(queue, proc, 1, NULL, global_work_size, local_work_size, 0, NULL, &ev_sfac);
clWaitForEvents(1, &ev_sfac);
fprintf(stdout, "\n input tr: %d , date: %f" , tr, host_input[0]);
}
for (int tr = 0; tr < 10; tr++)
{
ret = clSetKernelArg(conum, 0, sizeof(cl_mem), (void *)&pipeBuff);
ret = clSetKernelArg(conum, 1, sizeof(cl_mem), (void *)&outputBuff);
ret = clEnqueueNDRangeKernel(queue, conum, 1, NULL, global_work_size, local_work_size, 0, NULL, &ev_trns);
clWaitForEvents(1, &ev_trns);
ret = clEnqueueReadBuffer(queue, outputBuff, CL_TRUE, 0, n*sizeof(cl_float), host_ouput, 0, NULL, &ev_read);
fprintf(stdout, "\n output tr: %d , date: %f", tr, host_ouput[0]);
}
I get the following output:
input tr: 0 , date: 10.000000
input tr: 1 , date: 20.000000
input tr: 2 , date: 30.000000
input tr: 3 , date: 40.000000
input tr: 4 , date: 50.000000
input tr: 5 , date: 60.000000
input tr: 6 , date: 70.000000
input tr: 7 , date: 80.000000
input tr: 8 , date: 90.000000
input tr: 9 , date: 100.000000
output tr: 0 , date: 100.000000
output tr: 1 , date: 100.000000
output tr: 2 , date: 100.000000
output tr: 3 , date: 100.000000
output tr: 4 , date: 100.000000
output tr: 5 , date: 100.000000
output tr: 6 , date: 100.000000
output tr: 7 , date: 100.000000
output tr: 8 , date: 100.000000
output tr: 9 , date: 100.000000
However, I need the following output:
input tr: 0 , date: 10.000000
input tr: 1 , date: 20.000000
input tr: 2 , date: 30.000000
input tr: 3 , date: 40.000000
input tr: 4 , date: 50.000000
input tr: 5 , date: 60.000000
input tr: 6 , date: 70.000000
input tr: 7 , date: 80.000000
input tr: 8 , date: 90.000000
input tr: 9 , date: 100.000000
output tr: 0 , date: 10.000000
output tr: 1 , date: 20.000000
output tr: 2 , date: 30.000000
output tr: 3 , date: 40.000000
output tr: 4 , date: 50.000000
output tr: 5 , date: 60.000000
output tr: 6 , date: 70.000000
output tr: 7 , date: 80.000000
output tr: 8 , date: 90.000000
output tr: 9 , date: 100.000000