Hi all,
I'm trying to use clEnqueueReadBufferRect to read back a sub-matrix (for use with clBlas), but I can't get the region parameter to work for this: despite being an array of three size_t values, it always copies a contiguous region.
eg. the below example - compiled against the current version of APP on xubuntu 12 / amd64 :
#include <stdio.h>
#include <CL/cl.h>
#define sizeX 4
#define sizeY 4
int main() {
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_context_properties contextProperties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
cl_context context = clCreateContextFromType( contextProperties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
cl_device_id device;
clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL);
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
// Create a buffer of 4*4 ints
int A[sizeX * sizeY] = { 11, 12, 13, 14,
21, 22, 23, 24,
31, 32, 33, 34,
41, 42, 43, 44, };
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(*A)*sizeX*sizeY, NULL, NULL);
cl_uint error = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(*A)*sizeX*sizeY, A, 0, NULL, NULL);
int iX,iY;
int regionSizeX=3;
int regionSizeY=3;
// Try and read back a region of the buffer
size_t bufferOffset[] = { 0, 0, 0 };
size_t hostOffset[] = { 0, 0, 0 };
size_t region[] = { regionSizeX * sizeof(*A), regionSizeY, 1 };
size_t bufferRowPitch = sizeX*sizeof(*A);
size_t bufferSlicePitch = 0;
size_t hostRowPitch = sizeX*sizeof(*A);
size_t hostSlicePitch = 0;
if (error != CL_SUCCESS) {
printf("Error %d creating buffer.\n",error);
return 0;
}
// Change it so as to be clear what was read back
for (iX=0;iX<sizeX*sizeY;++iX)
A[iX] *= -1;
error = clEnqueueReadBufferRect( queue, mem, CL_TRUE, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, A, 0, NULL, NULL);
if (error != CL_SUCCESS) {
printf("Error %d reading back.\n",error);
return 0;
}
for (iY=0;iY<sizeY;++iY) {
printf("\t[ ");
for (iX=0;iX<sizeX;++iX)
printf("%d,\t",A[iX+(iY*sizeX)]);
printf("]\n");
}
return 0;
}
this produces the output:
[ 11, 12, 13, 14, ]
[ 21, 22, 23, 24, ]
[ 31, -32, -33, -34, ]
[ -41, -42, -43, -44, ]
where I would have expected
[ 11, 12, 13, -14, ]
[ 21, 22, 23, -24, ]
[ 31, 32, 33, -34, ]
[ -41, -42, -43, -44, ]
Is this the intended behaviour? If so, why have the size_t array for the region variable?
Solved! Go to Solution.
The runtime didn't handle Read/WriteRect with the pitch value for the DMA engine. The issue will be fixed in the new release. As a temporary workaround, try to force the allocation to host memory: add the CL_MEM_USE_HOST_PTR flag when you create the buffer (you will need to allocate extra system memory and align it to 4K). Please note that your application will have slow kernel executions, but you can test the functionality. Please don't forget to remove the flag and the extra allocated memory with the new drivers.
The runtime didn't handle Read/WriteRect with the pitch value for the DMA engine. The issue will be fixed in the new release. As a temporary workaround, try to force the allocation to host memory: add the CL_MEM_USE_HOST_PTR flag when you create the buffer (you will need to allocate extra system memory and align it to 4K). Please note that your application will have slow kernel executions, but you can test the functionality. Please don't forget to remove the flag and the extra allocated memory with the new drivers.
Yes, that works!
I'm actually using a loop to copy the rows individually with normal ReadBuffer/WriteBuffer for now, but I'll watch out for the new release.