The following code is giving error in reading a global float4 buffer1 array from the kernel... the kernel code is below the host code...
Can float4 buffers be read from the device memory to the host memory...?
//Host Code
#include <iostream>
#include <fstream>
#include <sstream>
#include <string.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include "FreeImage.h"
///
// Create an OpenCL context on the first available platform using
// either a GPU or CPU depending on what is available.
//
cl_context CreateContext()
{
cl_int errNum;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
cl_context context = NULL;
// First, select an OpenCL platform to run on. For this example, we
// simply choose the first available platform. Normally, you would
// query for all available platforms and select the most appropriate one.
errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
std::cerr << "Failed to find any OpenCL platforms." << std::endl;
return NULL;
}
// Next, create an OpenCL context on the platform. Attempt to
// create a GPU-based context, and if that fails, try to create
// a CPU-based context.
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)firstPlatformId,
0
};
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cout << "Could not create GPU context, trying CPU..." << std::endl;
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
NULL, NULL, &errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
return NULL;
}
}
return context;
}
///
// Create a command queue on the first device available on the
// context
//
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
cl_int errNum;
cl_device_id *devices;
cl_command_queue commandQueue = NULL;
size_t deviceBufferSize = -1;
// First get the size of the devices buffer
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed call to clGetContextInfo(...,GL_CONTEXT_DEVICES,...)";
return NULL;
}
if (deviceBufferSize <= 0)
{
std::cerr << "No devices available.";
return NULL;
}
// Allocate memory for the devices buffer
devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Failed to get device IDs";
return NULL;
}
// In this example, we just choose the first available device. In a
// real program, you would likely use all available devices or choose
// the highest performance device based on OpenCL device queries
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
if (commandQueue == NULL)
{
std::cerr << "Failed to create commandQueue for device 0";
return NULL;
}
*device = devices[0];
delete [] devices;
return commandQueue;
}
///
// Create an OpenCL program from the kernel source file
//
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
cl_int errNum;
cl_program program;
std::ifstream kernelFile(fileName, std::ios::in);
if (!kernelFile.is_open())
{
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
return NULL;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource(context, 1,
(const char**)&srcStr,
NULL, NULL);
if (program == NULL)
{
std::cerr << "Failed to create CL program from source." << std::endl;
return NULL;
}
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errNum != CL_SUCCESS)
{
// Determine the reason for the error
char buildLog[16384];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
sizeof(buildLog), buildLog, NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
system("pause");
clReleaseProgram(program);
return NULL;
}
return program;
}
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem imageObjects[2],
cl_sampler sampler, cl_mem *memObject)
{
for (int i = 0; i < 2; i++)
{
if (imageObjects != 0)
clReleaseMemObject(imageObjects);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
if (sampler != 0)
clReleaseSampler(sampler);
if (context != 0)
clReleaseContext(context);
if(*memObject!=0)
clReleaseMemObject(*memObject);
}
///
// Load an image using the FreeImage library and create an OpenCL
// image out of it
//
cl_mem LoadImage(cl_context context, char *fileName, int &width, int &height)
{
FREE_IMAGE_FORMAT format = FreeImage_GetFileType(fileName, 0);
FIBITMAP* image = FreeImage_Load(format, fileName);
// Convert to 32-bit image
FIBITMAP* temp = image;
image = FreeImage_ConvertTo32Bits(image);
FreeImage_Unload(temp);
width = FreeImage_GetWidth(image);
height = FreeImage_GetHeight(image);
char *buffer = new char[width * height * 4];
memcpy(buffer, FreeImage_GetBits(image), width * height * 4);
FreeImage_Unload(image);
// Create OpenCL image
cl_image_format clImageFormat;
clImageFormat.image_channel_order = CL_RGBA;
clImageFormat.image_channel_data_type = CL_UNORM_INT8;
cl_int errNum;
cl_mem clImage;
clImage = clCreateImage2D(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
&clImageFormat,
width,
height,
0,
buffer,
&errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error creating CL image object" << std::endl;
return 0;
}
return clImage;
}
///
// Save an image using the FreeImage library
//
bool SaveImage(char *fileName, char *buffer, int width, int height)
{
FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(fileName);
FIBITMAP *image = FreeImage_ConvertFromRawBits((BYTE*)buffer, width,
height, width * 4, 32,
0xFF000000, 0x00FF0000, 0x0000FF00);
return (FreeImage_Save(format, image, fileName) == TRUE) ? true : false;
}
bool CreateMemObjects(cl_command_queue commandQueue,cl_context context, cl_mem *memObject,int width, int height)
{
*memObject = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(char) * width*height*4, NULL, NULL);
if (*memObject == NULL )
{
std::cerr << "Error creating memory objects." << std::endl;
return false;
}
return true;
}
///
// Round up to the nearest multiple of the group size
//
size_t RoundUp(int groupSize, int globalSize)
{
int r = globalSize % groupSize;
if(r == 0)
{
return globalSize;
}
else
{
return globalSize + groupSize - r;
}
}
///
// main() for HelloBinaryWorld example
//
int main(int argc, char** argv)
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem imageObjects[2] = { 0, 0 };
cl_mem memObject= 0;
cl_sampler sampler = 0;
cl_int errNum;
// if (argc != 3)
//{
// std::cerr << "USAGE: " << argv[0] << " <inputImageFile> <outputImageFiles>" << std::endl;
// system("pause");
// return 1;
// }
// Create an OpenCL context on first available platform
context = CreateContext();
if (context == NULL)
{
std::cerr << "Failed to create OpenCL context." << std::endl;
return 1;
}
// Create a command-queue on the first device available
// on the created context
commandQueue = CreateCommandQueue(context, &device);
if (commandQueue == NULL)
{
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Make sure the device supports images, otherwise exit
cl_bool imageSupport = CL_FALSE;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
&imageSupport, NULL);
if (imageSupport != CL_TRUE)
{
std::cerr << "OpenCL device does not support images." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Load input image from file and load it into
// an OpenCL image object
int width, height;
// size_t origin[3] = { 0, 0, 0 };
//size_t region[3] = { width, height, 1};
imageObjects[0] = LoadImage(context,"C:\\Users\\Shreedhar\\Documents\\Visual Studio 2010\\Projects\\ImageFilter3\\lena.bmp", width, height);
if (imageObjects[0] == 0)
{
std::cerr << "Error loading: " << std::string("lena") << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
system("Pause");
return 1;
}
size_t origin[3] = { 0, 0, 0 };
size_t region[3] = { width, height, 1};
if (!CreateMemObjects(commandQueue,context, &memObject, width, height))
{
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Create ouput image object
cl_image_format clImageFormat;
clImageFormat.image_channel_order = CL_RGBA;
clImageFormat.image_channel_data_type = CL_UNORM_INT8;
imageObjects[1] = clCreateImage2D(context,
CL_MEM_WRITE_ONLY,
&clImageFormat,
width,
height,
0,
NULL,
&errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error creating CL output image object." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Create sampler for sampling image object
sampler = clCreateSampler(context,
CL_FALSE, // Non-normalized coordinates
CL_ADDRESS_CLAMP_TO_EDGE,
CL_FILTER_NEAREST,
&errNum);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error creating CL sampler object." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Create OpenCL program
program = CreateProgram(context, device, "ImageFilter2D.cl");
if (program == NULL)
{
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Create OpenCL kernel
kernel = clCreateKernel(program, "gaussian_filter", NULL);
if (kernel == NULL)
{
std::cerr << "Failed to create kernel" << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Set the kernel arguments
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &imageObjects[1]);
errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
errNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &memObject);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error setting kernel arguments." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
size_t localWorkSize[2] = { 16, 16 };
size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width),
RoundUp(localWorkSize[1], height) };
// Queue the kernel up for execution
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
globalWorkSize, localWorkSize,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error queuing kernel for execution." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Read the output buffer back to the Host
unsigned int * buffer1= new unsigned int [width * height * 4];
char *buffer = new char [width * height * 4];
errNum = clEnqueueReadImage(commandQueue, imageObjects[1], CL_TRUE,
origin, region, 0, 0, buffer,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Read the output buffer back to the Host
errNum = clEnqueueReadBuffer(commandQueue, memObject, CL_TRUE,
0, 4*width*height * sizeof(float), buffer1,
0, NULL, NULL);
if (errNum != CL_SUCCESS)
{
std::cerr << "Error reading result buffer." << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 1;
}
// Output the result buffer
for ( unsigned int i = 0; i < 4*width*height; i++)
{
printf("%d=%u\n",i,buffer1);
}
std::cout << std::endl;
std::cout << "Executed program succesfully." << std::endl;
//memset(buffer, 0xff, width * height * 4);
// Save the image out to disk
if (!SaveImage("C:\\Users\\Shreedhar\\Documents\\Visual Studio 2010\\Projects\\ImageFilter3\\lenaout.bmp", buffer, width, height))
{
std::cerr << "Error writing output image: " <<"lenaout.bmp" << std::endl;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
delete [] buffer;
return 1;
}
delete [] buffer;
Cleanup(context, commandQueue, program, kernel, imageObjects, sampler,&memObject);
return 0;
}
//Kernel Code
__kernel void gaussian_filter(__read_only image2d_t srcImg,
__write_only image2d_t dstImg,
sampler_t sampler,
int width, int height, __global float4 *buffer1)
{
int x= get_global_id(0);
int y=get_global_id(1);
unsigned int id=width*x+y;
buffer1[id]=read_imagef(srcImg, sampler, (int2)(x, y)) ;
}
Solved! Go to Solution.
There is absolutely no problem in reading cl_float4 (or) for that matter .. any other data type from device to host (or) host to device....
Atually, readBuffer API is data-type independent. At the hardware level, it is all bytes... Thats all. Only procesing units and software enforce semantics saying that it is an array of floats, integer etc... FYI
In your code, it looks like you are allocating width*height*sizeof(char)
But you are reading width*height*sizeof(cl_float4)
Thats probably reason why you are failing..... You can just read width*heigth*sizeof(char) into a cl_float4 array...
And then access it. It should work.
There is absolutely no problem in reading cl_float4 (or) for that matter .. any other data type from device to host (or) host to device....
Atually, readBuffer API is data-type independent. At the hardware level, it is all bytes... Thats all. Only procesing units and software enforce semantics saying that it is an array of floats, integer etc... FYI
In your code, it looks like you are allocating width*height*sizeof(char)
But you are reading width*height*sizeof(cl_float4)
Thats probably reason why you are failing..... You can just read width*heigth*sizeof(char) into a cl_float4 array...
And then access it. It should work.
Yeah, the code worked with your solution... but when I read the buffer array, it gives me 10 digit number for every pixel value... why is it so ...? cause the intensity range is just from 0 to 255...!
and how are these intensities arranged in the buffer, I mean is it R,,G, B of the first position in a sequence , then the R G B of the next position or is it R values of all positions firstly in sequence, then the G and then B...?
please reply Himanshu...!
Your best bet is to read through the relevant areas in the spec for sometime to understand things clearly -- so that you have some perspective.
For e.g., for your current case - If you search around the spec for CL_RGBA -- You can hit the explanation of the memory layout in Page 94 (openCL 1.2 PDF).
HTH
I have the same problem
chm289,
Actually, the problem reported by the "Shreedhar" was related to the code -- where the size of allocation was not proper.
Can you explain more what is not working for you?