Hi all,
i worked with ati stream and brook+ for 1 year.
Now I'm trying to work with OpenCL. My problem is that the CL kernel runs, but I don't get the output. I tried using CPU and GPU, and I always have this problem.
My CL code is a simple addition:
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
/* Element-wise addition: each work-item adds one element of a and b.
 * BUG FIX: the original computed tid but then wrote c[0]=a[0]+b[0],
 * so with a global work size > 1 every work-item raced on element 0. */
__kernel void add(__global int *a, __global int *b, __global int *c){
    size_t tid = get_global_id(0);
    c[tid] = a[tid] + b[tid];
}
And this is my hostcode using c++ bindings:
#include <cstdio>
#define alloca _alloca
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
#include <CL\cl.hpp>
int main(int argc, char** argv){
try{
//get CL platform info
std::vector platforms;
cl:: Platform::get(&platforms);
std::vector::iterator i;
if(platforms.size() > 0)
{
for(i = platforms.begin(); i != platforms.end(); ++i)
{
if(!strcmp((*i).getInfo().c_str(), "Advanced Micro Devices, Inc."))
{
break;
}
}
}
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 };
//creating CL context
cl::Context context(CL_DEVICE_TYPE_GPU,cps);
//get CL device list
std::vector devices = context.getInfo();
//create CommandQuere
cl::CommandQueue queue(context, devices[0]);
//reading CL file
std::ifstream file("HelloCL_Kernels.cl");
std::string prog(std::istreambuf_iterator(file), (std::istreambuf_iterator()));
cl:: Program::Sources source(1, std::make_pair(prog.c_str(), prog.length()+1));
// create CL program
cl:: Program* pProgram=new cl:: Program(context, source);
cl:: Program& program = *pProgram;
program.build(devices);
//create CL kernel
cl::Kernel kernel(program, "add");
//create Host mem objects
int wsize=1;
float *hostIn1=new float[wsize];
int *hostIn2=new int[wsize];
int *hostOut=new int[wsize];
hostIn1[0]=1;
hostIn1[0]=5;
//create CL mem objects
cl::Buffer clIn1(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*wsize,&hostIn1,NULL);
cl::Buffer clIn2(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*wsize,&hostIn2,NULL);
cl::Buffer clOut(context,CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*wsize,&hostOut,NULL);
//write mem objects to buffer
queue.enqueueWriteBuffer(clIn1,CL_TRUE,0,sizeof(float)*wsize,hostIn1,NULL,NULL);
queue.enqueueWriteBuffer(clIn2,CL_TRUE,0,sizeof(float)*wsize,hostIn2,NULL,NULL);
//set kernelargs
kernel.setArg(0,clIn1);
kernel.setArg(1,clIn2);
kernel.setArg(2,clOut);
//run CL program
queue.finish();
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(4,4), cl::NDRange(2, 2));
//read buffer
queue.enqueueReadBuffer(clOut,CL_TRUE,0,sizeof(float)*wsize,hostOut,NULL,NULL);
queue.finish();
std::cout<<<hostOut[0]<<std::endl;
delete pProgram;
}
catch(cl::Error err){
std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl;
}
return EXIT_SUCCESS;
}
Does anybody know why I don't get the addition's result, but 0?
Sheeep,
Your wsize=1 but your globalWorkSize=16. You are accessing out-of-bounds memory.
Please make it globalWorkSize=1 and localWorkGroupSize=1.
Hi,
i change:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1), cl::NDRange(1));
so i have global worksize 1 and wsize is 1, too. is that right?
and:
int *hostIn2=new int[wsize];
int *hostOut=new int[wsize];
to float.
but i have the same problem.
Can you please explain my work-size problem? I don't really understand it at the moment.
Sheeep,
I am not able to compile your code. Please use HelloCL sample as starting point.
I hope you are using latest OpenCL SDK. You will find latest SDK at http://developer.amd.com/gpu/ATIStreamSDK/Pages/default.aspx
ok, i try it.
But i can compile it, using vs2008 and ati stream sdk 2.0 final
the kernel is running, the only problem is getting result back.
What error does the compiler give back?
EDIT:
The Hello CL example use queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(4, 4), cl::NDRange(2, 2));
So I have a total worksize of 16 and a workgroupsize of 4? And the data-dimensions are NullRange? Why that?
Merry Christmas
Sheeep
Ok, I tried to create a OpenCL Program, using the HelloCL example:
I get this host code:
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#include <CL\cl.hpp>
#include <ctime>
// Abort the program with a diagnostic message if an OpenCL call failed.
//   err  - status code returned by the OpenCL runtime
//   name - label identifying the failing call, printed with the error code
inline void checkErr(cl_int err, const char * name){
    if (err != CL_SUCCESS) {
        std::cerr << "ERROR: " << name << "(" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
} // fixed: removed stray ';' after the function body
int main(int argc, char** argv){
//error code
cl_int error;
//get CL platform info
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::vector<cl::Platform>::iterator i;
if(platforms.size() > 0)
{
for(i = platforms.begin(); i != platforms.end(); ++i)
{
if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
{
break;
}
}
}
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 };
//Creating CL Device;
cl::Context context=cl::Context(CL_DEVICE_TYPE_GPU,cps,NULL,NULL,&error);
if(error==CL_SUCCESS){
std::cout<<"Using GPU"<<std::endl;
}
if(error!=CL_SUCCESS){
context=cl::Context(CL_DEVICE_TYPE_CPU,cps,NULL,NULL,&error);
std::cout<<"Using CPU"<<std::endl;
}
//getting Device List
std::vector<cl::Device> devices=context.getInfo<CL_CONTEXT_DEVICES>();
//creating CommandQuere
cl::CommandQueue quere=cl::CommandQueue(context,devices[0]);
//Reading CL Programm from file
std::ifstream file("Testkernel.cl"); //Kernelname
std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>()));
cl::Program::Sources source(1,std::make_pair(prog.c_str(), prog.length()+1));
//Building CL Programm for Device
cl::Program program=cl::Program(context,source,&error);
checkErr(error,"Error building Programm");
program.build(devices);
//finally Kernels:
cl::Kernel kernel=cl::Kernel(program,"add",&error); //CL_Programm, Funktions-/Kernelname, errorcode
//checkErr(error,"Error setting Kernel");
//Suche Fehler:
std::string fehler=program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
//std::cout<<fehler<<std::endl;
int wsize=1;
float *a=new float[1];
float *b=new float[1];
float *c=new float[wsize];
a[0]=1;
b[0]=1;
//c[0]=0.0f;
//initialing OpenCL Buffer(MemoryObjects)
cl::Buffer CL1=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,wsize,a,&error);
cl::Buffer CL2=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,wsize,b,&error);
cl::Buffer CL3=cl::Buffer(context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,wsize,c,&error);
//setting Kernel Arguments
kernel.setArg(0,CL1);
kernel.setArg(1,CL2);
kernel.setArg(2,CL3);
//write mCL mem to device
quere.enqueueWriteBuffer(CL1,CL_TRUE,0,wsize,a);
quere.enqueueWriteBuffer(CL2,CL_TRUE,0,wsize,b);
//Running Kernel
clock_t time;
time=clock();
quere.enqueueNDRangeKernel(kernel,cl::NDRange(),cl::NDRange(wsize,1,1),cl::NDRange(wsize,1,1),NULL,NULL); //quere.enqueueNDRangeKernel(kernelname,cl::NullRange,cl::NDRange(arraylänge),cl::NDRange(1,1),NULL,NULL);
quere.enqueueReadBuffer (CL3,CL_TRUE,0,wsize,c);
time=clock()-time;
//Ausgabe:
std::cout<<"Ergebnis GPU: "<<std::endl;
for(int i=0;i<wsize;i++){
std::cout<<" "<<c<<std::endl;
}
std::cout<<"Zeit GPU: "<<time<<std::endl;
return 0;
}
using this kernel:
/* Element-wise vector addition: c[i] = a[i] + b[i], one element per work-item. */
__kernel void add(__global const float *a,
                  __global const float *b,
                  __global float *c){
    const int gid = get_global_id(0);
    c[gid] = a[gid] + b[gid];
}
I can run it on CPU, but not on GPU. Does anybody know why?
My GPU is a ATI Radeon 4870, typ MSI 4870OC Edition.
Sheeep,
There are problems in the code: in many places you are specifying wsize instead of sizeof(float) * wsize.
See attached code. I have modified code which gives same results both on CPU and GPU
#include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <vector> #include <CL\cl.hpp> #include <ctime> inline void checkErr(cl_int err, const char * name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "(" << err << ")" << std::endl; exit(EXIT_FAILURE); } }; int main(int argc, char** argv){ //error code cl_int error; //get CL platform info std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; //Creating CL Device; cl::Context context=cl::Context(CL_DEVICE_TYPE_GPU,cps,NULL,NULL,&error); if(error==CL_SUCCESS){ std::cout<<"Using GPU"<<std::endl; } if(error!=CL_SUCCESS){ context=cl::Context(CL_DEVICE_TYPE_CPU,cps,NULL,NULL,&error); std::cout<<"Using CPU"<<std::endl; } //getting Device List std::vector<cl::Device> devices=context.getInfo<CL_CONTEXT_DEVICES>(); //creating CommandQuere cl::CommandQueue quere=cl::CommandQueue(context,devices[0]); //Reading CL Programm from file std::ifstream file("HelloCL_Kernels.cl"); //Kernelname std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>())); cl::Program::Sources source(1,std::make_pair(prog.c_str(), prog.length()+1)); //Building CL Programm for Device cl::Program program=cl::Program(context,source,&error); checkErr(error,"Error building Programm"); program.build(devices); //finally Kernels: cl::Kernel kernel=cl::Kernel(program,"add",&error); //CL_Programm, Funktions-/Kernelname, errorcode //checkErr(error,"Error setting Kernel"); //Suche Fehler: std::string fehler=program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); //std::cout<<fehler<<std::endl; int wsize=1; float *a=new float[1]; float *b=new float[1]; float *c=new 
float[wsize]; a[0]=1; b[0]=1; //c[0]=0.0f; //initialing OpenCL Buffer(MemoryObjects) cl::Buffer CL1=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,sizeof(float) * wsize,a,&error); cl::Buffer CL2=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,sizeof(float) * wsize,b,&error); cl::Buffer CL3=cl::Buffer(context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,sizeof(float) * wsize,c,&error); //setting Kernel Arguments kernel.setArg(0,CL1); kernel.setArg(1,CL2); kernel.setArg(2,CL3); //write mCL mem to device quere.enqueueWriteBuffer(CL1,CL_TRUE,0,sizeof(float) * wsize,a); quere.enqueueWriteBuffer(CL2,CL_TRUE,0,sizeof(float) * wsize,b); //Running Kernel clock_t time; time=clock(); cl_int err = quere.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(wsize,1,1),cl::NDRange(wsize,1,1),NULL,NULL); //quere.enqueueNDRangeKernel(kernelname,cl::NullRange,cl::NDRange(arraylänge),cl::NDRange(1,1),NULL,NULL); if (err != CL_SUCCESS) { std::cerr << "CommandQueue::enqueueNDRangeKernel()" \ " failed (" << err << ")\n"; return -1; } err = quere.enqueueReadBuffer (CL3,CL_TRUE,0,sizeof(float) * wsize,c); if (err != CL_SUCCESS) { std::cerr << "enqueueReadBuffer" \ " failed (" << err << ")\n"; return -1; } time=clock()-time; for(int i=0;i<wsize;i++){ std::cout<<" "<<c<<std::endl; } return 0; }
Thank you for you help.
Running the Code I get the error: CommandQueue::enqueueNDRangeKernel() failed (-48)
Do you know what it is?
Is there a list of OpenCL error code?
lg Sheeep
EDIT:
Sry, my error, it is working! Thank you very much for your help.
Sheeep,
It seems you have already found the problem: change HelloCL_Kernels.cl to Testkernel.cl.
For each error code there is a preprocessor definition in the cl.h file. Read the particular function's description in the spec to find the cause of a particular error code.