cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

Sheeep
Journeyman III

reading data from CL Kernel

Hi all,

I have worked with ATI Stream and Brook+ for one year.

Now I'm trying to work with OpenCL. My problem is that the CL kernel runs, but I don't get the output. I tried both the CPU and the GPU, and I always have this problem.

My CL code is a simple addition:

 

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
// add: stores a[0] + b[0] into c[0].
// Every work-item that runs this kernel performs the same store to element 0;
// no other element of a, b or c is read or written.
__kernel void add(__global int *a,__global int *b,__global int *c){
    // Index of this work-item in dimension 0. It is computed here but not used below.
    size_t tid = get_global_id(0);

        // The index is the constant 0 rather than tid, so the result does not
        // depend on which work-item executes the statement.
        c[0]=a[0]+b[0];
   
}

 

 

And this is my hostcode using c++ bindings:

 

#include

#define alloca _alloca
#include
#include
#include
#include
#include
#include

// Host program for the `add` kernel: looks for the OpenCL platform whose vendor
// string is "Advanced Micro Devices, Inc.", creates a GPU context on it, builds
// HelloCL_Kernels.cl, runs the kernel once and prints hostOut[0].
// Anything caught as cl::Error is printed to std::cerr; the function returns
// EXIT_SUCCESS in either case.
// NOTE(review): as listed, this does not compile. The std::vector and
// std::istreambuf_iterator declarations carry no template arguments, the
// getInfo() calls carry no query name, and the output statement uses `<<<`.
// Presumably the angle-bracket parts were lost when the post was rendered.
int main(int argc, char** argv){
    try{
        //get CL platform info
        std::vector platforms;
            cl:: Platform::get(&platforms);
              std::vector::iterator i;
            if(platforms.size() > 0)
            {
                // Stop at the first platform whose vendor string matches AMD.
                for(i = platforms.begin(); i != platforms.end(); ++i)
                {
                    if(!strcmp((*i).getInfo().c_str(), "Advanced Micro Devices, Inc."))
                    {
                        break;
                    }
                }
            }
         // NOTE(review): i is dereferenced here even when the loop above ran to
         // platforms.end() without a match, or was skipped because the list is empty.
         cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 };

        //creating CL context
        cl::Context context(CL_DEVICE_TYPE_GPU,cps);   
   
        //get CL device list
        std::vector devices = context.getInfo();

        //create CommandQueue on the first device of the context
         cl::CommandQueue queue(context, devices[0]);

        //reading CL file
        // The whole file is read into one string; whether the open succeeded is not checked.
        std::ifstream file("HelloCL_Kernels.cl");
        std::string prog(std::istreambuf_iterator(file), (std::istreambuf_iterator())); 
        cl:: Program::Sources source(1, std::make_pair(prog.c_str(), prog.length()+1));

        // create CL program
        // The program object is heap-allocated and released by `delete pProgram` at the
        // end of the try block; that delete is skipped when an exception is thrown first.
        cl:: Program* pProgram=new cl:: Program(context, source);
        cl:: Program&    program = *pProgram;
        program.build(devices);

        //create CL kernel
        cl::Kernel kernel(program, "add");
       
        //create Host mem objects
        // NOTE(review): hostIn1 is a float array while hostIn2 and hostOut are int
        // arrays; the `add` kernel shown above takes three int pointers.
        int wsize=1;
        float *hostIn1=new float[wsize];
        int *hostIn2=new int[wsize];
        int *hostOut=new int[wsize];
        // NOTE(review): hostIn1[0] is assigned twice; hostIn2[0] is never assigned.
        // Presumably the second line was meant to set hostIn2[0].
        hostIn1[0]=1;
        hostIn1[0]=5;

        //create CL mem objects
        // NOTE(review): &hostIn1, &hostIn2 and &hostOut are the addresses of the pointer
        // variables themselves, not of the arrays they point to. All three sizes use
        // sizeof(float), although two of the host arrays are int arrays.
        cl::Buffer clIn1(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*wsize,&hostIn1,NULL);
        cl::Buffer clIn2(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*wsize,&hostIn2,NULL);
        cl::Buffer clOut(context,CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*wsize,&hostOut,NULL);

        //write mem objects to buffer
        // Blocking writes (CL_TRUE) of the two input arrays; here the array pointers
        // themselves are passed, unlike in the buffer constructors above.
        queue.enqueueWriteBuffer(clIn1,CL_TRUE,0,sizeof(float)*wsize,hostIn1,NULL,NULL);
        queue.enqueueWriteBuffer(clIn2,CL_TRUE,0,sizeof(float)*wsize,hostIn2,NULL,NULL);
       

        //set kernelargs
        kernel.setArg(0,clIn1);
        kernel.setArg(1,clIn2);
        kernel.setArg(2,clOut);
       
        //run CL program
        // Global range 4x4 with work-groups of 2x2, while the host arrays hold
        // wsize (1) element each.
        queue.finish();
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(4,4), cl::NDRange(2, 2));
       
        //read buffer
        // Blocking read (CL_TRUE) of the output buffer back into hostOut.
        queue.enqueueReadBuffer(clOut,CL_TRUE,0,sizeof(float)*wsize,hostOut,NULL,NULL);

        queue.finish();

        // NOTE(review): `<<<` is not a valid operator sequence; presumably `<<` was meant.
        std::cout<<<hostOut[0]<<std::endl;

        // hostIn1, hostIn2 and hostOut are not released anywhere in this function.
        delete pProgram;
       
    }
    catch(cl::Error err){
     // Report what() and the numeric code carried by the exception.
     std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl;
    }

    return EXIT_SUCCESS;
}

Does anybody know why I get 0 instead of the result of the addition?

 

0 Likes
8 Replies
genaganna
Journeyman III

Sheeep,

   Your wsize=1 but your globalWorkSize=16. You are accessing out-of-bounds memory.

  Please set globalWorkSize=1 and localWorkGroupSize=1.

 

0 Likes

Hi,

i change:

queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1), cl::NDRange(1));

So I have a global work size of 1, and wsize is 1, too. Is that right?

and:

int *hostIn2=new int[wsize];
int *hostOut=new int[wsize];

to float.

but i have the same problem.

 

Can you please explain my work-size problem? I don't really understand it at the moment.

0 Likes

Sheeep,

        I am not able to compile your code.  Please use the HelloCL sample as a starting point.

       I hope you are using the latest OpenCL SDK.  You will find the latest SDK at http://developer.amd.com/gpu/ATIStreamSDK/Pages/default.aspx

0 Likes

ok, i try it.

But i can compile it, using vs2008 and ati stream sdk 2.0 final

the kernel is running, the only problem is getting result back.

 

What error does the compiler give you?

EDIT:

The Hello CL example use  queue.enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(4, 4), cl::NDRange(2, 2));

 

So I have a total work size of 16 and a work-group size of 4? And the data dimensions are NullRange? Why is that?

 

 

Merry Christmas

Sheeep

0 Likes

Ok, I tried to create a OpenCL Program, using the HelloCL example:

 

I get this host code:

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#include <CL\cl.hpp>
#include <ctime>

// Terminates the program when an OpenCL status code signals failure.
//   err  - status code to test against CL_SUCCESS
//   name - text printed in front of the numeric code on failure
// On failure, writes "ERROR: <name>(<err>)" to std::cerr and exits with
// EXIT_FAILURE; on success it returns without doing anything.
inline void checkErr(cl_int err, const char * name){
    // Success path first: nothing to report.
    if (err == CL_SUCCESS) {
        return;
    }
    std::cerr << "ERROR: " << name << "(" << err << ")" << std::endl;
    exit(EXIT_FAILURE);
}
// Host program for the `add` kernel in Testkernel.cl: looks for the platform
// whose vendor string is "Advanced Micro Devices, Inc.", creates a GPU context
// (or a CPU context when the GPU one fails), builds the program, runs the
// kernel over wsize elements and prints the result and the elapsed clock() ticks.
int main(int argc, char** argv){
    //error code
    cl_int error;
    //get CL platform info
    std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
          std::vector<cl::Platform>::iterator i;
        if(platforms.size() > 0)
        {
            // Stop at the first platform whose vendor string matches AMD.
            for(i = platforms.begin(); i != platforms.end(); ++i)
            {
                if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
                {
                    break;
                }
            }
        }
     // NOTE(review): i is dereferenced here even when the loop above ran to
     // platforms.end() without a match, or was skipped because the list is empty.
     cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 };
    //Creating CL Device;
    // Try a GPU context first; `error` receives the status of the constructor.
    cl::Context context=cl::Context(CL_DEVICE_TYPE_GPU,cps,NULL,NULL,&error);
    if(error==CL_SUCCESS){
        std::cout<<"Using GPU"<<std::endl;
    }
    // Fall back to a CPU context. "Using CPU" is printed without looking at the
    // status of this second attempt.
    if(error!=CL_SUCCESS){
        context=cl::Context(CL_DEVICE_TYPE_CPU,cps,NULL,NULL,&error);
        std::cout<<"Using CPU"<<std::endl;
    }
    //getting Device List
    std::vector<cl::Device> devices=context.getInfo<CL_CONTEXT_DEVICES>();
    //creating CommandQueue on the first device of the context
    cl::CommandQueue quere=cl::CommandQueue(context,devices[0]);
    //Reading CL program from file
    // The whole file is read into one string; whether the open succeeded is not checked.
    std::ifstream file("Testkernel.cl");    //kernel file name
    std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>()));
    cl::Program::Sources source(1,std::make_pair(prog.c_str(), prog.length()+1));
    //Building CL program for device
    // checkErr is applied to the status of constructing the cl::Program; the result
    // of program.build() on the following line is discarded.
    cl::Program program=cl::Program(context,source,&error);
    checkErr(error,"Error building Programm");
    program.build(devices);
    //finally the kernel:
    cl::Kernel kernel=cl::Kernel(program,"add",&error);    //CL program, function/kernel name, error code
    //checkErr(error,"Error setting Kernel");
    //look for errors: the build log is fetched, but printing it is commented out
    std::string fehler=program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
    //std::cout<<fehler<<std::endl;
    // Host data: one float per array (wsize is 1). None of the three arrays is
    // released anywhere in this function.
    int wsize=1;
    float *a=new float[1];
    float *b=new float[1];
    float *c=new float[wsize];
    a[0]=1;
    b[0]=1;

    //c[0]=0.0f;
    //initializing OpenCL buffers (memory objects)
    // NOTE(review): the size argument passed here is wsize, i.e. 1, while the host
    // arrays hold wsize floats; presumably sizeof(float) * wsize was intended.
    cl::Buffer CL1=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,wsize,a,&error);
    cl::Buffer CL2=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,wsize,b,&error);
    cl::Buffer CL3=cl::Buffer(context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,wsize,c,&error);
   
    //setting Kernel Arguments
    kernel.setArg(0,CL1);
    kernel.setArg(1,CL2);
    kernel.setArg(2,CL3);
    //write CL mem to device
    // Blocking writes (CL_TRUE); the same wsize value is used as the size here.
    quere.enqueueWriteBuffer(CL1,CL_TRUE,0,wsize,a);
    quere.enqueueWriteBuffer(CL2,CL_TRUE,0,wsize,b);

   
    //Running Kernel
    // clock() is sampled before the launch and again after the blocking read below.
    clock_t time;
    time=clock();
    // Global and local range are both (wsize,1,1). The return values of the launch
    // and of the read-back are not examined.
    quere.enqueueNDRangeKernel(kernel,cl::NDRange(),cl::NDRange(wsize,1,1),cl::NDRange(wsize,1,1),NULL,NULL); //quere.enqueueNDRangeKernel(kernel name,cl::NullRange,cl::NDRange(array length),cl::NDRange(1,1),NULL,NULL);
   
    quere.enqueueReadBuffer (CL3,CL_TRUE,0,wsize,c);
    time=clock()-time;
    //output:
    std::cout<<"Ergebnis GPU: "<<std::endl;
    for(int i=0;i<wsize;i++){
        // NOTE(review): this streams the pointer c itself, not an element;
        // presumably c[i] was intended (the "[i]" may have been eaten as forum markup).
        std::cout<<"    "<<c<<std::endl;
    }
    std::cout<<"Zeit GPU:   "<<time<<std::endl;
    return 0;
}


using this kernel:

// Element-wise addition: each work-item adds one pair of inputs.
//   a, b - input arrays (read only)
//   c    - output array; element i receives a[i] + b[i]
// The element handled by a work-item is its global id in dimension 0.
__kernel void add(__global const float *a,
                  __global const float *b,
                  __global float *c)
{
    const int gid = get_global_id(0);
    const float sum = a[gid] + b[gid];
    c[gid] = sum;
}

I can run it on CPU, but not on GPU. Does anybody know why?

My GPU is a ATI Radeon 4870, typ MSI 4870OC Edition.



0 Likes

Sheeep,

            There are problems in the code.  In many places you are specifying wsize instead of sizeof(float) * wsize.

See the attached code. I have modified the code so that it gives the same results on both CPU and GPU.

 

#include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <vector> #include <CL\cl.hpp> #include <ctime> inline void checkErr(cl_int err, const char * name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "(" << err << ")" << std::endl; exit(EXIT_FAILURE); } }; int main(int argc, char** argv){ //error code cl_int error; //get CL platform info std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); std::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { for(i = platforms.begin(); i != platforms.end(); ++i) { if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; //Creating CL Device; cl::Context context=cl::Context(CL_DEVICE_TYPE_GPU,cps,NULL,NULL,&error); if(error==CL_SUCCESS){ std::cout<<"Using GPU"<<std::endl; } if(error!=CL_SUCCESS){ context=cl::Context(CL_DEVICE_TYPE_CPU,cps,NULL,NULL,&error); std::cout<<"Using CPU"<<std::endl; } //getting Device List std::vector<cl::Device> devices=context.getInfo<CL_CONTEXT_DEVICES>(); //creating CommandQuere cl::CommandQueue quere=cl::CommandQueue(context,devices[0]); //Reading CL Programm from file std::ifstream file("HelloCL_Kernels.cl"); //Kernelname std::string prog(std::istreambuf_iterator<char>(file),(std::istreambuf_iterator<char>())); cl::Program::Sources source(1,std::make_pair(prog.c_str(), prog.length()+1)); //Building CL Programm for Device cl::Program program=cl::Program(context,source,&error); checkErr(error,"Error building Programm"); program.build(devices); //finally Kernels: cl::Kernel kernel=cl::Kernel(program,"add",&error); //CL_Programm, Funktions-/Kernelname, errorcode //checkErr(error,"Error setting Kernel"); //Suche Fehler: std::string fehler=program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); //std::cout<<fehler<<std::endl; int wsize=1; float *a=new float[1]; float *b=new float[1]; float *c=new 
float[wsize]; a[0]=1; b[0]=1; //c[0]=0.0f; //initialing OpenCL Buffer(MemoryObjects) cl::Buffer CL1=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,sizeof(float) * wsize,a,&error); cl::Buffer CL2=cl::Buffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,sizeof(float) * wsize,b,&error); cl::Buffer CL3=cl::Buffer(context,CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR,sizeof(float) * wsize,c,&error); //setting Kernel Arguments kernel.setArg(0,CL1); kernel.setArg(1,CL2); kernel.setArg(2,CL3); //write mCL mem to device quere.enqueueWriteBuffer(CL1,CL_TRUE,0,sizeof(float) * wsize,a); quere.enqueueWriteBuffer(CL2,CL_TRUE,0,sizeof(float) * wsize,b); //Running Kernel clock_t time; time=clock(); cl_int err = quere.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(wsize,1,1),cl::NDRange(wsize,1,1),NULL,NULL); //quere.enqueueNDRangeKernel(kernelname,cl::NullRange,cl::NDRange(arraylänge),cl::NDRange(1,1),NULL,NULL); if (err != CL_SUCCESS) { std::cerr << "CommandQueue::enqueueNDRangeKernel()" \ " failed (" << err << ")\n"; return -1; } err = quere.enqueueReadBuffer (CL3,CL_TRUE,0,sizeof(float) * wsize,c); if (err != CL_SUCCESS) { std::cerr << "enqueueReadBuffer" \ " failed (" << err << ")\n"; return -1; } time=clock()-time; for(int i=0;i<wsize;i++){ std::cout<<" "<<c<<std::endl; } return 0; }

0 Likes

Thank you for you help.

 

Running the Code I get the error: CommandQueue::enqueueNDRangeKernel() failed (-48)

 

Do  you know what it is?

Is there a list of OpenCL error code?

 

lg Sheeep

 

EDIT:

Sorry, my mistake: it is working! Thank you very much for your help.

 

 

0 Likes

Sheeep,

It seems you have already found the problem: change HelloCL_Kernels.cl to Testkernel.cl.

 

For each error code there is a preprocessor definition in the cl.h file.  Read the description of the particular function in the spec to find the cause of a particular error code.

0 Likes