Archives Discussions

chevydevil · ‎01-09-2010

I'm trying to get the interop with the c++ bindings to run under windows. Is it possible and if what do i have to do? In the cl.hpp are the clEnqueueAcquireObject and clEnqueueReleaseObjects forwarded so I thought there has to be a way. Here is my source code:

///Kernel//////////// #pragma OPENCL EXTENSION cl_khr_gl_sharing : enable __kernel void squareArray( __global float* input2, __global float* output, float stime) { output[get_global_id(0)] = input2[get_global_id(0)]*stime; }; /////cl_test.hpp/////////////////// #include <utility> #define __NO_STD_VECTOR // Use cl::vector and cl::string and #define __NO_STD_STRING // not STL versions, more on this later #include <gl/glee.h> #include <gl/Gl.h> #include <gl/glu.h> #include <gl/glut.h> #include <CL/cl.hpp> #include <cl/cl_gl.h> #include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <string> #include <ctime> const size_t problemSize = 4000000; class CL_test { public: CL_test(); ~CL_test(); void setupCL(); void setupCLKernels(const char*, const char*); void runCLKernels(); cl_float random(cl_float, cl_float); void updatekernel(cl_float); inline void checkErr(cl_int, const char*); cl::Context context; cl::Kernel kernel; cl::vector<cl::Device> devices; cl::Program program; cl::CommandQueue cmdQ; cl_int err; cl::Buffer inputBuffer; cl::Buffer inputBuffer2; cl::Buffer outputBuffer; cl_float stime; cl_float input[problemSize]; cl_float input2[problemSize]; cl_float output[problemSize]; GLuint vbo[1]; cl_mem out; protected: void allocateMemory(); private: }; ////////////cl_test.cpp/////////////////// #include "cl_test.hpp" CL_test::CL_test(){ stime = 0.02; }; CL_test::~CL_test(){ #if defined (_WIN32) _aligned_free(output); _aligned_free(input2); _aligned_free(input); #else free(output); free(input2); free(input); #endif } inline void CL_test::checkErr(cl_int err, const char *name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "("<< err <<")" << std::endl; //exit(EXIT_FAILURE); } } cl_float CL_test::random(cl_float low, cl_float high){ cl_float range = high - low; cl_float multiplier = ( cl_float ) rand() / RAND_MAX; return range * multiplier + low; } void CL_test::setupCL(){ //create context containing the target devices cl::vector<cl::Platform> platforms; err = cl::Platform::get(&platforms); checkErr(err, "Platform::get()"); // cl::vector<cl::Platform>::iterator i; // for (i = platforms.begin(); i!= platforms.end(); ++i) { // // pick a platform and do something // std::cout<< " Platform Name: " << (*i).getInfo<CL_PLATFORM_VENDOR>().c_str() << std::endl; // } cl::vector<cl::Platform>::iterator i; if(platforms.size() > 0) { for(i = platforms.begin(); i != platforms.end(); ++i) { std::cout<<(*i).getInfo<CL_PLATFORM_VENDOR>().c_str()<<std::endl; if(!strcmp((*i).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc.")) { break; } } } err = (*i).getDevices(CL_DEVICE_TYPE_GPU, &devices); checkErr(err, "Device::getDevice"); // cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(), 0 }; //is this right or do i miss something? cl_context_properties lProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*i)(),CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0 }; // cl::Context context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &err); context = cl::Context(devices, lProperties, NULL, NULL, &err); checkErr(err, "Context::Context()"); } void CL_test::setupCLKernels(const char* _kernelfile, const char* _kernelname){ //read __kernels std::ifstream file(_kernelfile); checkErr(file.is_open() ? CL_SUCCESS:-1, _kernelfile); std::string prog( std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>())); //Gather all the kernel sources for the OpenCL program cl::Program::Sources source; source.push_back(std::make_pair(prog.c_str(), prog.length()+1)); //Make an OpenCl program program= cl::Program (context, source); //get all available devices in context devices = context.getInfo<CL_CONTEXT_DEVICES>(); checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0"); //build kernel source for devices in context err = program.build(devices); checkErr(err, "Program::build()"); //get squareArray kernel = cl::Kernel(program, _kernelname, &err); checkErr(err, "Program::build()"); allocateMemory(); err=clEnqueueAcquireGLObjects(cmdQ, 1, &out, 0, NULL,NULL); //doesnt work because cmdQ is cl:CommandCueue checkErr(err, "clEnqueueAcquireGLObjects"); // err = kernel.setArg(0, inputBuffer); // checkErr(err, "kernel.setarg0)"); err = kernel.setArg(0, inputBuffer2); checkErr(err, "kernel.setarg1"); err = kernel.setArg(1, out); checkErr(err, "kernel.setarg2)"); err = kernel.setArg(2, stime); checkErr(err, "kernel.setarg3"); cmdQ.finish(); } void CL_test::runCLKernels(){ //Get a Functor which will run the kernel on every input item in blocks of 64 threads cl::KernelFunctor func =kernel.bind(cmdQ, cl::NDRange(problemSize), cl::NDRange(200) ); //wait for kernel to finish func().wait(); //for outputdata // cmdQ.enqueueReadBuffer(outputBuffer, // true, // 0, // sizeof(cl_float) * problemSize, // &output[0]); // err = clEnqueueReleaseGLObjects(cmdQ, 1, &out, 0, NULL, NULL); //same problem as above // checkErr(err, "clEnqueueReleaseGLObjects"); std::cout<<"!"; } void CL_test::allocateMemory(){ //queue for jobs on specified compute device cmdQ = cl::CommandQueue(context, devices[0],0, &err); checkErr(err, "CommandQueue::CommandQueue"); //allocate memory for input // inputBuffer = cl::Buffer(context, // CL_MEM_READ_ONLY, // sizeof(cl_float) * problemSize, // 0, // &err // ); // checkErr(err,"Buffer::inputBuffer"); inputBuffer2 = cl::Buffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * problemSize, 0, &err ); checkErr(err,"Buffer::inputBuffer"); //clEnqueueAcquireGLObjects() // cmdQ.enqueueWriteBuffer(inputBuffer, // CL_TRUE, // 0, // sizeof(cl_float) * problemSize, // input); cmdQ.enqueueWriteBuffer(inputBuffer2, CL_TRUE, 0, sizeof(cl_float) * problemSize, input2); //allocate memory for output out = clCreateFromGLBuffer(context,CL_MEM_READ_WRITE, vbo[0],&err); //again cl_context vs cl::Context checkErr(err,"createFromOpenGLBuffer"); } void CL_test::updatekernel(cl_float _stime){ err = kernel.setArg(2, _stime); checkErr(err, "kernel.setarg1"); } ///main.cpp #include "cl_test.hpp" CL_test test; cl_float up=0.0; // rotation values for the navigation float rotation[3] = { 0.0, 0.0, 0.0 }; //----------------------------------------------------------------------------- // parameters for the navigation // position of the mouse when pressed int mousePressedX = 0, mousePressedY = 0; // mouse button states int leftMouseButtonActive = 0; //----------------------------------------------------------------------------- void mouseFunc(int button, int state, int x, int y) { // get the mouse buttons if (button == GLUT_LEFT_BUTTON && state == GLUT_DOWN) leftMouseButtonActive += 1; else leftMouseButtonActive -= 1; mousePressedX = x; mousePressedY = y; } //----------------------------------------------------------------------------- void mouseMotionFunc(int x, int y) { // rotation if (leftMouseButtonActive) { rotation[0] += ((mousePressedY - y)); rotation[1] += ((mousePressedX - x)); mousePressedY = y; mousePressedX = x; } } //----------------------------------------------------------------------------- void init(){ for (int i=0; i<problemSize-3; i+=3){ test.output=test.random(0.1,30.0); test.output[i+1]=test.random(0.1,30); test.output[i+2]=test.random(0.1,30.0); } for (int i=0; i<problemSize-3; i+=3){ test.input2=test.random(0.1,30.0); test.input2[i+1]=test.random(0.1,30); test.input2[i+2]=test.random(0.1,30.0); } glGenBuffers(1, test.vbo); glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) * problemSize, test.output, GL_DYNAMIC_COPY_ARB); glBindBuffer(GL_ARRAY_BUFFER, 0); } void idlefunc(){ up +=0.002; if (up>=15) { up=0.0; } test.updatekernel(up); glutPostRedisplay(); } void draw(){ glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glEnableClientState(GL_VERTEX_ARRAY); glVertexPointer(3, GL_FLOAT, 0, 0); glDrawArrays(GL_POINTS, 0, problemSize/3); glBindBuffer(GL_ARRAY_BUFFER, 0); glDisableClientState(GL_VERTEX_ARRAY); } void displayfunc() { glClearColor(0.0 ,0.0, 0.0, 0.0); glClear(GL_COLOR_BUFFER_BIT); glClear(GL_DEPTH_BUFFER_BIT); glPointSize(1.0); glBlendFunc(GL_SRC_ALPHA, GL_ONE); glEnable(GL_BLEND); glDepthMask(GL_FALSE); glColor3f(1.0f,0.0f,1.0f); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glRotatef( rotation[0], 1.0f, 0.0f, 0.0f ); glRotatef( rotation[1], 0.0f, 1.0f, 0.0f ); test.runCLKernels(); draw(); glFlush(); glutSwapBuffers(); } void reShape(int w,int h) { glViewport(0, 0, w, h); glViewport(0, 0, w, h); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(33.0, 1.33, 1.0, 1000.0); glTranslatef(-10.0,-10.0,-120.0); //gluLookAt (0.0, 0.0, -2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0); } int main(int argc,char** argv) { glutInit(&argc, argv); glutInitWindowSize (640, 480); glutInitDisplayMode (GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); glutCreateWindow("GLEW Test"); glutMouseFunc (mouseFunc); glutMotionFunc (mouseMotionFunc); glutDisplayFunc(displayfunc); glutReshapeFunc(reShape); glutIdleFunc(idlefunc); test.setupCL(); init(); test.setupCLKernels("lesson1_kernels.cl","squareArray"); glutMainLoop(); return 0; }

nou · ‎01-09-2010

you can look into my code here. http://forums.amd.com/devforum/messageview.cfm?catid=390&threadid=124923&enterthread=y it is on Linux and C but it work. and one thing. it work only with GPU device.

1. create OpenGL context

2. create OpenCL context

3. create GL buffers.

4. make CL mem object from GL buffers

5. glFinish() acquire GL/CL object.

6. run kernel

7. clFinish() release GL/CL object

genaganna · ‎01-10-2010

Originally posted by: chevydevil I'm trying to get the interop with the c++ bindings to run under windows. Is it possible and if what do i have to do? In the cl.hpp are the clEnqueueAcquireObject and clEnqueueReleaseObjects forwarded so I thought there has to be a way. Here is my source code:

chevydevil,

Interop classes(BufferGL) not added yet to cl.hpp. These will be added in future releases.

genaganna · ‎01-10-2010

Originally posted by: chevydevil I'm trying to get the interop with the c++ bindings to run under windows. Is it possible and if what do i have to do? In the cl.hpp are the clEnqueueAcquireObject and clEnqueueReleaseObjects forwarded so I thought there has to be a way. Here is my source code:

chevydevil,

Interop classes(BufferGL) not added yet to cl.hpp. These will be added in future releases.

chevydevil · ‎01-10-2010

Thx for the replys. I ported my code to c and it works. But i have a problem: On my Radeon 5870 there is a strange stuttering. The program runs arround 27fps but its like a stop and go. I'm attaching the code. Could please someone test this?

////lesson1_kernels.cl/////// #pragma OPENCL EXTENSION cl_khr_gl_sharing : enable __kernel void particle( __global float* input2, __global float* output, __global float* time, __global float* stime) { int id = get_global_id(0); float t = time[id]-stime[id]; stime[id]+=0.002; output[id] = input2[id]*t*t; if(t<0){ output[id] = 0.0; stime[id] = 0.0; } }; /////cl_test.hpp///////////////////////// #include <utility> #define __NO_STD_VECTOR // Use cl::vector and cl::string and #define __NO_STD_STRING // not STL versions, more on this later #include <gl/glee.h> #include <gl/Gl.h> #include <gl/glu.h> #include <gl/glut.h> #include <CL/cl.h> #include <cl/cl_gl.h> #include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <string> #include <ctime> const size_t problemSize = 2097152; class CL_test { public: CL_test(); ~CL_test(); void setupCL(); void setupCLKernels(); void runCLKernels(); cl_float random(cl_float, cl_float); void updatekernel(cl_float); inline void checkErr(cl_int, const char*); cl_context cl_c; cl_device_id device; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem cl_input; cl_mem cl_out; cl_mem cl_time; cl_mem cl_stime; cl_int err_code; cl_float input[problemSize]; cl_float inputTime[problemSize]; cl_float output[problemSize]; cl_float stime[problemSize]; GLuint vbo[1]; protected: void allocateMemory(); private: }; //////cl_test.cpp////////////// #include "cl_test.hpp" const char kernel_source[] = "#include \"lesson1_kernels.cl\""; CL_test::CL_test(){ }; CL_test::~CL_test(){ #if defined (_WIN32) _aligned_free(output); _aligned_free(input); #else free(output); free(input2); free(input); #endif } inline void CL_test::checkErr(cl_int err, const char *name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "("<< err <<")" << std::endl; //exit(EXIT_FAILURE); } } cl_float CL_test::random(cl_float low, cl_float high){ cl_float range = high - low; cl_float multiplier = ( cl_float ) rand() / RAND_MAX; return range * multiplier + low; } void CL_test::setupCL(){ cl_platform_id platform; err_code = clGetPlatformIDs(1, &platform, NULL); if(err_code) { printf("Unable get platform %d\n", err_code); exit(1); } cl_context_properties cl_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0}; cl_c = clCreateContextFromType(cl_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &err_code); if(err_code) { printf("Unable create context %d\n", err_code); exit(1); } clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); queue = clCreateCommandQueue(cl_c, device, NULL, &err_code); if(err_code) { printf("Unable create command queue %d\n", err_code); exit(1); } } void CL_test::setupCLKernels(){ //read __kernels size_t source_len[] = { sizeof(kernel_source)-1 }; const char *s[] = {kernel_source , 0}; program = clCreateProgramWithSource(cl_c, 1, s, source_len, &err_code); if(err_code) { printf("Unable create program %d\n", err_code); exit(1); } err_code = clBuildProgram(program, 1, &device, "-I.", NULL, NULL); if(err_code) { printf("Unable build program %d\n", err_code); char log[10000]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 10000, log, NULL); printf("BUILD LOG:\n%s", log); exit(1); } kernel = clCreateKernel(program, "particle", &err_code); allocateMemory(); glFinish(); err_code = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_input); checkErr(err_code,"clSetKernelArg 0"); err_code = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_out); checkErr(err_code,"clSetKernelArg 1"); err_code = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_time); checkErr(err_code,"clSetKernelArg 1"); err_code = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_stime); checkErr(err_code,"clSetKernelArg 2"); } void CL_test::runCLKernels(){ //std::cout<<"!"; size_t globalwork[] = {problemSize}; size_t localwork[] = {256}; clFinish(queue); err_code = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalwork,localwork, 0, NULL, NULL); checkErr(err_code,"clEnqueueNDRangeKernel"); err_code = clEnqueueReleaseGLObjects(queue, 1, &cl_out, 0, NULL, NULL); checkErr(err_code,"clEnqueueReleaseGLObjects"); clFinish(queue); } void CL_test::allocateMemory(){ cl_input = clCreateBuffer( cl_c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * problemSize, input, &err_code); checkErr(err_code,"clCreateBuffer"); cl_time = clCreateBuffer( cl_c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * problemSize, inputTime, &err_code); checkErr(err_code,"clCreateBuffer"); cl_stime = clCreateBuffer( cl_c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * problemSize, stime, &err_code); checkErr(err_code,"clCreateBuffer"); cl_out = clCreateFromGLBuffer(cl_c,CL_MEM_READ_WRITE, vbo[0],&err_code); checkErr(err_code,"createFromOpenGLBuffer"); err_code = clEnqueueAcquireGLObjects(queue, 1, &cl_out, 0, NULL, NULL); checkErr(err_code,"clEnqueueAcquireGLObjects"); } void CL_test::updatekernel(cl_float _stime){ err_code = clSetKernelArg(kernel,3,sizeof(cl_float), &_stime); checkErr(err_code, "upateKernelArg2"); } ////main.cpp///////////////// #include "cl_test.hpp" CL_test test; cl_float up=0.0; // rotation values for the navigation float rotation[3] = { 0.0, 0.0, 0.0 }; //----------------------------------------------------------------------------- // parameters for the navigation // position of the mouse when pressed int mousePressedX = 0, mousePressedY = 0; // mouse button states int leftMouseButtonActive = 0; //----------------------------------------------------------------------------- void mouseFunc(int button, int state, int x, int y) { // get the mouse buttons if (button == GLUT_LEFT_BUTTON && state == GLUT_DOWN) leftMouseButtonActive += 1; else leftMouseButtonActive -= 1; mousePressedX = x; mousePressedY = y; } //----------------------------------------------------------------------------- void mouseMotionFunc(int x, int y) { // rotation if (leftMouseButtonActive) { rotation[0] += ((mousePressedY - y)); rotation[1] += ((mousePressedX - x)); mousePressedY = y; mousePressedX = x; } } //----------------------------------------------------------------------------- void init(){ for (int i=0; i<problemSize-3; i+=3){ test.output=test.random(0.1,30.0); test.output[i+1]=test.random(0.1,30); test.output[i+2]=test.random(0.1,30.0); } for (int i=0; i<problemSize-3; i+=3){ test.input=test.random(0.1,30.0); test.input[i+1]=test.random(0.1,30); test.input[i+2]=test.random(0.1,30.0); } for (int i=0; i<problemSize-3; i+=3) { cl_float time = test.random(0.5,5.0); test.inputTime = time; test.inputTime[i+1] = time; test.inputTime[i+2] = time; } for (int i=0; i<problemSize-3; ++i) { test.stime = 0.0; } glGenBuffers(1, test.vbo); glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) * problemSize, test.output, GL_DYNAMIC_COPY_ARB); glBindBuffer(GL_ARRAY_BUFFER, 0); } void idlefunc(){ up +=0.02; if (up>=5) { up=0.0; } // test.updatekernel(up); glutPostRedisplay(); } void draw(){ glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glEnableClientState(GL_VERTEX_ARRAY); glVertexPointer(3, GL_FLOAT, 0, 0); glDrawArrays(GL_POINTS, 0, problemSize/3); glBindBuffer(GL_ARRAY_BUFFER, 0); glDisableClientState(GL_VERTEX_ARRAY); } void displayfunc() { glClearColor(0.0 ,0.0, 0.0, 0.0); glClear(GL_COLOR_BUFFER_BIT); glClear(GL_DEPTH_BUFFER_BIT); glPointSize(1.0); glBlendFunc(GL_SRC_ALPHA, GL_ONE); glEnable(GL_BLEND); glDepthMask(GL_FALSE); glColor3f(1.0f,0.0f,1.0f); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glRotatef( rotation[0], 1.0f, 0.0f, 0.0f ); glRotatef( rotation[1], 0.0f, 1.0f, 0.0f ); test.runCLKernels(); // for (int i=0; i<problemSize;) // { // glBegin(GL_POINTS); // { // glVertex3d(0.1+test.output, // 0.1+test.output[i+1], // 0.1+test.output[i+2]); // } // glEnd(); // i+=3; // } draw(); glFlush(); glutSwapBuffers(); } void reShape(int w,int h) { glViewport(0, 0, w, h); glViewport(0, 0, w, h); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(25.0, 1.33, 1.0, 1000.0); glTranslatef(-10.0,-10.0,-120.0); //gluLookAt (0.0, 0.0, -2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0); } int main(int argc,char** argv) { glutInit(&argc, argv); glutInitWindowSize (640, 480); glutInitDisplayMode (GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); glutCreateWindow("GLEW Test"); glutMouseFunc (mouseFunc); glutMotionFunc (mouseMotionFunc); glutDisplayFunc(displayfunc); glutReshapeFunc(reShape); glutIdleFunc(idlefunc); test.setupCL(); init(); test.setupCLKernels(); glutMainLoop(); return 0; }

chevydevil · ‎01-10-2010

Another thing. I tried this on the cpu and it runs about 25fps faster and without the stuttering. Is there a problem in my code or with my gpu? I'm attaching the CPU code as well:

*Edit* Okay. The Workitemsize was one performance problem. But the stuttering is still there. With 4 Mio particles CPU @ 20fps and the GPU@ 35fps.

///lesson1_kernels.cl////////////////////// #pragma OPENCL EXTENSION cl_khr_gl_sharing : enable __kernel void particle( __global float* input2, __global float* output, __global float* time, __global float* stime) { int id = get_global_id(0); float t = time[id]-stime[id]; stime[id]+=0.002; output[id] = input2[id]*t*t; if(t<0){ output[id] = 0.0; stime[id] = 0.0; } }; //////////////cl_test.hpp////////////////////// #include <utility> #define __NO_STD_VECTOR // Use cl::vector and cl::string and #define __NO_STD_STRING // not STL versions, more on this later #include <gl/glee.h> #include <gl/Gl.h> #include <gl/glu.h> #include <gl/glut.h> #include <CL/cl.h> #include <cl/cl_gl.h> #include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <string> #include <ctime> const size_t problemSize = 2097152; class CL_test { public: CL_test(); ~CL_test(); void setupCL(); void setupCLKernels(); void runCLKernels(); cl_float random(cl_float, cl_float); void updatekernel(cl_float); inline void checkErr(cl_int, const char*); cl_context cl_c; cl_device_id device; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem cl_input; cl_mem cl_out; cl_mem cl_time; cl_mem cl_stime; cl_int err_code; cl_float input[problemSize]; cl_float inputTime[problemSize]; cl_float output[problemSize]; cl_float stime[problemSize]; GLuint vbo[1]; protected: void allocateMemory(); private: }; //////cl_test.cpp///////////////////// #include "cl_test.hpp" const char kernel_source[] = "#include \"lesson1_kernels.cl\""; CL_test::CL_test(){ }; CL_test::~CL_test(){ #if defined (_WIN32) _aligned_free(output); _aligned_free(input); #else free(output); free(input2); free(input); #endif } inline void CL_test::checkErr(cl_int err, const char *name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "("<< err <<")" << std::endl; //exit(EXIT_FAILURE); } } cl_float CL_test::random(cl_float low, cl_float high){ cl_float range = high - low; cl_float multiplier = ( cl_float ) rand() / RAND_MAX; return range * multiplier + low; } void CL_test::setupCL(){ cl_platform_id platform; err_code = clGetPlatformIDs(1, &platform, NULL); if(err_code) { printf("Unable get platform %d\n", err_code); exit(1); } cl_context_properties cl_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0}; cl_c = clCreateContextFromType(cl_properties, CL_DEVICE_TYPE_CPU, NULL, NULL, &err_code); if(err_code) { printf("Unable create context %d\n", err_code); exit(1); } clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); queue = clCreateCommandQueue(cl_c, device, NULL, &err_code); if(err_code) { printf("Unable create command queue %d\n", err_code); exit(1); } } void CL_test::setupCLKernels(){ //read __kernels size_t source_len[] = { sizeof(kernel_source)-1 }; const char *s[] = {kernel_source , 0}; program = clCreateProgramWithSource(cl_c, 1, s, source_len, &err_code); if(err_code) { printf("Unable create program %d\n", err_code); exit(1); } err_code = clBuildProgram(program, 1, &device, "-I.", NULL, NULL); if(err_code) { printf("Unable build program %d\n", err_code); char log[10000]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 10000, log, NULL); printf("BUILD LOG:\n%s", log); exit(1); } kernel = clCreateKernel(program, "particle", &err_code); allocateMemory(); glFinish(); err_code = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_input); checkErr(err_code,"clSetKernelArg 0"); err_code = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_out); checkErr(err_code,"clSetKernelArg 1"); err_code = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_time); checkErr(err_code,"clSetKernelArg 1"); err_code = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_stime); checkErr(err_code,"clSetKernelArg 2"); } void CL_test::runCLKernels(){ //std::cout<<"!"; size_t globalwork[] = {problemSize}; size_t localwork[] = {1024}; clFinish(queue); err_code = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalwork,localwork, 0, NULL, NULL); checkErr(err_code,"clEnqueueNDRangeKernel"); clEnqueueReadBuffer( queue, cl_out, CL_FALSE, 0, problemSize * sizeof(cl_float), output, 0, NULL, NULL); // err_code = clEnqueueReleaseGLObjects(queue, 1, &cl_out, 0, NULL, NULL); // checkErr(err_code,"clEnqueueReleaseGLObjects"); clFinish(queue); } void CL_test::allocateMemory(){ cl_input = clCreateBuffer( cl_c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * problemSize, input, &err_code); checkErr(err_code,"clCreateBuffer"); cl_time = clCreateBuffer( cl_c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * problemSize, inputTime, &err_code); checkErr(err_code,"clCreateBuffer"); cl_stime = clCreateBuffer( cl_c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float) * problemSize, stime, &err_code); checkErr(err_code,"clCreateBuffer"); cl_out = clCreateBuffer(cl_c,CL_MEM_READ_WRITE,sizeof(cl_mem)*problemSize,0,&err_code); checkErr(err_code,"clCreateBuffer"); // cl_out = clCreateFromGLBuffer(cl_c,CL_MEM_READ_WRITE, vbo[0],&err_code); // checkErr(err_code,"createFromOpenGLBuffer"); // err_code = clEnqueueAcquireGLObjects(queue, 1, &cl_out, 0, NULL, NULL); // checkErr(err_code,"clEnqueueAcquireGLObjects"); //clEnqueueWriteBuffer(queue,cl_out,0,0,0,&input[0],0,NULL,NULL); } void CL_test::updatekernel(cl_float _stime){ err_code = clSetKernelArg(kernel,3,sizeof(cl_float), &_stime); checkErr(err_code, "upateKernelArg2"); } /////main.cpp////////////////////////// #include "cl_test.hpp" CL_test test; cl_float up=0.0; // rotation values for the navigation float rotation[3] = { 0.0, 0.0, 0.0 }; //----------------------------------------------------------------------------- // parameters for the navigation // position of the mouse when pressed int mousePressedX = 0, mousePressedY = 0; // mouse button states int leftMouseButtonActive = 0; //----------------------------------------------------------------------------- void mouseFunc(int button, int state, int x, int y) { // get the mouse buttons if (button == GLUT_LEFT_BUTTON && state == GLUT_DOWN) leftMouseButtonActive += 1; else leftMouseButtonActive -= 1; mousePressedX = x; mousePressedY = y; } //----------------------------------------------------------------------------- void mouseMotionFunc(int x, int y) { // rotation if (leftMouseButtonActive) { rotation[0] += ((mousePressedY - y)); rotation[1] += ((mousePressedX - x)); mousePressedY = y; mousePressedX = x; } } //----------------------------------------------------------------------------- void init(){ for (int i=0; i<problemSize-3; i+=3){ test.output=test.random(0.1,30.0); test.output[i+1]=test.random(0.1,30); test.output[i+2]=test.random(0.1,30.0); } for (int i=0; i<problemSize-3; i+=3){ test.input=test.random(0.1,30.0); test.input[i+1]=test.random(0.1,30); test.input[i+2]=test.random(0.1,30.0); } for (int i=0; i<problemSize-3; i+=3) { cl_float time = test.random(0.5,5.0); test.inputTime = time; test.inputTime[i+1] = time; test.inputTime[i+2] = time; } for (int i=0; i<problemSize-3; ++i) { test.stime = 0.0; } // glGenBuffers(1, test.vbo); // glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); // glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) * problemSize, test.output, GL_DYNAMIC_COPY_ARB); // glBindBuffer(GL_ARRAY_BUFFER, 0); } void idlefunc(){ up +=0.02; if (up>=5) { up=0.0; } // test.updatekernel(up); glutPostRedisplay(); } void draw(){ // glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glEnableClientState(GL_VERTEX_ARRAY); glVertexPointer(3, GL_FLOAT, 0, test.output); glDrawArrays(GL_POINTS, 0, problemSize/3); glBindBuffer(GL_ARRAY_BUFFER, 0); glDisableClientState(GL_VERTEX_ARRAY); } void displayfunc() { glClearColor(0.0 ,0.0, 0.0, 0.0); glClear(GL_COLOR_BUFFER_BIT); glClear(GL_DEPTH_BUFFER_BIT); glPointSize(1.0); glBlendFunc(GL_SRC_ALPHA, GL_ONE); glEnable(GL_BLEND); glDepthMask(GL_FALSE); glColor3f(1.0f,0.0f,1.0f); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glRotatef( rotation[0], 1.0f, 0.0f, 0.0f ); glRotatef( rotation[1], 0.0f, 1.0f, 0.0f ); test.runCLKernels(); // for (int i=0; i<problemSize;) // { // glBegin(GL_POINTS); // { // glVertex3d(0.1+test.output, // 0.1+test.output[i+1], // 0.1+test.output[i+2]); // } // glEnd(); // i+=3; // } draw(); glFlush(); glutSwapBuffers(); } void reShape(int w,int h) { glViewport(0, 0, w, h); glViewport(0, 0, w, h); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(25.0, 1.33, 1.0, 1000.0); glTranslatef(-10.0,-10.0,-120.0); //gluLookAt (0.0, 0.0, -2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0); } int main(int argc,char** argv) { glutInit(&argc, argv); glutInitWindowSize (640, 480); glutInitDisplayMode (GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); glutCreateWindow("GLEW Test"); glutMouseFunc (mouseFunc); glutMotionFunc (mouseMotionFunc); glutDisplayFunc(displayfunc); glutReshapeFunc(reShape); glutIdleFunc(idlefunc); test.setupCL(); init(); test.setupCLKernels(); glutMainLoop(); return 0; }

nou · ‎01-10-2010

i think it is because you does not properly acquire and release GL objects. you must acquire object before working with him in OpenCL and then release him when you want work in OpenGL. second you must ensure the synchronization of access to objects from OpenCL and OpenGL.

so put glFinish() clEnqueueAcquireGLObjects() before kernel run and clEnqueueReleaseGLObjects(), clFinish() after kernel run.

chevydevil · ‎01-11-2010

4 Mio was wrong its of course only 1,33 Mio since i use arrays.

@nou: I played arround with glFinish() and clFinish() in the way you told. But it makes no difference at all. Except i'll take them all out then the program doesnt run. So I guess I'm doing this the right way. But I dont get why the performance is so bad and what this stuttering is.

nou · ‎01-11-2010

well you should acquire and release object. it can ocasionaly crash and this type of bug is hard to find.

with my code i do not have problem with shutering. but with your code it jitter. maybe it is in GLUT.

Meteorhead · ‎05-02-2012

Hi!

I would need help with something asap. Help is appreciated.

I have tried CL-GL interop with the C++ bindings, but I can't seem to get it to work. I have seen genaganna's post about it not working, but that was a long time ago. I have checked the cl.hpp file, and I cannot help but notice, that something is not right... enqueueAcquireGLBuffer wants a vector of cl::Memory as an argument. Problem is, vectors of cl::Memory and cl::BufferGL are not convertible. Why does the cl::BufferGL type exist if I cannot acquire a vector of those?

This non-sense in the nomenclature would not be painful, if things would work. I got

std::vector<cl::Memory> vertexBuffs;

vertexBuffs.push_back(cl::BufferGL(auxRuntime->appContexts[0], CL_MEM_READ_WRITE, m_vbo, &CL_err));

auxRuntime->appQueues[0].enqueueAcquireGLObjects(&vertexBuffs, NULL, NULL);

auxRuntime->appQueues[0].enqueueNDRangeKernel(update_vertex, cl::NullRange, upd_global, upd_local, NULL, NULL);

auxRuntime->appQueues[0].enqueueReleaseGLObjects(&vertexBuffs, NULL, NULL);

and none of the functions fail, but I get trash memory inside the kernel.

What are the proper types to go about this without falling back to the C API by getting the respective handles out of the C++ objects? (I have created C API interop application before and it worked, and this one should work also. I have already tracked down the problem to having been caused by the erronous sharing.

nou · ‎05-12-2012

this seems right code. cl::BufferGL inherit from cl::Buffer and that from cl::Memory so they are convertible. otherwise you get error during compilation.

Meteorhead · ‎05-12-2012

The code seems right, but I tried removing the graph VBOs from a vector, and it still doesn't work. I don't know if I'm getting something wrong using the VAOs? AFAIK this is the way to set them up and to use them.

Meteorhead · ‎05-15-2012

Hi,

turns out that the mistake is not on my part, but the bug is inside the AMD OpenCL runtime. I have been debugging this error for the past 4 days now, only to figure out that the code runs just fine using both the Intel runtime and the Nvidia runtime.

Since the entire source code to reproduce the issue is beyond that point that I would like to post publicly on a forum, I would be glad if someone from AMD could write a PM, so I can send the VS2010 project to reproduce the issue.

Problem is, that clEnqueueAcquireGLObjects seems not to function properly. I acquire 3 VBOs, and only one of them are acquired properly. All writes made to the other 2 don't get applied to __global. This doesn't happen with either Intel, nor NV runtime, where all 3 get updated properly.

So if someone relevant to solving the issue could PM me, that would be appreciated. Unfortunately this is 4 days I really miss, given the remaining 2 weeks until diploma thesis deadline.

Cheers,

Máté

Meteorhead · ‎05-12-2012

This is the second time I write this post (bluescreen), but I try to cover everything again.

After cleaning up in my head the confusion of acquring and releasing OpenGL Buffers in the C++ API of OpenCL, I came across a problem I have not been able to debug for 3 days now, so I can say I put significant effort into it before posting.

I have a simulation that uses OpenGL interop, and it has a kernel for running the simulation and a bloody simple update_vertex kernel that only updates the VBOs that need to be displayed. Problem comes when I try to update multiple VBOs. I have 3 of them at the moment, but only one gets updated properly, as if the writes from update_vertex would not make it into the VBO.

If I update the contents of the VBOs from host side, they display properly.

If I set one of these 'defunct' VBOs as all of the targets in update_vertex, and I acquire just this one before executing update_vertex, the display driver crashes.

The application is very large, it's multithreaded (yes, I know it's not easy to write multithreaded OpenGL/OpenCL interop, but I took great care in locking the OpenGL context whenever I touch it and it does work fine with displaying one VBO), so let me only attach the relevant parts of the code. I have looked at it at least 3 dozen times, but have not found the difference between the creation, argument setting, acquiring/releasing of these VBOs, so I don't know why one works, and the others dont. (Those do not work that are stored inside vectors, the graph_vbo/vao)

Strange thing is though, that I have printed out the handles of these objects, and they are not numbered in the order they were created, as if the OpenGL runtime would've known in advance how many of these objects I will create (very spooky infact).

So let me attach the relevant parts of the code, and please, take a look at it what's wrong.

Thank you in advance,

Máté

Edit: What I forgot to write, is that the strange output on std::cout in this init sequence is

VBOs: 4 5 6

VAOs: 1 2 3

Edit2: fixed a vector indexer in this attached code (that was only faulty here).

tzachi_cohen · ‎05-15-2012

Does the issue reproduces when you initialize a CPU device ?

Meteorhead · ‎05-15-2012

Yes, same thing happens on the AMD CPU device.

tzachi_cohen · ‎05-15-2012

Well, our CPU implementation is simply mapping the GL buffer using the standard GL API, runs the kernel and un-maps it.

Our CPU device can actually interop with any OpenGL implementation.

Hence this is not memory spaces issue.

If you can expose a compiling version of the code I can look more closely into it .

Meteorhead · ‎05-15-2012

If I write into the VBO using clBufferSubData, then it displays correctly. if the kernel writes into it, nothing happens. If you write a mail (my address should be publicly visible), I can send you a compiling version. (There was another means of communicating over this forum, but I can't find it again.)

Meteorhead · ‎05-16-2012

Hello Tzachi,

it would be nice to get in touch to solve the problem. I have continued developing my program with the Intel SDK, since that one is at hand and it works properly. Problem is that it is naturally A LOT slower than running on the GPU, which would be the goal ultimately. I have made a lot of alterations to the code (concerning results, not application design), and just for kicks I gave it a shot to see what happens if I switch again to GPU. It crashed windows so bad, it simply restarted. No bluescreen or whatever.

I would be very glad to provide you with compiling code, but for good reasons I do not want the code to be publicly available. (I could strip down the code to remove sensitive stuff, but that would take more time than I wanted to invest at first approach.

Cheers,

Máté

Edit: I just checked, and the AMD CPU device works now. (Apart from seeing some garbage due to using unitialized VBOs, all buffers display properly once they got sensible data written into them)

Archives Discussions

CL/GL Interop with c++ bindings