Hello. My program attached below runs on a gtx285 very well. Using my Radeon 5870 the kernel execution time is about 10 times higher and so I am getting instead of 150 fps on the gtx arround 20 fps on the radeon. I also get an error when I add the input values directly to the output instead of declaring 3 floats a,b,c in the kernel. Can anyone tell me whats wrong?
Edit: I ran it via the Stream Profiler and it speeds up. But why? How?
Edit2: Running the executable speeds it up by factor 10. This is the performance i expected. So I think its solved.
//lesson1_kernels.cl///////////////////////////////////////////// __kernel void particle( __global float4* output, __global float4* stime, __global float2* grav_time ) { int id = get_global_id(0); // __global float4* stime = &_stime[id]; grav_time[id].y += 0.002; float a = stime[id].x+stime[id].w; float b = stime[id].y+(-grav_time[id].x+stime[id].w); float c = stime[id].z+stime[id].w; output[id].x += a; output[id].y += b; output[id].z += c; output[id].w = 1.0; if(grav_time[id].y > stime[id].w) { output[id] = (float4)(0.0,0.0,0.0,1.0); grav_time[id].y = 0.0; } }; //cl_test.hpp////////////////////////////////////////////////// #include <utility> // #define __NO_STD_VECTOR // Use cl::vector and cl::string and // #define __NO_STD_STRING // not STL versions, more on this later #include <gl/glee.h> #include <gl/Gl.h> #include <gl/glu.h> #include <gl/freeglut.h> #include <CL/cl.h> #include <cl/cl_gl.h> // #define CL_GL_CONTEXT_KHR 0x2008 // #define CL_EGL_DISPLAY_KHR 0x2009 // #define CL_GLX_DISPLAY_KHR 0x200A // #define CL_WGL_HDC_KHR 0x200B // #define CL_CGL_SHAREGROUP_KHR 0x200C typedef float float2[2]; typedef float float4[4]; #include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <string> #include <ctime> #include <math.h> const size_t problemSize = 1048576; class CL_test { public: CL_test(); ~CL_test(); void setupCL(); void setupCLKernels(); void runCLKernels(); cl_float random(cl_float, cl_float); inline void checkErr(cl_int, const char*); cl_context cl_c; cl_device_id *device; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem cl_input; cl_mem cl_input_grav; cl_mem cl_out; cl_mem cl_ltime; cl_mem cl_stime; cl_int err_code; float4 input[problemSize]; cl_float ltime[problemSize]; cl_float output[problemSize]; cl_float stime[problemSize]; float2 grav_time[problemSize]; GLuint vbo[1]; GLuint vbo2[1]; protected: void allocateMemory(); private: }; //cl_test.cpp//////////////////////////////////////////////////////////// #include "cl_test.hpp" const char kernel_source[] = "#include \"lesson1_kernels.cl\""; CL_test::CL_test(){ }; CL_test::~CL_test(){ #if defined (_WIN32) _aligned_free(output); _aligned_free(input); #else free(output); free(input2); free(input); #endif } inline void CL_test::checkErr(cl_int err, const char *name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "("<< err <<")" << std::endl; //exit(EXIT_FAILURE); } } cl_float CL_test::random(cl_float low, cl_float high){ cl_float range = high - low; cl_float multiplier = ( cl_float ) rand() / RAND_MAX; return range * multiplier + low; } void CL_test::setupCL(){ cl_platform_id platform; err_code = clGetPlatformIDs(1, &platform, NULL); if(err_code) { printf("Unable get platform %d\n", err_code); exit(1); } cl_context_properties cl_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0}; cl_c = clCreateContextFromType(cl_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &err_code); if(err_code) { printf("Unable create context %d\n", err_code); exit(1); } //clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); //cl_device_id *device; size_t numDevices; clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, NULL, NULL, &numDevices); device = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, numDevices * sizeof(cl_device_id), device, 0); queue = clCreateCommandQueue(cl_c, device[0], CL_QUEUE_PROFILING_ENABLE, &err_code); if(err_code) { printf("Unable create command queue %d\n", err_code); exit(1); } } void CL_test::setupCLKernels(){ //read __kernels size_t source_len[] = { sizeof(kernel_source)-1 }; const char *s[] = {kernel_source , 0}; program = clCreateProgramWithSource(cl_c, 1, s, source_len, &err_code); if(err_code) { printf("Unable create program %d\n", err_code); exit(1); } err_code = clBuildProgram(program, 1, device, "-I.", NULL, NULL); if(err_code) { printf("Unable build program %d\n", err_code); char log[10000]; clGetProgramBuildInfo(program, device[0], CL_PROGRAM_BUILD_LOG, 10000, log, NULL); printf("BUILD LOG:\n%s", log); system("pause"); exit(1); } kernel = clCreateKernel(program, "particle", &err_code); // glFinish(); allocateMemory(); err_code = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_out); checkErr(err_code,"clSetKernelArg 2"); err_code = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input); checkErr(err_code,"clSetKernelArg 0"); err_code = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_input_grav); checkErr(err_code,"clSetKernelArg 1"); // err_code = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_ltime); // checkErr(err_code,"clSetKernelArg 3"); // err_code = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_stime); // checkErr(err_code,"clSetKernelArg 4"); } void CL_test::runCLKernels(){ //std::cout<<"!"; glFinish(); // size_t gsize = 0; // err_code = clGetKernelWorkGroupInfo(kernel, // device[0], // CL_KERNEL_WORK_GROUP_SIZE, // sizeof(size_t), // &gsize, // NULL); // if (err_code != CL_SUCCESS) { // fprintf(stderr, "Failed to get OpenCL kernel work group size info: %d\n", err_code); // exit(-1); // } // // // // unsigned int workGroupSize = (unsigned int) gsize; cl_event myEvent; cl_ulong startTime, endTime; size_t globalwork[] = {problemSize,1,1}; size_t localwork[] = {16,1,1}; err_code = clEnqueueAcquireGLObjects(queue, 1, &cl_out, 0, NULL, NULL); checkErr(err_code,"clEnqueueAcquireGLObjects"); err_code = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalwork,localwork, 0, NULL, &myEvent); checkErr(err_code,"clEnqueueNDRangeKernel"); clFinish(queue); err_code = clEnqueueReleaseGLObjects(queue, 1, &cl_out, 0, NULL, NULL); checkErr(err_code,"clEnqueueReleaseGLObjects"); clFinish(queue); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,NULL); cl_ulong elapsedTime = endTime-startTime; std::cout<<elapsedTime/1000000<<"ms"<<"\n"; } void CL_test::allocateMemory(){ cl_input = clCreateBuffer( cl_c, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) *4* problemSize, &input, &err_code); checkErr(err_code,"clCreateBuffer"); cl_input_grav = clCreateBuffer( cl_c, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) *2* problemSize, &grav_time[0], &err_code); checkErr(err_code,"clCreateBuffer"); // // cl_ltime = clCreateBuffer( // cl_c, // CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, // sizeof(cl_float) * problemSize, // <ime[0], // &err_code); // checkErr(err_code,"clCreateBuffer"); // // cl_stime = clCreateBuffer( // cl_c, // CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, // sizeof(float) * problemSize, // &stime[0], // &err_code); // checkErr(err_code,"clCreateBuffer"); // cl_out = clCreateBuffer( // cl_c, // CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, // sizeof(cl_float) * problemSize, // output, // &err_code); // checkErr(err_code,"clCreateBuffer"); cl_out = clCreateFromGLBuffer(cl_c,CL_MEM_READ_WRITE, vbo[0],&err_code); checkErr(err_code,"createFromOpenGLBuffer"); // cl_stime = clCreateFromGLBuffer(cl_c,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, vbo2[0],&err_code); // checkErr(err_code,"createFromOpenGLBuffer"); // err_code = clEnqueueAcquireGLObjects(queue, 1, &cl_stime, 0, NULL, NULL); // checkErr(err_code,"clEnqueueAcquireGLObjects"); } //main.cpp////////////////////////////////////////////////////// #include "cl_test.hpp" CL_test test; cl_float up=0.0; int kerneltime=10; cl_float starttime; // rotation values for the navigation float rotation[3] = { 0.0, 0.0, 0.0 }; //----------------------------------------------------------------------------- // parameters for the navigation // position of the mouse when pressed int mousePressedX = 0, mousePressedY = 0; // mouse button states int leftMouseButtonActive = 0; //----------------------------------------------------------------------------- void mouseFunc(int button, int state, int x, int y) { // get the mouse buttons if (button == GLUT_LEFT_BUTTON && state == GLUT_DOWN) leftMouseButtonActive += 1; else leftMouseButtonActive -= 1; mousePressedX = x; mousePressedY = y; } //----------------------------------------------------------------------------- void mouseMotionFunc(int x, int y) { // rotation if (leftMouseButtonActive) { rotation[0] += ((mousePressedY - y)); rotation[1] += ((mousePressedX - x)); mousePressedY = y; mousePressedX = x; } } //----------------------------------------------------------------------------- void init(){ // for (int i=0; i<problemSize; ++i){ // test.output=0.0; // test.output[i+1]=test.random(0.0,0.5); // test.output[i+2]=0.0; // // } for (int i=0; i<problemSize; ++i){ starttime = test.random(0.5,1.0); test.ltime = starttime; starttime = test.random(0.5,0.9); test.stime = starttime; } for (int i=0; i<problemSize; ++i){ cl_float4 temp = {test.random(-0.1,0.1),test.random(0.2,0.4),test.random(-0.1,0.1),test.random(0.1,1.0)}; test.input[0] = test.random(-0.1,0.1); test.input[1] = test.random(0.2,0.4); test.input[2] = test.random(-0.1,0.1); test.input[3] = test.random(0.5,1.0); test.grav_time[0] = 0.5; test.grav_time[1] = test.random(0.5,2.0); } glGenBuffers(1, test.vbo); glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) *4* problemSize, 0, GL_STREAM_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); // glGenBuffers(1, test.vbo2); // glBindBuffer(GL_ARRAY_BUFFER, test.vbo2[0]); // glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) * problemSize, test.stime, GL_DYNAMIC_COPY_ARB); // glBindBuffer(GL_ARRAY_BUFFER, 0); } void idlefunc(){ // test.updatekernel(up); glutPostRedisplay(); } void draw(){ glPointSize(1); glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glEnableClientState(GL_VERTEX_ARRAY); glVertexPointer(4, GL_FLOAT, 0, 0); glDrawArrays(GL_POINTS, 0, problemSize); glBindBuffer(GL_ARRAY_BUFFER, 0); glDisableClientState(GL_VERTEX_ARRAY); } void displayfunc() { glClearColor(0.0 ,0.0, 0.0, 0.0); glClear(GL_COLOR_BUFFER_BIT); // glClear(GL_DEPTH_BUFFER_BIT); glColor3f(1.0f,1.0f,0.0f); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(0.0,-1.0, -100.0); glRotatef( rotation[0], 1.0f, 0.0f, 0.0f ); glRotatef( rotation[1], 0.0f, 1.0f, 0.0f ); test.runCLKernels(); draw(); glutSwapBuffers(); } void reShape(int w,int h) { glViewport(0, 0, w, h); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(33.0, 1.33, 1.0, 1000.0); //glTranslatef(-10.0,-10.0,-400.0); //gluLookAt (0.0, 0.0, -2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0); } int main(int argc,char** argv) { glutInit(&argc, argv); glutInitWindowSize (640, 480); glutInitWindowPosition(100, 100); glutInitDisplayMode (GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); glutCreateWindow("GLEW Test"); glutMouseFunc (mouseFunc); glutMotionFunc (mouseMotionFunc); glutDisplayFunc(displayfunc); glutReshapeFunc(reShape); glutIdleFunc(idlefunc); test.setupCL(); init(); test.setupCLKernels(); glutMainLoop(); return 0; }
chevydevil,
Could you please give us OS details? Are you running 32bit or 64 bit?
I'm running it with VS08 on Win7 x64 machine but in win32 mode. I think VS08 produces a strange overhead or something.
Edit: Okay. It's the debug information. Run without debugging works fine. What I still don't get is that the same project with a win 7 x64 machine and the gtx285 runs fine with the debug info created.
Originally posted by: chevydevil I'm running it with VS08 on Win7 x64 machine but in win32 mode. I think VS08 produces a strange overhead or something.
Could you please run on 64 bit mode? It is a know issue that there is huge performance drop when application running 32 bit mode.
Wonder what is result performance? Though it may also be affected by OS issues.
chevydevil,
Your local work size is 16. Local work size must be a multiple of 64 to get any good performance using ati hardware.
Of course 16 is no good idea but i was desperate. In fact 128 is the best I think in this case on my machine. I compiled it now in 64bit and it makes no difference. But as I said without debugging, the performance is fine.