cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

chevydevil
Adept II

Performance Issues

Hello. My program attached below runs on a gtx285 very well. Using my Radeon 5870 the kernel execution time is about 10 times higher and so I am getting instead of 150 fps on the gtx arround 20 fps on the radeon. I also get an error when I add the input values directly to the output instead of declaring 3 floats a,b,c in the kernel. Can anyone tell me whats wrong?

 

Edit: I ran it via the Stream Profiler and it speeds up. But why? How?

Edit2: Running the executable speeds it up by factor 10. This is the performance i expected. So I think its solved.

//lesson1_kernels.cl///////////////////////////////////////////// __kernel void particle( __global float4* output, __global float4* stime, __global float2* grav_time ) { int id = get_global_id(0); // __global float4* stime = &_stime[id]; grav_time[id].y += 0.002; float a = stime[id].x+stime[id].w; float b = stime[id].y+(-grav_time[id].x+stime[id].w); float c = stime[id].z+stime[id].w; output[id].x += a; output[id].y += b; output[id].z += c; output[id].w = 1.0; if(grav_time[id].y > stime[id].w) { output[id] = (float4)(0.0,0.0,0.0,1.0); grav_time[id].y = 0.0; } }; //cl_test.hpp////////////////////////////////////////////////// #include <utility> // #define __NO_STD_VECTOR // Use cl::vector and cl::string and // #define __NO_STD_STRING // not STL versions, more on this later #include <gl/glee.h> #include <gl/Gl.h> #include <gl/glu.h> #include <gl/freeglut.h> #include <CL/cl.h> #include <cl/cl_gl.h> // #define CL_GL_CONTEXT_KHR 0x2008 // #define CL_EGL_DISPLAY_KHR 0x2009 // #define CL_GLX_DISPLAY_KHR 0x200A // #define CL_WGL_HDC_KHR 0x200B // #define CL_CGL_SHAREGROUP_KHR 0x200C typedef float float2[2]; typedef float float4[4]; #include <cstdio> #include <cstdlib> #include <fstream> #include <iostream> #include <string> #include <ctime> #include <math.h> const size_t problemSize = 1048576; class CL_test { public: CL_test(); ~CL_test(); void setupCL(); void setupCLKernels(); void runCLKernels(); cl_float random(cl_float, cl_float); inline void checkErr(cl_int, const char*); cl_context cl_c; cl_device_id *device; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem cl_input; cl_mem cl_input_grav; cl_mem cl_out; cl_mem cl_ltime; cl_mem cl_stime; cl_int err_code; float4 input[problemSize]; cl_float ltime[problemSize]; cl_float output[problemSize]; cl_float stime[problemSize]; float2 grav_time[problemSize]; GLuint vbo[1]; GLuint vbo2[1]; protected: void allocateMemory(); private: }; //cl_test.cpp//////////////////////////////////////////////////////////// #include "cl_test.hpp" const char kernel_source[] = "#include \"lesson1_kernels.cl\""; CL_test::CL_test(){ }; CL_test::~CL_test(){ #if defined (_WIN32) _aligned_free(output); _aligned_free(input); #else free(output); free(input2); free(input); #endif } inline void CL_test::checkErr(cl_int err, const char *name){ if (err != CL_SUCCESS) { std::cerr << "ERROR: " << name << "("<< err <<")" << std::endl; //exit(EXIT_FAILURE); } } cl_float CL_test::random(cl_float low, cl_float high){ cl_float range = high - low; cl_float multiplier = ( cl_float ) rand() / RAND_MAX; return range * multiplier + low; } void CL_test::setupCL(){ cl_platform_id platform; err_code = clGetPlatformIDs(1, &platform, NULL); if(err_code) { printf("Unable get platform %d\n", err_code); exit(1); } cl_context_properties cl_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0}; cl_c = clCreateContextFromType(cl_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &err_code); if(err_code) { printf("Unable create context %d\n", err_code); exit(1); } //clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); //cl_device_id *device; size_t numDevices; clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, NULL, NULL, &numDevices); device = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); clGetContextInfo(cl_c, CL_CONTEXT_DEVICES, numDevices * sizeof(cl_device_id), device, 0); queue = clCreateCommandQueue(cl_c, device[0], CL_QUEUE_PROFILING_ENABLE, &err_code); if(err_code) { printf("Unable create command queue %d\n", err_code); exit(1); } } void CL_test::setupCLKernels(){ //read __kernels size_t source_len[] = { sizeof(kernel_source)-1 }; const char *s[] = {kernel_source , 0}; program = clCreateProgramWithSource(cl_c, 1, s, source_len, &err_code); if(err_code) { printf("Unable create program %d\n", err_code); exit(1); } err_code = clBuildProgram(program, 1, device, "-I.", NULL, NULL); if(err_code) { printf("Unable build program %d\n", err_code); char log[10000]; clGetProgramBuildInfo(program, device[0], CL_PROGRAM_BUILD_LOG, 10000, log, NULL); printf("BUILD LOG:\n%s", log); system("pause"); exit(1); } kernel = clCreateKernel(program, "particle", &err_code); // glFinish(); allocateMemory(); err_code = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_out); checkErr(err_code,"clSetKernelArg 2"); err_code = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input); checkErr(err_code,"clSetKernelArg 0"); err_code = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cl_input_grav); checkErr(err_code,"clSetKernelArg 1"); // err_code = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_ltime); // checkErr(err_code,"clSetKernelArg 3"); // err_code = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_stime); // checkErr(err_code,"clSetKernelArg 4"); } void CL_test::runCLKernels(){ //std::cout<<"!"; glFinish(); // size_t gsize = 0; // err_code = clGetKernelWorkGroupInfo(kernel, // device[0], // CL_KERNEL_WORK_GROUP_SIZE, // sizeof(size_t), // &gsize, // NULL); // if (err_code != CL_SUCCESS) { // fprintf(stderr, "Failed to get OpenCL kernel work group size info: %d\n", err_code); // exit(-1); // } // // // // unsigned int workGroupSize = (unsigned int) gsize; cl_event myEvent; cl_ulong startTime, endTime; size_t globalwork[] = {problemSize,1,1}; size_t localwork[] = {16,1,1}; err_code = clEnqueueAcquireGLObjects(queue, 1, &cl_out, 0, NULL, NULL); checkErr(err_code,"clEnqueueAcquireGLObjects"); err_code = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalwork,localwork, 0, NULL, &myEvent); checkErr(err_code,"clEnqueueNDRangeKernel"); clFinish(queue); err_code = clEnqueueReleaseGLObjects(queue, 1, &cl_out, 0, NULL, NULL); checkErr(err_code,"clEnqueueReleaseGLObjects"); clFinish(queue); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,NULL); cl_ulong elapsedTime = endTime-startTime; std::cout<<elapsedTime/1000000<<"ms"<<"\n"; } void CL_test::allocateMemory(){ cl_input = clCreateBuffer( cl_c, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float) *4* problemSize, &input, &err_code); checkErr(err_code,"clCreateBuffer"); cl_input_grav = clCreateBuffer( cl_c, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) *2* problemSize, &grav_time[0], &err_code); checkErr(err_code,"clCreateBuffer"); // // cl_ltime = clCreateBuffer( // cl_c, // CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, // sizeof(cl_float) * problemSize, // &ltime[0], // &err_code); // checkErr(err_code,"clCreateBuffer"); // // cl_stime = clCreateBuffer( // cl_c, // CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, // sizeof(float) * problemSize, // &stime[0], // &err_code); // checkErr(err_code,"clCreateBuffer"); // cl_out = clCreateBuffer( // cl_c, // CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, // sizeof(cl_float) * problemSize, // output, // &err_code); // checkErr(err_code,"clCreateBuffer"); cl_out = clCreateFromGLBuffer(cl_c,CL_MEM_READ_WRITE, vbo[0],&err_code); checkErr(err_code,"createFromOpenGLBuffer"); // cl_stime = clCreateFromGLBuffer(cl_c,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, vbo2[0],&err_code); // checkErr(err_code,"createFromOpenGLBuffer"); // err_code = clEnqueueAcquireGLObjects(queue, 1, &cl_stime, 0, NULL, NULL); // checkErr(err_code,"clEnqueueAcquireGLObjects"); } //main.cpp////////////////////////////////////////////////////// #include "cl_test.hpp" CL_test test; cl_float up=0.0; int kerneltime=10; cl_float starttime; // rotation values for the navigation float rotation[3] = { 0.0, 0.0, 0.0 }; //----------------------------------------------------------------------------- // parameters for the navigation // position of the mouse when pressed int mousePressedX = 0, mousePressedY = 0; // mouse button states int leftMouseButtonActive = 0; //----------------------------------------------------------------------------- void mouseFunc(int button, int state, int x, int y) { // get the mouse buttons if (button == GLUT_LEFT_BUTTON && state == GLUT_DOWN) leftMouseButtonActive += 1; else leftMouseButtonActive -= 1; mousePressedX = x; mousePressedY = y; } //----------------------------------------------------------------------------- void mouseMotionFunc(int x, int y) { // rotation if (leftMouseButtonActive) { rotation[0] += ((mousePressedY - y)); rotation[1] += ((mousePressedX - x)); mousePressedY = y; mousePressedX = x; } } //----------------------------------------------------------------------------- void init(){ // for (int i=0; i<problemSize; ++i){ // test.output=0.0; // test.output[i+1]=test.random(0.0,0.5); // test.output[i+2]=0.0; // // } for (int i=0; i<problemSize; ++i){ starttime = test.random(0.5,1.0); test.ltime = starttime; starttime = test.random(0.5,0.9); test.stime = starttime; } for (int i=0; i<problemSize; ++i){ cl_float4 temp = {test.random(-0.1,0.1),test.random(0.2,0.4),test.random(-0.1,0.1),test.random(0.1,1.0)}; test.input[0] = test.random(-0.1,0.1); test.input[1] = test.random(0.2,0.4); test.input[2] = test.random(-0.1,0.1); test.input[3] = test.random(0.5,1.0); test.grav_time[0] = 0.5; test.grav_time[1] = test.random(0.5,2.0); } glGenBuffers(1, test.vbo); glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) *4* problemSize, 0, GL_STREAM_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); // glGenBuffers(1, test.vbo2); // glBindBuffer(GL_ARRAY_BUFFER, test.vbo2[0]); // glBufferData(GL_ARRAY_BUFFER, sizeof(cl_float) * problemSize, test.stime, GL_DYNAMIC_COPY_ARB); // glBindBuffer(GL_ARRAY_BUFFER, 0); } void idlefunc(){ // test.updatekernel(up); glutPostRedisplay(); } void draw(){ glPointSize(1); glBindBuffer(GL_ARRAY_BUFFER, test.vbo[0]); glEnableClientState(GL_VERTEX_ARRAY); glVertexPointer(4, GL_FLOAT, 0, 0); glDrawArrays(GL_POINTS, 0, problemSize); glBindBuffer(GL_ARRAY_BUFFER, 0); glDisableClientState(GL_VERTEX_ARRAY); } void displayfunc() { glClearColor(0.0 ,0.0, 0.0, 0.0); glClear(GL_COLOR_BUFFER_BIT); // glClear(GL_DEPTH_BUFFER_BIT); glColor3f(1.0f,1.0f,0.0f); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glTranslatef(0.0,-1.0, -100.0); glRotatef( rotation[0], 1.0f, 0.0f, 0.0f ); glRotatef( rotation[1], 0.0f, 1.0f, 0.0f ); test.runCLKernels(); draw(); glutSwapBuffers(); } void reShape(int w,int h) { glViewport(0, 0, w, h); glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(33.0, 1.33, 1.0, 1000.0); //glTranslatef(-10.0,-10.0,-400.0); //gluLookAt (0.0, 0.0, -2.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0); } int main(int argc,char** argv) { glutInit(&argc, argv); glutInitWindowSize (640, 480); glutInitWindowPosition(100, 100); glutInitDisplayMode (GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH); glutCreateWindow("GLEW Test"); glutMouseFunc (mouseFunc); glutMotionFunc (mouseMotionFunc); glutDisplayFunc(displayfunc); glutReshapeFunc(reShape); glutIdleFunc(idlefunc); test.setupCL(); init(); test.setupCLKernels(); glutMainLoop(); return 0; }

0 Likes
6 Replies
genaganna
Journeyman III

chevydevil,

 

       Could you please give us OS details? Are you running 32bit or 64 bit?

0 Likes

I'm running it with VS08 on Win7 x64 machine but in win32 mode. I think VS08 produces a strange overhead or something.

 

Edit: Okay. It's the debug information. Run without debugging works fine. What I still don't get is that the same project with a win 7 x64 machine and the gtx285 runs fine with the debug info created.

0 Likes

Originally posted by: chevydevil I'm running it with VS08 on Win7 x64 machine but in win32 mode. I think VS08 produces a strange overhead or something.

 

Could you please run on 64 bit mode?  It is a know issue that there is huge performance drop when application running 32 bit mode.

0 Likes

Wonder what is result performance? Though it may also be affected by OS issues.

0 Likes
aymankh
Journeyman III

chevydevil,

 Your local work size is 16. Local work size must be a multiple of 64 to get any good performance using ati hardware.

0 Likes

Of course 16 is no good idea but i was desperate. In fact 128 is the best I think in this case on my machine. I compiled it now in 64bit and it makes no difference. But as I said without debugging, the performance is fine.

0 Likes