7 Replies Latest reply on Oct 9, 2013 8:35 AM by himanshu.gautam

    Any hope of this running better?

    swoop

      I have an OpenCL kernel which reads from an OpenGL texture (the texture is a render target attached to a framebuffer object). Once the texture is rendered to, I need to run my kernel to calculate another texture for the next render step, where both will be used together. Everything works correctly — I read back results that look correct — but it kills my performance. I was hoping for performance that would rival that of GLSL shaders; I'm wondering if I'm doing something wrong.

       

      Kernel

       

          // OpenCL C source for the "Lum" kernel: converts each texel of the
          // shared (GL interop) RGBA image to an 8-bit luma value using
          // Rec.601 weights and stores it in a linear byte buffer, flipping
          // the row index so the output is in top-down order.
          const char* source =
          // Required so the kernel may store individual uchar elements.
          "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n"
          "\n"       
          // Rec.601 luma weights; the alpha channel contributes nothing.
          "__constant float4 kToLum = (float4)(0.299f, 0.587f, 0.114f, 0.0f); \n"
          "\n"
          "__kernel void Lum(__read_only image2d_t readImage2D, __global uchar* lumHist)\n"
          "{\n"
          "   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; \n"
          // One work-item per texel: global ids (0,1) are the (s,t) pixel coords.
          "   int ix_s = get_global_id(0); \n"
          "   int iy_t = get_global_id(1); \n"
          "   int2 ixy_st = (int2)(ix_s, iy_t); \n"
          "\n"
          "   float4 color = read_imagef(readImage2D, sampler, ixy_st); \n"
          // Weighted sum gives luma in [0,1]; scale to [0,255] and round
          // to nearest even.
          "   float lum = dot(color, kToLum) * 255.0f; \n"
          "   uchar index = convert_uchar_rte(lum); \n"
          "\n"
          // Flip the row: GL texture origin is bottom-left, but the output
          // buffer is consumed top-down.
          "   int imgHeight = get_image_height(readImage2D);  \n"
          "   iy_t = (imgHeight - 1) - iy_t;  \n"
          "\n"       
          "   int imgWidth = get_image_width(readImage2D);  \n"
          "   lumHist[ix_s + (iy_t * imgWidth)] = index; \n"
          "}\n";

       

      Setup

       

      // Creates the CL side of the interop pipeline:
      //  - wraps the GL texture (an FBO color attachment) as a read-only
      //    cl_mem image, and
      //  - allocates a host byte buffer that receives the per-pixel luma
      //    indices written by the kernel.
      // Fix: the original ignored the `error` out-parameter of both create
      // calls; a failure there would leave invalid cl_mem handles that make
      // every later enqueue fail silently.
      void _CreateBuffer(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
      {
          cl_int error = CL_SUCCESS;
          GLint mipLevel = 0;

          // Share the FBO color attachment with OpenCL (no copy happens here).
          cl_mem_flags memFlags = CL_MEM_READ_ONLY;
          oclb.readBufferIn_0 = clCreateFromGLTexture2D(oclrt.context, memFlags, GL_TEXTURE_2D, mipLevel, oclb.tex2DID, &error);
          if (error != CL_SUCCESS) {
              return;     // invalid interop image; do not build the rest of the pipeline
          }

          int bufferLen = oclb.texWidth * oclb.texHeight;
          oclb.bufferSize = bufferLen * sizeof(oclLumBuffer_t::bufferType_t);
          oclb.indexBuffer = new oclLumBuffer_t::bufferType_t[bufferLen];
          MemClr(&oclb.indexBuffer[0], oclb.bufferSize);

          // NOTE(review): CL_MEM_USE_HOST_PTR requires the runtime to keep
          // this app allocation coherent; on some drivers
          // CL_MEM_ALLOC_HOST_PTR + clEnqueueMapBuffer is the faster path
          // and may explain part of the slow readback reported below.
          memFlags = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR;     // write to app memory
          oclb.writeBufferOut_0 = clCreateBuffer(oclrt.context, memFlags, oclb.bufferSize, &oclb.indexBuffer[0], &error);
          if (error != CL_SUCCESS) {
              return;
          }
      }

       

      // Binds the two kernel arguments:
      //   arg 0 = readImage2D (the shared GL texture image)
      //   arg 1 = lumHist     (the output byte buffer)
      void _SetBuffer(oclLumBuffer_t& oclb, const oclProgram_t& oclp)
      {
          clSetKernelArg(oclp.kernel, 0, sizeof(oclb.readBufferIn_0), &oclb.readBufferIn_0);
          clSetKernelArg(oclp.kernel, 1, sizeof(cl_mem), (void*)&oclb.writeBufferOut_0);
      }

       

      Run kernel

       

      // Transfers ownership of the shared GL texture to OpenCL so the kernel
      // may read it. GL rendering into the texture must be finished before
      // this call.
      void _AquireGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
      {
          const cl_uint objectCount = 1;
          cl_int status = clEnqueueAcquireGLObjects(oclrt.deviceCmdQueue, objectCount, &oclb.readBufferIn_0,
                                                    0 /*numEventsInWaitList*/, NULL /*eventWaitList*/, NULL /*event*/);
          (void)status;
      }

       

      // Returns ownership of the shared texture to OpenGL once the kernel no
      // longer needs it; GL may then sample it in the next render stage.
      void _ReleaseGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
      {
          const cl_uint objectCount = 1;
          cl_int status = clEnqueueReleaseGLObjects(oclrt.deviceCmdQueue, objectCount, &oclb.readBufferIn_0,
                                                    0 /*numEventsInWaitList*/, NULL /*eventWaitList*/, NULL /*event*/);
          (void)status;
      }

       

      void _RunCompute(oclProgram_t& oclp, oclLumBuffer_t& oclb, const oclrt_t& oclrt)

      {

          ///

          //glFlush();

          //glFinish();

          ///

       

          _AquireGL(oclb, oclrt);

       

          cl_uint workDim = 2;

          const size_t* globalWorkOffset = NULL;

          const size_t globalWorkSize[] = {oclb.texWidth, oclb.texHeight};        // of length "work_dim"

          //const size_t localWorkSize[] = {16, 16};

          const size_t* localWorkSize = NULL;

       

          cl_uint numEventsInWaitList = 0;

          const cl_event* eventWaitList = NULL;

          cl_int error = clEnqueueNDRangeKernel(oclrt.deviceCmdQueue, oclp.kernel, workDim, globalWorkOffset, globalWorkSize, localWorkSize,

                                                                                               numEventsInWaitList, eventWaitList, &oclp.kernelEvent);

         

          clFlush(oclrt.deviceCmdQueue);

       

          {

              cl_int status = CL_SUCCESS;

              cl_int eventStatus = CL_QUEUED;

              while(eventStatus != CL_COMPLETE) {

                  status = clGetEventInfo(oclp.kernelEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL);

              }

       

              status = clReleaseEvent(oclp.kernelEvent);

          }

       

          _ReleaseGL(oclb, oclrt);

       

          clFlush(oclrt.deviceCmdQueue);

       

          int bufferByteLen = oclb.bufferSize;

          void* buffer = &oclb.indexBuffer[0];

       

          cl_bool blockingRead = CL_TRUE;

          size_t readOffset = 0;

          error = clEnqueueReadBuffer(oclrt.deviceCmdQueue, oclb.writeBufferOut_0, blockingRead, readOffset, bufferByteLen, buffer, 0, NULL, NULL);

      }

       

      Sorry for not using the formatting; it lags and doesn't work very well.

        • Re: Any hope of this running better?
          swoop

          I looked into the timing, apparently the bulk of the time is spent in my _Aquire() call where clEnqueueAcquireGLObjects() is the only thing being called. What would be the reason for clEnqueueAcquireGLObjects taking so long? The call to clEnqueueReadBuffer takes a while too but it still only takes about 1/4 the time

          my acquire call takes. Is there anything that can be done to improve this?

           

          Most of my FBO textures are rendered to and then used immediately in the next stage, but sticking this compute kernel in between two stages is hammering my performance. I'm using Catalyst 13.4 on a 5870.

            • Re: Any hope of this running better?
              himanshu.gautam

              That is interesting to know. Will it be possible for you to create a small testcase, that I can try running at our end.

              Also as I am not very familiar with CL_GL Interop land, it will help, if you can let me know how NVIDIA is performing for the same code. If nvidia is performing better , I can forward these results to relevant AMD engineers.

                • Re: Any hope of this running better?
                  swoop

                  I ran some timing on an NVIDIA GeForce 670; although it's a newer-generation card than my AMD 5870, the time differences are rather large.

                   

                                                     AMD          NV

                  Aquire GL                   11.6ms          0.034ms

                  Enqueue kernel           0.004ms        0.013ms

                  Release GL                 0.002ms       0.02ms

                  Enqueue read              5.4ms          0.76ms

                   

                  Unfortunately I can't strip this down to a small testcase; it was attempted as an addition to a larger project. But I can try to give as many details as possible.

                   

                  - Using 2 FBOs

                  - FBO #1, with 3 x 16bit RGBA and one 32bit float depth attached (1 of the 16bit RGBAs is shared, the depth is also shared)

                  - FBO #2, 4 x 16bit RGBA attached (1 shared with FBO 1)

                  - Rendering is done to FBO #1

                  - Then rendering is done to FBO #2, two of the texture attachments are rendered to (MRT, 1 of which is the shared 16bit)

                  - once that is done, these two are used as inputs to render to the third

                  - at this point the, once the third is rendered to, the kernel above is run, this third texture is the GL input to the kernel

                  - - this input texture (the 3rd attachment of FBO #2) is 16bit float RGBA which may be worth noting? conversion of some sort happening?

                  - - although the cl type CL_HALF_FLOAT should handle the channel type

                  - this third texture goes on as an input when rendering to the fourth attachment (with or without the results from the kernel, depending on if I have it enabled)

                   

                  the first bold line above is where the performance hit happens

                    • Re: Any hope of this running better?
                      himanshu.gautam

                      Well, What is the source of these timing information you have shared? cl_events may not be very reliable specially in case of AMD.

                      What catalyst driver and NVIDIA driver you had used for comparison? System configuration of the systems in which the GPUs were put?

                       

                      One suggestion I can give is to check the performance of SimpleGL sample (from APP SDK or its equivalent from NVIDIA SDK), on both these cards. That code can then be used as a testcase.

                        • Re: Any hope of this running better?
                          swoop

                          The timings come from running a continuous loop and using hi res timer to check the start and end times of the individual code sections. The original problem was noticed while running the visual studio profiler, and the large performance hit that is taken on the amd system by inserting this kernel and accessing the fbo texture.

                           

                          The AMD System is 4gb ram, Phenom II X4 955 3.2ghz, running 5870 on cat 13.4, win7 pro 64

                          The NV System is 16gb ram, i7-3770 3.4ghz, running GTX 670 on 320.49, win7 ultimate 64

                           

                          The SimpleGL example seems to run fine, but it accesses a GL vertex buffer, not a GL texture attached to a framebuffer object. The NVIDIA OpenCL samples have a demo showing a kernel accessing a GL texture, but it uses pixel buffer objects rather than framebuffer objects.