AnsweredAssumed Answered

Any hope of this running better?

Question asked by swoop on Jul 27, 2013
Latest reply on Oct 9, 2013 by himanshu.gautam

I have an OpenCL kernel which reads from an OpenGL texture (the texture is a render target attached to a framebuffer object). Once the texture has been rendered to, I need to run my kernel to calculate another texture for the next render step, where both will be used together. Everything works correctly and the results I read back look correct, but it kills my performance. I was hoping for performance that would rival that of GLSL shaders, so I'm wondering if I'm doing something wrong.

 

Kernel

 

    // OpenCL C program text containing the single kernel `Lum`.
    //
    // Per work-item, Lum:
    //   1. reads one texel of `readImage2D` at integer pixel coordinates
    //      (get_global_id(0), get_global_id(1)) through a sampler that uses
    //      unnormalized coordinates, clamp-to-edge addressing and nearest
    //      filtering;
    //   2. computes dot(color, (0.299, 0.587, 0.114, 0)) * 255 and converts it
    //      to a uchar with round-to-nearest-even (convert_uchar_rte);
    //   3. stores that byte in `lumHist` at x + (imgHeight - 1 - y) * imgWidth,
    //      i.e. the output rows are written in the opposite vertical order to
    //      the image rows.
    //
    // The pragma enables cl_khr_byte_addressable_store, which the kernel relies
    // on for its per-byte writes into `lumHist`.
    //
    // NOTE(review): the kernel has no bounds check, so it presumably expects the
    // global work size to equal the image dimensions and `lumHist` to hold at
    // least imgWidth * imgHeight bytes -- confirm against the host code.
    // NOTE(review): convert_uchar_rte is the non-saturating variant; if the
    // texture can hold values outside [0, 1] (e.g. a float render target),
    // convert_uchar_sat_rte may be what is wanted -- confirm the texture format.
    const char* source =
    "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable \n"
    "\n"       
    "__constant float4 kToLum = (float4)(0.299f, 0.587f, 0.114f, 0.0f); \n"
    "\n"
    "__kernel void Lum(__read_only image2d_t readImage2D, __global uchar* lumHist)\n"
    "{\n"
    "   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; \n"
    "   int ix_s = get_global_id(0); \n"
    "   int iy_t = get_global_id(1); \n"
    "   int2 ixy_st = (int2)(ix_s, iy_t); \n"
    "\n"
    "   float4 color = read_imagef(readImage2D, sampler, ixy_st); \n"
    "   float lum = dot(color, kToLum) * 255.0f; \n"
    "   uchar index = convert_uchar_rte(lum); \n"
    "\n"
    "   int imgHeight = get_image_height(readImage2D);  \n"
    "   iy_t = (imgHeight - 1) - iy_t;  \n"
    "\n"       
    "   int imgWidth = get_image_width(readImage2D);  \n"
    "   lumHist[ix_s + (iy_t * imgWidth)] = index; \n"
    "}\n";

 

Setup

 

// Creates the two OpenCL memory objects stored in `oclb`:
//   - readBufferIn_0:   a read-only CL image wrapping mip level 0 of the GL
//                       2D texture `oclb.tex2DID`;
//   - writeBufferOut_0: a write-only CL buffer backed directly by the freshly
//                       allocated, zero-cleared host array `oclb.indexBuffer`
//                       (CL_MEM_USE_HOST_PTR).
// Also sets `oclb.bufferSize` to the byte size of that host array
// (texWidth * texHeight elements of oclLumBuffer_t::bufferType_t).
//
// The status codes returned by the two create calls are captured but not
// inspected, and the array allocated with new[] is not freed here.
void _CreateBuffer(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    typedef oclLumBuffer_t::bufferType_t element_t;

    cl_int status = 0;

    // Input: share the GL texture (base mip level) with OpenCL, read-only.
    const GLint baseMipLevel = 0;
    oclb.readBufferIn_0 = clCreateFromGLTexture2D(oclrt.context, CL_MEM_READ_ONLY, GL_TEXTURE_2D,
                                                  baseMipLevel, oclb.tex2DID, &status);

    // Host-side storage: one element per texel, cleared before use.
    const int elementCount = oclb.texWidth * oclb.texHeight;
    oclb.bufferSize = elementCount * sizeof(element_t);
    oclb.indexBuffer = new element_t[elementCount];
    MemClr(&oclb.indexBuffer[0], oclb.bufferSize);

    // Output: the kernel writes straight into the application's memory.
    const cl_mem_flags outputFlags = CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR;
    oclb.writeBufferOut_0 = clCreateBuffer(oclrt.context, outputFlags, oclb.bufferSize,
                                           &oclb.indexBuffer[0], &status);
}

 

// Binds the memory objects held in `oclb` to `oclp.kernel`:
// argument 0 <- readBufferIn_0, argument 1 <- writeBufferOut_0.
// (Presumably these are the image and output-buffer parameters of the Lum
// kernel -- confirm which kernel `oclp.kernel` was built from.)
// The clSetKernelArg return codes are not checked.
void _SetBuffer(oclLumBuffer_t& oclb, const oclProgram_t& oclp)
{
    const cl_uint inputArgIndex = 0;
    const cl_uint outputArgIndex = 1;

    clSetKernelArg(oclp.kernel, inputArgIndex, sizeof(oclb.readBufferIn_0), &oclb.readBufferIn_0);
    clSetKernelArg(oclp.kernel, outputArgIndex, sizeof(cl_mem), (void*)&oclb.writeBufferOut_0);
}

 

Run kernel

 

// Enqueues, on `oclrt.deviceCmdQueue`, the acquisition of the single GL-shared
// memory object `oclb.readBufferIn_0` for use by OpenCL.
// No wait list is supplied and no completion event is requested; the status
// code of the enqueue call is not inspected.
void _AquireGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    const cl_int status = clEnqueueAcquireGLObjects(oclrt.deviceCmdQueue,
                                                    1, &oclb.readBufferIn_0,   // one shared object
                                                    0, NULL,                   // nothing to wait on
                                                    NULL);                     // no event returned
    (void)status;
}

 

// Enqueues, on `oclrt.deviceCmdQueue`, the release of the single GL-shared
// memory object `oclb.readBufferIn_0` back to OpenGL.
// No wait list is supplied and no completion event is requested; the status
// code of the enqueue call is not inspected.
void _ReleaseGL(oclLumBuffer_t& oclb, const oclrt_t& oclrt)
{
    const cl_int status = clEnqueueReleaseGLObjects(oclrt.deviceCmdQueue,
                                                    1, &oclb.readBufferIn_0,   // one shared object
                                                    0, NULL,                   // nothing to wait on
                                                    NULL);                     // no event returned
    (void)status;
}

 

void _RunCompute(oclProgram_t& oclp, oclLumBuffer_t& oclb, const oclrt_t& oclrt)

{

    ///

    //glFlush();

    //glFinish();

    ///

 

    _AquireGL(oclb, oclrt);

 

    cl_uint workDim = 2;

    const size_t* globalWorkOffset = NULL;

    const size_t globalWorkSize[] = {oclb.texWidth, oclb.texHeight};        // of length "work_dim"

    //const size_t localWorkSize[] = {16, 16};

    const size_t* localWorkSize = NULL;

 

    cl_uint numEventsInWaitList = 0;

    const cl_event* eventWaitList = NULL;

    cl_int error = clEnqueueNDRangeKernel(oclrt.deviceCmdQueue, oclp.kernel, workDim, globalWorkOffset, globalWorkSize, localWorkSize,

                                                                                         numEventsInWaitList, eventWaitList, &oclp.kernelEvent);

   

    clFlush(oclrt.deviceCmdQueue);

 

    {

        cl_int status = CL_SUCCESS;

        cl_int eventStatus = CL_QUEUED;

        while(eventStatus != CL_COMPLETE) {

            status = clGetEventInfo(oclp.kernelEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL);

        }

 

        status = clReleaseEvent(oclp.kernelEvent);

    }

 

    _ReleaseGL(oclb, oclrt);

 

    clFlush(oclrt.deviceCmdQueue);

 

    int bufferByteLen = oclb.bufferSize;

    void* buffer = &oclb.indexBuffer[0];

 

    cl_bool blockingRead = CL_TRUE;

    size_t readOffset = 0;

    error = clEnqueueReadBuffer(oclrt.deviceCmdQueue, oclb.writeBufferOut_0, blockingRead, readOffset, bufferByteLen, buffer, 0, NULL, NULL);

}

 

Sorry for not using the formatting; it lags and doesn't work very well.

Outcomes