AnsweredAssumed Answered

AMD's OCL global work size with 2d work dimensions limit

Question asked by pontiacgtx on Jul 16, 2020
Latest reply on Jul 17, 2020 by pontiacgtx

I have found a problem when executing a kernel: the second dimension of the work-items, get_global_id(1), gets limited to around 120, while the first dimension executes every work-item up to the maximum set for the first dimension on the host. I wonder if there is some kind of limit for each work dimension (some length limit).

 

 

 

 

For testing this, I just printed the second dimension from the kernel:

 

/*
 * Debug version of the interpolation kernel: every work-item prints its
 * global index in the second NDRange dimension, one value per line.
 *
 * source, target, width, height, ratio, limit and originalHeight are kept
 * so the host-side clSetKernelArg calls still match; this stripped-down
 * body does not read them.
 */
__kernel void Interpolation(__global struct Color* source,__global struct Color* target,uint64 width,uint64 height,uint64 ratio,uint64 limit, uint64 originalHeight)
{
    /* Global indices of this work-item; not used yet in this debug body. */
    __private fp32 wIndex = (int64)get_global_id(0);
    __private fp32 hIndex = (int64)get_global_id(1);

    /*
     * get_global_id() returns size_t, which does not match "%d" when
     * size_t is 64-bit. Cast to ulong and print with "%lu" so the
     * argument type and the conversion specifier agree. The trailing
     * newline keeps the ids of different work-items from running
     * together into one digit stream.
     */
    printf("%lu\n", (ulong)get_global_id(1));
}

 

the host looks like

 

   

// Raw host pointers to the input pixels and to the output pixels; both are
// handed to OpenCL below as the backing host memory of the two buffers.
void* source = imageObj->originalPixels->data();
        void* target = imageObj->processedPixels->data();



     

        // Read-only input buffer created with CL_MEM_USE_HOST_PTR over `source`,
        // sized as SourceLength() elements of Color.
        cl_mem originalPixelsBuffer = clCreateBuffer(p1.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->SourceLength(), source, &p1.status);
        CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 0");


        // Read/write output buffer created with CL_MEM_USE_HOST_PTR over `target`,
        // sized as OutputLength() elements of Color.
        cl_mem targetBuffer = clCreateBuffer(p1.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->OutputLength(), target, &p1.status);
        CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 1");





        // Kernel handle for the "Interpolation" kernel; CreateKernel is a project
        // helper that presumably fills in the handle from p1.program -- confirm.
        cl_kernel interpolationKernel = NULL;
        p1.CreateKernel(interpolationKernel, "Interpolation");

   



       p1.status = clSetKernelArg(interpolationKernel, 0, sizeof(cl_mem), (void*)&originalPixelsBuffer);
      
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex], "It Couldn't set Argument 0 for kernel");
        p1.status = clSetKernelArg(interpolationKernel, 1, sizeof(cl_mem), (void*)&targetBuffer);
      
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex], "It Couldn't set Argument 1 for kernel");
        p1.status = clSetKernelArg(interpolationKernel, 2, sizeof(width), (void*)&width);
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex], "It Couldn't set Argument 2 for kernel");
        p1.status = clSetKernelArg(interpolationKernel, 3, sizeof(height), (void*)&height);
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex], "It Couldn't set Argument 3 for kernel);
        p1.status = clSetKernelArg(interpolationKernel, 4, sizeof(ratio), (void*)&ratio);
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex], "It Couldn't set Argument 4 for kernel);
        p1.status = clSetKernelArg(interpolationKernel, 5, sizeof(limit), &limit);
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex], "It Couldn't set Argument 5 for kernel);
        p1.status = clSetKernelArg(interpolationKernel, 6, sizeof(orgHeight), &orgHeight);
        CheckErrorCode(p1.status, p1.program, p1.devices[p1.deviceIndex],"It Couldn't set Argument 6 for kernel);



        StartTimer();
        ////Pass the buffer to the GPU to read


        p1.status = clEnqueueWriteBuffer(p1.commandQueue, originalPixelsBuffer, CL_FALSE, 0, sizeof(Color) * imageObj->SourceLength(), source, 0, NULL, NULL);
        CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 0");

        ////Pass the buffer to the GPU to write

        p1.status = clEnqueueWriteBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
        CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");

                                       
        size_t  globalWorkSize[2] = { imageObj->originalWidth * 4, imageObj->originalHeight * 4 };
        size_t localWorkSize[2]{ 64,64 };
        SetLocalWorkSize(IsDivisibleBy64(localWorkSize[0]), localWorkSize);
       
        p1.status = clEnqueueNDRangeKernel(p1.commandQueue, interpolationKernel, 2, NULL, globalWorkSize, IsDisibibleByLocalWorkSize(globalWorkSize, localWorkSize) ? localWorkSize : NULL, 0, NULL, NULL);
        CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to clEnqueueDRangeKernel");

        //size_t  globalWorkSize[1] = { imageObj->originalWidth * imageObj->originalHeight };




        p1.status = clEnqueueReadBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
        CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 2");

attached the source code from a visual studio project file

 

 

Now I wonder how the code would change if I used SVM instead of the cl_mem model from OpenCL 1.2. Would it change how the local work size determines the work-group size? If not, what can I do to make it work with this global work size with 2 dimensions?

Attachments

Outcomes