6 Replies Latest reply on Nov 13, 2010 8:43 AM by himanshu.gautam

    query utilized local memory size return wrong value

    rotor
      CL_KERNEL_LOCAL_MEM_SIZE returns a zero value

      Hi all,

      I am trying to query the amount of local memory used by a kernel, but the query returns zero even though my kernel already uses 1 KB of local memory. A snippet of the code is attached below. What is the reason for this wrong value? Is it a bug?

       

      clSetKernelArg( kernel, 0, 512*sizeof(cl_ushort), NULL); //check used local memory status = clGetKernelWorkGroupInfo( kernel, glob_devices,//devices[0], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &usedLocalMemory, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed.")) { return SDK_FAILURE; }

        • query utilized local memory size return wrong value
          himanshu.gautam

          hi rotor,

          You can only allocate local memory. You cannot initialize it with a buffer or an array. Refer to the OpenCL spec for more details.

           

            • query utilized local memory size return wrong value
              rotor

              Hi Himanshu,

              Maybe the word "utilize" that I used here is not fully appropriate, but yes, I know that we can only allocate a local array. That is exactly what I did when calling:

                 clSetKernelArg(
                    kernel,
                    0,
                    512*sizeof(cl_ushort),
                    NULL);

              BTW, my question is not how to "utilize/allocate" local memory; my question is about querying the size of the used local memory via CL_KERNEL_LOCAL_MEM_SIZE, which returns zero for me. What I posted here is just for illustration, but even when I use the ATI SDK's samples, the same problem happens.

              Thanks

                • query utilized local memory size return wrong value
                  himanshu.gautam

                  hi rotor,

                  I tried it on my system and I am able to get a non-zero local memory size from the clGetKernelWorkGroupInfo API.

                  For a buffer of 512 floats I get 2048 as the memory allocated.

                  Can you also share the kernel code?

                  Please also post your system info: CPU, GPU, SDK, driver, OS.

                   

                    • query utilized local memory size return wrong value
                      rotor

                      Hi Himanshu,

                      Here are my system specs: Windows 7, 64-bit; ATI SDK 2.1; ATI 5870 card.

                      The simplest example of my problem is the Reduction application in the ATI SDK samples. When I run that application, I get a zero value returned. I copy here the Reduction::runCLKernels() routine that mainly does the job.

                      Reduction::runCLKernels() { cl_int status; cl_event events[1]; /* * This algorithm reduces each group of work-items to a single value * on OpenCL device and later each reduced items per group is further * reduced to a single value on CPU */ /* Declare temporary output buffer */ int numBlocks = length / (cl_int)groupSize; cl_uint* tempOut = NULL; #if defined (_WIN32) tempOut = (cl_uint*)_aligned_malloc(numBlocks * sizeof(cl_uint4), 16); #else tempOut = (cl_uint*)memalign(16, numBlocks * sizeof(cl_uint4)); #endif if(tempOut == NULL) { sampleCommon->error("Failed to allocate host memory. (tempOut)"); return SDK_FAILURE; } /* Create memory objects for temporary output array */ cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, numBlocks * sizeof(cl_uint4), tempOut, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateBuffer failed. (outputBuffer)")) { return SDK_FAILURE; } int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); std::cout << "Executing kernel for " << iterations << " iterations" << std::endl; std::cout << "-------------------------------------------" << std::endl; /* Run the kernel for a number of iterations */ for(int i = 0; i < iterations; i++) { /*** Set appropriate arguments to the kernel ***/ /* the input array */ status = clSetKernelArg(kernel, 0, sizeof(cl_mem),(void*)&inputBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (inputBuffer)")) { return SDK_FAILURE; } /* temporary output buffer */ status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&outputBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (outputBuffer)")) { return SDK_FAILURE; } /* local array */ status = clSetKernelArg(kernel, 2, groupSize * sizeof(cl_uint4), NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. 
(local memory)")) { return SDK_FAILURE; } /* * Enqueue a kernel run call. */ size_t globalThreads[] = {length}; size_t localThreads[] = {groupSize}; if(localThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize) { std::cout << "Unsupported: Device does not support" "requested number of work items."; return SDK_FAILURE; } status = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &usedLocalMemory, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetKernelWorkGroupInfo" " CL_KERNEL_LOCAL_MEM_SIZE failed.")) { return SDK_FAILURE; } if(usedLocalMemory > totalLocalMemory) { std::cout << "Unsupported: Insufficient local memory on device." << std::endl; return SDK_FAILURE; } status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) { return SDK_FAILURE; } status = clFinish(commandQueue); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clFinish failed.")) { return SDK_FAILURE; } /* Enqueue readBuffer*/ status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, numBlocks * sizeof(cl_uint4), tempOut, 0, NULL, &events[0]); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) { return SDK_FAILURE; } /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clWaitForEvents failed.")) { return SDK_FAILURE; } clReleaseEvent(events[0]); /* Clear the output value */ output = 0; /* Add individual sum of blocks */ for(int i = 0; i < numBlocks * VECTOR_SIZE; ++i) { output += tempOut[i]; } } sampleCommon->stopTimer(timer); /* Compute total time */ kernelTime = (double)(sampleCommon->readTimer(timer)) / iterations; status = clReleaseMemObject(outputBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clReleaseMemObject failed.")) { return SDK_FAILURE; } if(tempOut) { #ifdef 
_WIN32 _aligned_free(tempOut); #else free(tempOut); #endif tempOut = NULL; } return SDK_SUCCESS; }