schrotti007

Strange Error with Local Memory on RV770

Discussion created by schrotti007 on Nov 11, 2010
Latest reply on Feb 16, 2011 by MicahVillmow
When I allocate more local Memory than actually needed, my kernel stops working

Hello

I Discovered (maybe) a quite strange bug. I have a simple Kernel that does nothing more than copying the input data into a local buffer and copying from that buffer into the output buffer.

On the host, I must allocate at least 4*localworksize, because i copy 4 bytes at once. But if I allocate more, lets say 16*localworksize, the kernel does not copy the data correctly. But I can find out only by comparing the input data with the output data, the returnvalue of clEnqueueNDRangeKernel is always CL_SUCCESS.

Is it just a bug in the local Memory of the RV770 (I use a 4850), or a does the local memory just not work this way? (I didnt even access the local memory from other threads, so of course the use of local memory is pretty useless here, its just a simple example to spot the problem)

Almost forgot:

Using Stream Sdk 2.2, Catalyst 10.10, Radeon 4850 with 512Mb and Windows 7 64 bit.

//HelloCL_Kernels.cl __kernel void blocktest(__global uchar4 * output , __global uchar4 * input , __local uchar4 * block0 ) { unsigned int blockIdx = get_group_id(0); unsigned int localIndex = get_local_id(0); unsigned int globalIndex = blockIdx * get_local_size(0) + localIndex; block0[localIndex] = input[globalIndex]; output[globalIndex] = block0[localIndex]; } //HelloCL.cpp #include <stdio.h> #include <CL/cl.h> #include <iostream> #define CHECKSTATUS if (status != CL_SUCCESS) {cout << "error\n";return 1;} using namespace std; int main() { cl_uint numPlatforms; cl_platform_id platform = NULL; cl_int status; size_t size = 1048576*64; cl_int localthreads = 64; status = clGetPlatformIDs(0, NULL, &numPlatforms); CHECKSTATUS cout << numPlatforms << " OpenCl Platforms found:\n"; cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); CHECKSTATUS if (numPlatforms!=1) { unsigned int pf; cout << "Select Platform: "; cin >> pf; if (pf<numPlatforms) platform = platforms[pf]; else {cout << "Falsche Antwort :(";return 1;} } else platform = platforms[0]; cl_device_id deviceId; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &deviceId, 0); CHECKSTATUS cl_context context; cl_command_queue commandQueue; context = clCreateContext(NULL, 1, &deviceId, NULL, NULL, &status); CHECKSTATUS commandQueue = clCreateCommandQueue(context, deviceId, NULL, &status); CHECKSTATUS FILE* f = fopen("d:\\oo.exe", "rb"); if (!f) { cout << "File not found\n"; return 1; } fseek(f, 0, SEEK_END); size_t filesize = ftell(f); fseek(f, 0, SEEK_SET); if (filesize < size) { cout << "File 2 small\n"; return 1; } char* indata = (char*)malloc(size); fread(indata, 1, size, f); fclose(f); char* outdata = (char*)malloc(size); f = fopen("HelloCL_Kernels.cl", "rb"); if (!f) { cout << "File not found\n"; return 1; } fseek(f, 0, SEEK_END); filesize = ftell(f); fseek(f, 0, SEEK_SET); char* kernelString = (char*)malloc(filesize+1); fread(kernelString, 1, filesize, f); fclose(f); kernelString[filesize]=0; cl_program program; cl_kernel kernel; program = clCreateProgramWithSource(context, 1,(const char**) &kernelString, &filesize, &status); CHECKSTATUS status = clBuildProgram(program, 1, &deviceId, NULL, NULL, NULL); if (status != CL_SUCCESS) { cout << "Cant compile Program\n";//, status is "<<status<< StrCLError(status) << "\n"; char* programLog = (char*) malloc(1024000); // evtl ein bischen übertrieben, aber 10000 is definitiv zuwenig! size_t muhh = 1337; status = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, 1024000, programLog, &muhh); if (status != CL_SUCCESS) cout << "Cant get extended infos\n"; cout << muhh << "\n"; programLog[muhh]=0; cout << programLog; return 1; } size_t globalThreads[1]= {size/4}; // every globalthread should copy 4 bytes size_t localThreads[1] = {4}; cl_event evt; kernel = clCreateKernel(program, "blocktest", &status); CHECKSTATUS cl_command_queue commandqueue = clCreateCommandQueue(context, deviceId, NULL, &status); CHECKSTATUS cl_mem indata_device = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); CHECKSTATUS status = clEnqueueWriteBuffer(commandqueue, indata_device, CL_TRUE, 0, size, indata, 0, NULL, NULL); CHECKSTATUS cl_mem outdata_device = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); CHECKSTATUS status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outdata_device); CHECKSTATUS status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&indata_device); CHECKSTATUS status = clSetKernelArg(kernel, 2, localthreads*64, NULL); CHECKSTATUS status = clEnqueueNDRangeKernel(commandqueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &evt); CHECKSTATUS status = clWaitForEvents(1, &evt); CHECKSTATUS status = clEnqueueReadBuffer(commandqueue, outdata_device, CL_TRUE, 0, size, outdata, 0, NULL, NULL); CHECKSTATUS cout << "Compare inbuffer to outbuffer\n"; for (unsigned int i=0;i<size;i++) { if (indata[i] != outdata[i]) { cout << "Data mismatch at " << i << "\n"; } } cout << "Done!\n"; cin >> status; }

Outcomes