
    code working in CPU and not in GPU

    pavandsp

      Hi,

      When I execute this simple multiplier code on the CPU the output is correct, but when it is executed on the GPU (with the appropriate changes to the context and command-queue APIs) the output is wrong.

      Kernel: multiply an 8x8 matrix by 2, i.e. A*2.

      I actually have another algorithm in this function that is not working, so I commented it out and am trying this simple multiply first, to get a basic kernel working on the GPU.

      lines = Len = 8; globalThreads[0] = 8; globalThreads[1] = 8;

      status = clEnqueueNDRangeKernel(commandQueue,
                                      kernel,
                                      2,
                                      NULL,
                                      globalThreads,
                                      NULL, // localThreads
                                      0,
                                      NULL,
                                      &events[0]);


      __kernel void myKernel(const float x,
                             const float y,
                             const int lines,
                             const int Len, // width
                             __global unsigned char *output,
                             __global unsigned char *input)
      {
          uint tx = get_global_id(0);
          uint ty = get_global_id(1);
          output[(ty * Len) + tx] = input[(ty * Len) + tx] * 2;
      }

      Details: ATI RV710 GPU, AMD CPU.

      Input:
      0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 .

      Output on the CPU (the output buffer is 3 times the size of the input). Correct:
      0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

      Output on the GPU (the output buffer is 3 times the size of the input). Wrong:
      6 0 0 0 14 0 0 0 22 0 0 0 30 0 0 0 38 0 0 0 46 0 0 0 54 0 0 0 62 0 0 0 70 0 0 0 78 0 0 0 86 0 0 0 94 0 0 0 102 0 0 0 110 0 0 0 118 0 0 0 126 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.

      I am not sure what is happening. I think the max work-item sizes and the max work-group size are all within the limits, since my size is only 8x8.

      Also, I am not clear on:

      1. Global work-items and their relation to parallelism.

      2. Work-items: how many pixel elements will be processed in one work-item, and where do I get this information?


      Thanks in Advance

      Pavan

        • code working in CPU and not in GPU
          omkaranathan

          pavandsp,

          I am not able to reproduce your issue. Could you post the whole code, both host and kernel?

            • code working in CPU and not in GPU
              gaurav.garg

              Can you try it with an explicit work-group size? Try 8x8, 8x4 and 4x4 and see if it works with any of them.
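
              For reference, here is a minimal sketch of the enqueue call with an explicit work-group size, reusing the variable names from the original post (8x8 is only one of the suggested sizes; the global size must be divisible by the local size in each dimension):

              size_t globalThreads[2] = {8, 8};
              size_t localThreads[2]  = {8, 8}; // also try {8, 4} or {4, 4}

              status = clEnqueueNDRangeKernel(commandQueue,
                                              kernel,
                                              2,
                                              NULL,
                                              globalThreads,
                                              localThreads, // explicit local size instead of NULL
                                              0,
                                              NULL,
                                              &events[0]);
              if(status != CL_SUCCESS) {
                  std::cout << "Error: clEnqueueNDRangeKernel failed\n";
              }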

              • code working in CPU and not in GPU
                pavandsp

                Hi

                I have attached the kernel and host code. Please let me know what modifications I have to make so that it runs on the GPU. I have reused the Template example.

                For the time being I am not passing localThreads to clEnqueueNDRangeKernel; I am letting OpenCL decide the work-group size.

---------------------------------------- Template_Kernels.cl ----------------------------------------

/*!
 * Sample kernel which multiplies every element of the input array with
 * a constant and stores it at the corresponding output array
 */
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

unsigned int prev_line_num = 1;
unsigned int cnt = 1;

__kernel void templateKernel(const float x,
                             const float y,
                             const int lines,
                             const int Len,
                             __global unsigned char *output,
                             __global unsigned char *input)
{
    uint tx = get_global_id(0);
    uint ty = get_global_id(1);
    uint value = 0;
    output[(ty * Len) + tx] = input[(ty * Len) + tx] * 2;
}

---------------------------------------- Template.cpp ----------------------------------------

#include "Template.hpp"

/*
 * \brief Host Initialization
 *        Allocate and initialize memory on the host. Print input array.
 */
int initializeHost(void)
{
    input  = NULL;
    output = NULL;
    x      = 1.0f;
    y      = 1.0f;
    lines  = 8;
    Len    = 8;
    width  = lines * Len;

    /* Allocate and initialize memory used by host */
    cl_uint sizeInBytes = width * sizeof(cl_uchar);
    input = (cl_uchar *)malloc(sizeInBytes);
    if(input == NULL) {
        std::cout << "Error: Failed to allocate input memory on host\n";
        return 1;
    }

    output = (cl_uchar *)malloc(sizeInBytes * 3);
    if(output == NULL) {
        std::cout << "Error: Failed to allocate output memory on host\n";
        return 1;
    }

    for(cl_uint i = 0; i < width; i++) {
        input[i] = (cl_uint)i;
        printf("%d ", input[i]);
    }
    return 0;
}

/*
 * Converts the contents of a file into a string
 */
std::string convertToString(const char *filename)
{
    size_t size;
    char *str;
    std::string s;
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));

    if(f.is_open()) {
        size_t fileSize;
        f.seekg(0, std::fstream::end);
        size = fileSize = f.tellg();
        f.seekg(0, std::fstream::beg);

        str = new char[size + 1];
        if(!str) {
            f.close();
            return NULL;
        }

        f.read(str, fileSize);
        f.close();
        str[size] = '\0';

        s = str;
        return s;
    }
    return NULL;
}

int initializeCL(void)
{
    cl_int status = 0;
    size_t deviceListSize;
    cl_device_type device_type = NULL;
    cl_uint num_devices;

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(status != CL_SUCCESS) {
        printf("Error: Getting Platforms. (clGetPlatformIDs)\n");
        return 1;
    }

    if(numPlatforms > 0) {
        cl_platform_id *platforms =
            (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(status != CL_SUCCESS) {
            printf("Error: Getting Platform Ids. (clGetPlatformIDs)\n");
            return 1;
        }
    }

    /*
     * If we could find our platform, use it. Otherwise pass a NULL and get whatever the
     * implementation thinks we should be using.
     */
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0
    };
    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;

    /* Create an OpenCL context */
    context = clCreateContextFromType(cprops,
                                      // CL_DEVICE_TYPE_CPU,
                                      CL_DEVICE_TYPE_GPU,
                                      NULL,
                                      NULL,
                                      &status);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Creating Context. (clCreateContextFromType)\n";
        return 1;
    }

    /* First, get the size of the device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Getting Context Info (device list size, clGetContextInfo)\n";
        return 1;
    }

    /* Detect OpenCL devices */
    devices = (cl_device_id *)malloc(deviceListSize);
    if(devices == 0) {
        std::cout << "Error: No devices found.\n";
        return 1;
    }

    /* Now, get the device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Getting Context Info (device list, clGetContextInfo)\n";
        return 1;
    }

    /* Create an OpenCL command queue */
    commandQueue = clCreateCommandQueue(context,
                                        // devices[0], // CPU
                                        devices[1],
                                        CL_QUEUE_PROFILING_ENABLE,
                                        &status);
    if(status != CL_SUCCESS) {
        std::cout << "Creating Command Queue. (clCreateCommandQueue)\n";
        return 1;
    }

    /* Create OpenCL memory buffers */
    inputBuffer = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                 sizeof(cl_uchar) * width,
                                 input,
                                 &status);
    if(status != CL_SUCCESS) {
        std::cout << "Error: clCreateBuffer (inputBuffer)\n";
        return 1;
    }

    outputBuffer = clCreateBuffer(context,
                                  CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                                  sizeof(cl_uchar) * width * 3,
                                  output,
                                  &status);
    if(status != CL_SUCCESS) {
        std::cout << "Error: clCreateBuffer (outputBuffer)\n";
        return 1;
    }

    /* Load CL file, build CL program object, create CL kernel object */
    const char *filename = "Template_Kernels.cl";
    std::string sourceStr = convertToString(filename);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = { strlen(source) };

    program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);

    /* create a CL program executable for all the devices specified */
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);

    size_t len;
    char buffer[2048];
    cl_build_status buffer1;

    kernel = clCreateKernel(program, "templateKernel", &status);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Creating Kernel from program. (clCreateKernel)\n";
        return 1;
    }
    return 0;
}

/*
 * \brief Run OpenCL program
 *
 *        Bind host variables to kernel arguments
 *        Run the CL kernel
 */
int runCLKernels(void)
{
    cl_int status;
    cl_uint maxDims;
    cl_event events[2];
    size_t globalThreads[2];
    size_t localThreads[2];
    size_t maxWorkGroupSize;
    size_t maxWorkItemSizes[3];
    size_t length;
    size_t kernelWorkGroupSize;
    cl_device_type device_type;
    cl_ulong startTime, endTime;
    char devicebuff[100];

    /*
     * Query device capabilities: maximum work-item dimensions
     * and the maximum work-item sizes
     */
    clGetDeviceInfo(devices[1], CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(size_t), (void *)&maxWorkGroupSize, NULL);
    clGetDeviceInfo(devices[1], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
                    sizeof(cl_uint), (void *)&maxDims, NULL);
    clGetDeviceInfo(devices[1], CL_DEVICE_MAX_WORK_ITEM_SIZES,
                    sizeof(size_t) * maxDims, (void *)maxWorkItemSizes, NULL);
    clGetDeviceInfo(devices[1], CL_DEVICE_TYPE,
                    sizeof(cl_device_type), (void *)&device_type, NULL);
    clGetDeviceInfo(devices[1], CL_DEVICE_NAME,
                    sizeof(devicebuff), (void *)devicebuff, NULL);

    globalThreads[0] = 8;
    globalThreads[1] = 8;
    localThreads[0]  = 32;
    localThreads[1]  = 32;

    if(globalThreads[0] > maxWorkItemSizes[0] ||
       localThreads[0] > maxWorkGroupSize) {
        std::cout << "Unsupported: Device does not support requested number of work items.";
        return 1;
    }

    /*** Set appropriate arguments to the kernel ***/
    /* x */
    status = clSetKernelArg(kernel, 0, sizeof(cl_float), (void *)&x);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (x)\n";
        return 1;
    }

    /* y */
    status = clSetKernelArg(kernel, 1, sizeof(cl_float), (void *)&y);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (y)\n";
        return 1;
    }

    /* lines */
    status = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&lines);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (lines)\n";
        return 1;
    }

    /* Len */
    status = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&Len);

    /* the output array to the kernel */
    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&outputBuffer);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (output)\n";
        return 1;
    }

    /* the input array to the kernel */
    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&inputBuffer);
    if(status != CL_SUCCESS) {
        std::cout << "Error: Setting kernel argument. (input)\n";
        return 1;
    }

    /* Enqueue a kernel run call. */
    status = clEnqueueNDRangeKernel(commandQueue,
                                    kernel,
                                    2,
                                    NULL,
                                    globalThreads,
                                    // localThreads,
                                    NULL,
                                    0,
                                    NULL,
                                    &events[0]);

    /* wait for the kernel call to finish execution */
    status = clWaitForEvents(1, &events[0]);
    clReleaseEvent(events[0]);

    /* Enqueue readBuffer */
    status = clEnqueueReadBuffer(commandQueue,
                                 outputBuffer,
                                 CL_TRUE,
                                 0,
                                 width * 3 * sizeof(cl_uchar),
                                 output,
                                 0,
                                 NULL,
                                 &events[1]);

    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[1]);
    clReleaseEvent(events[1]);

    return 0;
}

/*
 * \brief Release OpenCL resources (Context, Memory etc.)
 */
int cleanupCL(void)
{
    cl_int status;

    status = clReleaseKernel(kernel);
    if(status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseKernel\n";
        return 1;
    }
    status = clReleaseProgram(program);
    if(status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseProgram\n";
        return 1;
    }
    status = clReleaseMemObject(inputBuffer);
    if(status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseMemObject (inputBuffer)\n";
        return 1;
    }
    status = clReleaseMemObject(outputBuffer);
    if(status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseMemObject (outputBuffer)\n";
        return 1;
    }
    status = clReleaseCommandQueue(commandQueue);
    if(status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseCommandQueue\n";
        return 1;
    }
    status = clReleaseContext(context);
    if(status != CL_SUCCESS) {
        std::cout << "Error: In clReleaseContext\n";
        return 1;
    }
    return 0;
}

/*
 * \brief Releases program's resources
 */
void cleanupHost(void)
{
    if(input != NULL) {
        free(input);
        input = NULL;
    }
    if(output != NULL) {
        free(output);
        output = NULL;
    }
    if(devices != NULL) {
        free(devices);
        devices = NULL;
    }
}

int main(int argc, char *argv[])
{
    // Initialize Host application
    if(initializeHost() == 1)
        return 1;

    // Initialize OpenCL resources
    if(initializeCL() == 1)
        return 1;

    // Run the CL program
    if(runCLKernels() == 1)
        return 1;

    // Print output array
    for(cl_uint i = 0; i < (width * 3); i++) {
        printf("%d ", output[i]);
    }

    // Releases OpenCL resources
    if(cleanupCL() == 1)
        return 1;

    // Release host resources
    cleanupHost();

    return 0;
}

              • code working in CPU and not in GPU
                MicahVillmow

                The problem here is that you are using an extension the device does not support. Make sure you query the OpenCL runtime to check whether the device you are running on supports the extensions specified in your kernel. Not doing so results in undefined behavior.
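
                For illustration, a minimal sketch of such a check, assuming the devices[] array from the posted host code: query CL_DEVICE_EXTENSIONS and look for the cl_khr_byte_addressable_store extension that the kernel's #pragma tries to enable.

                size_t extSize = 0;
                clGetDeviceInfo(devices[1], CL_DEVICE_EXTENSIONS, 0, NULL, &extSize);

                char *extensions = (char *)malloc(extSize);
                clGetDeviceInfo(devices[1], CL_DEVICE_EXTENSIONS, extSize, extensions, NULL);

                if(strstr(extensions, "cl_khr_byte_addressable_store") == NULL) {
                    std::cout << "Device does not support cl_khr_byte_addressable_store\n";
                    /* fall back to a kernel that does not enable the extension,
                       e.g. one that reads and writes 32-bit types instead of uchar */
                }
                free(extensions);

                On a device without this extension, byte-granularity stores to a __global unsigned char buffer are exactly the kind of undefined behavior described above, which would be consistent with the GPU output pattern shown in the first post.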
                The problem here is you are using an extension that the device does not support. Make sure you query OpenCL runtime to check if your device that you are running on supports the extensions specified in your kernel. Not following this will result in undefined behavior.