laobrasuca

testing built-in vector types

Discussion created by laobrasuca on Sep 9, 2011
Latest reply on Sep 13, 2011 by notzed
wrong results with int4

Hi all,

I was making some tests with vector types, like int4, and I found some weird results. I created a test case where I have an array of int4 (cl_int4 *array_out) of size array_size/4 and I want to fill it with values from 0 to array_size, like: the first int4 of the array gets x, y, z and w values from 0 to 3, the second one get values from 4 to 7, the third from 8 to 11 and so on until the last element (the (array_size/4)th element) getting values from (array_size/4) - 4 to (array_size/4) - 1. It's like I had an array of int with size array_size and want to fill it with values from 0 to array_size-1, but I want to do it with int4.

In the coded attached I have the char kernel_cl[] variable which holds the kernel which should do it, but results are not consistent if I use GPU. A few values are correct stored while the most of it is not set.

for example, if I set array_size to 16 and initialize the array_out with -1 everywhere, I have:

array_out before running kernel =
-1 -1 -1 -1; -1 -1 -1 -1; -1 -1 -1  -1; -1 -1 -1 -1
array out after running kernel =
-1 -1  2 -1; -1 -1 -1   7; -1 -1 -1 11; -1 -1 14 -1

while I should have

array out after running kernel =
0 1 2 3; 4 5 6 7; 8 9 10 11; 12 13 14 15

What is weird is that if I run the kernel with some printf (see char kernel_printf_cl[]), the results are just OK! Actually, I used printf to try to debug, and I was surprised to see that array_out was correctly set after this! I than created a 3rd kernel (char kernel_choice_printf_cl[]), where I can chose to use printf or not (set PRINT_INFO to 1 or 0), with results OK if I use and not OK if I dont use printf. Unfortunately, for value of array_size >= 40, results are not correct even if I use printf in the kernel.

I can't see what's wrong, specially because when I use CPU (Phenom II X4 965), everything runs just fine (with or without printf, for any value of array_size). I'd be glad if you could check it. Is there any known problem with vector types and GPU?

AMD APP SDK 2.5, GPU driver 11.8, Win7 64bits, Radeon HD 5870.

#ifdef __APPLE__ #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif #include <stdio.h> #include <stdlib.h> #include <string.h> #ifndef max #define max(a,b) (((a) > (b)) ? (a) : (b)) #endif #ifndef min #define min(a,b) (((a) < (b)) ? (a) : (b)) #endif cl_platform_id platform; cl_context context; cl_device_id device; cl_command_queue commandQueue; cl_program program; cl_kernel kernel; char dev_type[] = "GPU"; cl_uint maxComputeUnits; size_t maxWorkItemSizes[3]; cl_int4* array_out; cl_mem array_out_buffer; cl_uint array_size = 4*4; //for simplicity, make it multiple of 4 bool doProfiling = false; cl_int InitializeCL(void); cl_int SetHostData(void); cl_int SetDeviceData(void); cl_int RunKernel(void); cl_int ReadDataBack(void); cl_int ClearDeviceData(void); cl_int ClearHostData(void); cl_int SetCLPlataform(void); cl_int SetCLContext(void); cl_int SetCLDevices(void); cl_int SetCLCommandQueue(void); cl_int SetCLProgram(void); cl_int ProgramBuild(const char* source, cl_program &program, const char* program_name, cl_device_id device); cl_int SetCLKernel(void); cl_int main(int argc, char * argv[]) { cl_int status; if ((status = InitializeCL()) != 0) { printf("InitializeCL problem (error! %d)\n", status); return 1; } if ((status = SetHostData()) != 0) { printf("SetHostData problem (error! %d)\n", status); return 2; } if ((status = SetDeviceData()) != 0) { printf("SetDeviceData problem (error! %d)\n", status); return 3; } if ((status = RunKernel()) != 0) { printf("SetDeviceData problem (error! %d)\n", status); return 4; } if ((status = ReadDataBack()) != 0) { printf("ReadDataBack problem (error! %d)\n", status); return 5; } if ((status = ClearDeviceData()) != 0) { printf("ClearDeviceData problem (error! %d)\n", status); return 2; } if ((status = ClearHostData()) != 0) { printf("ClearHostData problem (error! %d)\n", status); return 3; } return 0; } //Sequence d'initialisation d'OpenCL cl_int InitializeCL() { cl_int status = 0; ///////////////////////////////////////////////////////////////// // Have a look at the available platforms. ///////////////////////////////////////////////////////////////// if (SetCLPlataform() != 0) return 1; ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// if (SetCLContext() != 0) return 2; ///////////////////////////////////////////////////////////////// // Detect OpenCL devices ///////////////////////////////////////////////////////////////// if (SetCLDevices() != 0) return 3; ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// if (SetCLCommandQueue() != 0) return 4; ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object ///////////////////////////////////////////////////////////////// if ((status = SetCLProgram()) != 0) return status; ///////////////////////////////////////////////////////////////// // Create CL kernel object, get devices info for NDRange kernel ///////////////////////////////////////////////////////////////// if (SetCLKernel() != 0) return 6; return 0; } cl_int SetHostData(void) { //Allocate memory array_out = (cl_int4*) malloc(sizeof(cl_int4) * (array_size>>2)); if (array_out == NULL) return 2; //Initialize values for (cl_uint i = 0; i < array_size; ++i) array_out[i>>2].s[i%4] = -1; //print for debugging cl_int4 *ptr_array_out = array_out; printf("Array_out before running kernel = \n"); for (int i = (array_size>>2); i-- ; ++ptr_array_out) printf("%d %d %d %d ", (*ptr_array_out).s[0], (*ptr_array_out).s[1], (*ptr_array_out).s[2], (*ptr_array_out).s[3]); printf("\n"); return 0; } cl_int SetDeviceData(void) { cl_int status; cl_event events; //1: create buffer array_out_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int4) * (array_size>>2), NULL, &status); if(status != CL_SUCCESS) return 1; //2: send array_out to device //a.enqueue sending status = clEnqueueWriteBuffer(commandQueue, array_out_buffer, CL_TRUE, 0, sizeof(cl_int4) * (array_size>>2), array_out,0, NULL, &events); if(status != CL_SUCCESS) return 2; //b.check elapsed time if (doProfiling) { // wait for the EnqueueWrite call to finish execution if ((status = clWaitForEvents(1, &events)) != 0) return status; cl_ulong startTime, endTime; clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); printf("Host -> device data transfer elapsed time: %d ns\n", endTime - startTime); if ((status = clReleaseEvent(events)) != 0) return 3; } //3: set kernel arguments //a.array_out_buffer clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&array_out_buffer); if(status != CL_SUCCESS) return 4; //c.array_size clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&array_size); if(status != CL_SUCCESS) return 5; return 0; } cl_int RunKernel(void) { cl_int status; cl_event events; //Compute local and global thread size for kernel size_t local_size = max(min(array_size/maxComputeUnits, min((cl_uint)maxWorkItemSizes[0], (cl_uint)256)), (cl_uint)1); size_t mod_t = array_size % local_size; size_t global_size = array_size + ((mod_t) ? (local_size - mod_t) : 0); status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &events); if (status != CL_SUCCESS) { printf("Error in clEnqueueNDRangeKernel (CL error: %d)\n", status); return 1; } if (doProfiling) { // wait for the kernel call to finish execution if ((status = clWaitForEvents(1, &events)) != 0) return status; cl_ulong startTime, endTime; clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); printf("Kernel run in %d ns\n", endTime - startTime); if ((status = clReleaseEvent(events)) != 0) return 3; } return 0; } cl_int ReadDataBack(void) { cl_int status; cl_event events; status = clEnqueueReadBuffer(commandQueue, array_out_buffer, CL_TRUE, 0, sizeof(cl_int4) * (array_size>>2), array_out, 0, 0, &events); if (status != CL_SUCCESS) { printf("Error in clEnqueueReadBuffer (CL error: %d)\n", status); return 1; } if (doProfiling) { if ((status = clWaitForEvents(1, &events)) != 0) return status; cl_ulong startTime, endTime; clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(events, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); printf("Device -> host data transfer elapsed time: %d ns\n", endTime - startTime); if ((status = clReleaseEvent(events)) != 0) return 3; } cl_int4 *ptr_array_out = array_out; printf("Array out after running kernel = \n"); for (int i = (array_size>>2); i-- ; ++ptr_array_out) printf("%d %d %d %d ", (*ptr_array_out).s[0], (*ptr_array_out).s[1], (*ptr_array_out).s[2], (*ptr_array_out).s[3]); printf("\n"); return 0; } cl_int ClearDeviceData(void) { cl_int status; status = clFinish(commandQueue);//Ensure that all commands in queue will be executed after this call if (status != CL_SUCCESS) return status; // Release buffer if (array_out_buffer) status = clReleaseMemObject(array_out_buffer); if (status != CL_SUCCESS) return status; // Release kernel if (kernel) status = clReleaseKernel(kernel); if (status != CL_SUCCESS) return status; // Release program if (program) status = clReleaseProgram(program); if (status != CL_SUCCESS) return status; // Release command queue if (commandQueue) status = clReleaseCommandQueue(commandQueue); if (status != CL_SUCCESS) return status; // Release context if (context) status = clReleaseContext(context); if (status != CL_SUCCESS) return status; return 0; } cl_int ClearHostData(void) { free(array_out);array_out= NULL; return 0; } cl_int SetCLPlataform(void) { cl_int status; cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { printf("OpenCL error querying number of platform. (CL error: %d).\n", status); return 1; } if(numPlatforms > 0) { cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * numPlatforms); if (platforms == NULL) return 2; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { printf("OpenCL error querying Platform Ids. (CL error: %d).\n", status); return 3; } for(cl_uint i = 0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(status != CL_SUCCESS) { printf("OpenCL error querying CL_PLATFORM_VENDOR info. (CL error: %d).\n", status); return 4; } platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } else { printf("No OpenCL platform found.\n"); return 5; } return 0; } // Create openCL context cl_int SetCLContext(void) { cl_int status; cl_context_properties props[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; context = clCreateContextFromType(props, strncmp(dev_type, "GPU", 3) ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if(status != CL_SUCCESS) { printf("Error creating OpenCL context. (CL error: %d).\n", status); return 1; } return 0; } // Identify and set opencl GPU device, if any cl_int SetCLDevices(void) { cl_int status; cl_uint maxDims; size_t deviceListSize; ///////////////////////////////////////////////////////////////// // First, get the size of device list data ///////////////////////////////////////////////////////////////// status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { printf("OpenCL error querying device list size. (CL error: %d).\n", status); return 1; } if(deviceListSize == 0) { printf("No device found on the OpenCL context.\n"); return 2; } ///////////////////////////////////////////////////////////////// // Now, get the device list data ///////////////////////////////////////////////////////////////// cl_device_id* devices = (cl_device_id *)malloc(deviceListSize); if (devices == NULL) return 3; status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { printf("OpenCL error querying device list. (CL error: %d).\n", status); return 4; } //and get one GPU device = devices[0]; free(devices);devices = NULL; if (device == NULL) { printf("No OpenCL GPU device found.\n!\n"); return 5; } ///////////////////////////////////////////////////////////////// // Query GPU device capabilities. ///////////////////////////////////////////////////////////////// // The number of parallel compute cores on the OpenCL device status = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void*)&maxComputeUnits, NULL); if(status != CL_SUCCESS) { printf("OpenCL error querying CL_DEVICE_MAX_COMPUTE_UNITS info. (CL error: %d).\n", status); return 8; } // Maximum dimensions that specify the global and local work-item IDs used by the data parallel execution model status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDims, NULL); if(status != CL_SUCCESS) { printf("OpenCL error querying CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS info. (CL error: %d).\n", status); return 9; } // Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims, (void*)maxWorkItemSizes, NULL); if(status != CL_SUCCESS) { printf("OpenCL error querying CL_DEVICE_MAX_WORK_ITEM_SIZES info. (CL error: %d).\n", status); return 10; } return 0; } // Create the command-queue data structure to coordinate execution of the kernels on the device cl_int SetCLCommandQueue(void) { cl_int status; //Create command-queue and enable commands profiling if requested commandQueue = clCreateCommandQueue(context, device, (doProfiling) ? CL_QUEUE_PROFILING_ENABLE : NULL, &status); if(status != CL_SUCCESS) { printf("OpenCL error creating command queue for GPU device. (CL error: %d).\n", status); return 1; } return 0; } // Create and build the program cl_int SetCLProgram(void) { cl_int status; char kernel_printf_cl[] = "#pragma OPENCL EXTENSION cl_amd_printf : enable \n\ \r__kernel void test_int4(__global int4* array_out, \n\ \r const uint array_size) \n\ \r{ \n\ \r int gid = get_global_id(0); \n\ \r if (gid < array_size) \n\ \r { \n\ \r int gid_m = gid%4; \n\ \r int gid_d = gid>>2; \n\ \r printf(\"gid = %d: grid_d = %d, grid_m = %d \", gid, gid_d, gid_m); \n\ \r if (gid_m == 0) \n\ \r { \n\ \r printf(\" -> x \\n\"); \n\ \r array_out[gid_d].x = gid; \n\ \r } \n\ \r if (gid_m == 1) \n\ \r { \n\ \r printf(\" -> y \\n\"); \n\ \r array_out[gid_d].y = gid; \n\ \r } \n\ \r if (gid_m == 2) \n\ \r { \n\ \r printf(\" -> z \\n\"); \n\ \r array_out[gid_d].z = gid; \n\ \r } \n\ \r if (gid_m == 3) \n\ \r { \n\ \r printf(\" -> w \\n\"); \n\ \r array_out[gid_d].w = gid; \n\ \r } \n\ \r } \n\ \r}\0"; char kernel_choice_printf_cl[] = "#define PRINT_INFO 0 \n\ \r#if (defined(cl_amd_printf) && (PRINT_INFO == 1)) \n\ \r#pragma OPENCL EXTENSION cl_amd_printf : enable \n\ \r#endif \n\ \r__kernel void test_int4(__global int4* array_out, \n\ \r const uint array_size) \n\ \r{ \n\ \r int gid = get_global_id(0); \n\ \r if (gid < array_size) \n\ \r { \n\ \r int gid_m = gid%4; \n\ \r int gid_d = gid>>2; \n\ \r#if (defined(cl_amd_printf) && (PRINT_INFO == 1)) \n\ \r printf(\"gid = %d: grid_d = %d, grid_m = %d \", gid, gid_d, gid_m); \n\ \r#endif \n\ \r if (gid_m == 0) \n\ \r { \n\ \r#if (defined(cl_amd_printf) && (PRINT_INFO == 1)) \n\ \r printf(\" -> x \\n\"); \n\ \r#endif \n\ \r array_out[gid_d].x = gid; \n\ \r } \n\ \r if (gid_m == 1) \n\ \r { \n\ \r#if (defined(cl_amd_printf) && (PRINT_INFO == 1)) \n\ \r printf(\" -> y \\n\"); \n\ \r#endif \n\ \r array_out[gid_d].y = gid; \n\ \r } \n\ \r if (gid_m == 2) \n\ \r { \n\ \r#if (defined(cl_amd_printf) && (PRINT_INFO == 1)) \n\ \r printf(\" -> z \\n\"); \n\ \r#endif \n\ \r array_out[gid_d].z = gid; \n\ \r } \n\ \r if (gid_m == 3) \n\ \r { \n\ \r#if (defined(cl_amd_printf) && (PRINT_INFO == 1)) \n\ \r printf(\" -> w \\n\"); \n\ \r#endif \n\ \r array_out[gid_d].w = gid; \n\ \r } \n\ \r } \n\ \r}\0"; char kernel_cl[] = "__kernel void test_int4(__global int4* array_out, \n\ \r const uint array_size) \n\ \r{ \n\ \r int gid = get_global_id(0); \n\ \r if (gid < array_size) \n\ \r { \n\ \r int gid_m = gid%4; \n\ \r int gid_d = gid>>2; \n\ \r if (gid_m == 0) array_out[gid_d].x = gid; \n\ \r if (gid_m == 1) array_out[gid_d].y = gid; \n\ \r if (gid_m == 2) array_out[gid_d].z = gid; \n\ \r if (gid_m == 3) array_out[gid_d].w = gid; \n\ \r } \n\ \r}\0"; if ((status = ProgramBuild(kernel_cl, program, "test_int4", device)) != 0) return 1; return 0; } // Create/build program cl_int ProgramBuild(const char* source, cl_program &program, const char* program_name, cl_device_id device) { cl_int status; program = clCreateProgramWithSource(context, 1, &source, NULL, &status); if(status != CL_SUCCESS) { printf("OpenCL error creating %s program. (CL error: %d).\n", program_name, status); return 1; } status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if(status != CL_SUCCESS) { size_t LogSize; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, NULL, NULL, &LogSize); char* Build_Log = (char*) malloc(LogSize); if (Build_Log == NULL) return 2; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, LogSize, (void*)Build_Log, NULL); printf("OpenCL error building %s program. (CL error: %d).\n", program_name, status); printf("Here's the log:\n%s\n", Build_Log); free(Build_Log); return 3; } return 0; } // Create kernel cl_int SetCLKernel(void) { cl_int status; kernel = clCreateKernel(program, "test_int4", &status); if(status != CL_SUCCESS) { printf("OpenCL error creating %s kernel. (CL error: %d).\n", "test_int4", status); return 1; } return 0; }

Outcomes