notyou

Differing CPU and GPU Results

Discussion created by notyou on Aug 15, 2010
Latest reply on Aug 20, 2010 by notyou

Hello,

I've been working on the 0-1 knapsack problem recently and I've extended my code to work with input sizes > 256. When running the code on the CPU, everything works as it should, even for inputs > 1024 (max workgroup size), but when I move to the GPU, anything > 256 gives me an error when verifying the results. Does anyone see anything wrong with my code as for what could be causing this?

-Matt

gpu_knapsack.cl __kernel void knapsack(__global int *gpu_results, __global int *weight, __global int *profit, int NUM_ROWS, int NUM_COLS, __local int *gpu_subset, int WORKGROUP_SIZE) { int i = 0; int j = 0; int local_id = get_local_id(0); int num_groups = NUM_COLS / WORKGROUP_SIZE; int group_offset = 0; int weightval = 0; int profitval = 0; int gpu_subset1 = 0; int gpu_subset2 = 0; int boolcheck = 0; int val = 0; int offset = 0; /*start with a row of 0's, set to 0 so we don't need to read from global memory*/ for(i = 0; i < num_groups; i++) gpu_subset[(WORKGROUP_SIZE * i) + local_id] = 0; for(i = 1; i < NUM_ROWS; i++) { weightval = weight[i - 1]; profitval = profit[i - 1]; for(j = 0; j < num_groups; j++) { group_offset = WORKGROUP_SIZE * j; gpu_subset1 = gpu_subset[group_offset + local_id]; /*results will be undefined if local_id < weightval but that is fine since they will be filtered out by the boolcheck later*/ gpu_subset2 = gpu_subset[group_offset + local_id - weightval] + profitval; boolcheck = (group_offset + local_id >= weightval) && (gpu_subset2 > gpu_subset1); val = (boolcheck) ? gpu_subset2 : gpu_subset1; /*offset for where in global memory we'll write to and perform global write*/ offset = (i * NUM_COLS) + group_offset + local_id; gpu_results[offset] = val; /*force all threads to finish execution before writing our results back into the local array since a thread may be delayed and thus get an updated value instead of the "old" one*/ barrier(CLK_LOCAL_MEM_FENCE); /*now that all threads have finished execution update local memory with our new values for the next iteration*/ gpu_subset[group_offset + local_id] = val; } } } Main.cpp #include <CL/cl.hpp> #include <iostream> #include <iomanip> #include <omp.h> #include <fstream> #include <string> #include <sstream> using namespace std; //#define cpu //run on CPU, comment out to run on GPU, if running on GPU, change BLOCK_SIZE to 16 #define NUM_RUNS 1 #define NUM_ITEMS 512 //# rows, add 1 for empty row #define MAX_WEIGHT (NUM_ITEMS - 1) //# cols, add 1 for empty column #define NUM_COLS (MAX_WEIGHT + 1) #define NUM_ROWS (NUM_ITEMS + 1) #define NUM_ELEMENTS (NUM_COLS * NUM_ROWS) #define CPU_WORKGROUP_SIZE 1024 #define GPU_WORKGROUP_SIZE 256 int weight[NUM_ITEMS]; int profit[NUM_ITEMS]; bool errorCheck(cl_int, string); //returns true if there is an error (and prints appropriate message) void seq_knapsack(int[], int[]); void generateRandomFile(int, string); void loadFromFile(string, int[]); void verify(); int seq_results[NUM_ELEMENTS]; int gpu_results[NUM_ELEMENTS]; int main(int argc, char *argv[]) { cout<<setprecision(10); stringstream val; val << NUM_COLS; string s = val.str(); generateRandomFile(NUM_ITEMS, "weight" + s + ".txt"); generateRandomFile(NUM_ITEMS, "profit" + s + ".txt"); loadFromFile("weight" + s + ".txt", weight); loadFromFile("profit" + s + ".txt", profit); //initialization cl_int error; //check to see if we error our during most steps cl_platform_id platform; cl_uint numPlatforms; cl_uint num_devices_returned = 0; //get a list of all the platforms error = clGetPlatformIDs(0, NULL, &numPlatforms); if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; error = clGetPlatformIDs(numPlatforms, platforms, NULL); for (unsigned i = 0; i < numPlatforms; ++i) { //char pbuf[100]; //error = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); platform = platforms[i]; } delete[] platforms; } //get our GPU device cl_device_id device; #ifndef cpu error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); #endif #ifdef cpu error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); #endif //if(!errorCheck(error, "Getting device info")) // printDeviceInfo(device); //create a context with our devices cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); errorCheck(error, "Creating context"); //create a command queue cl_command_queue cmd_queue; cmd_queue = clCreateCommandQueue(context, device, 0, &error); errorCheck(error, "Creating command queue"); //create our program char *fileName = "F:\\University\\comp4520\\0-1 knapsack\\knapsack\\knapsack\\gpu_knapsack.cl"; const char *source = ""; string line; string sourceFile = ""; ifstream file (fileName); if(file.is_open()) { while(!file.eof()) { getline(file, line); sourceFile.append(line); } } source = sourceFile.c_str(); file.close(); cl_program program; program = clCreateProgramWithSource(context, 1, &source, NULL, &error); errorCheck(error, "Creating program with source"); cout<<"Building Program"<<endl; //build our program error = clBuildProgram(program, 1, &device, "", NULL, NULL); errorCheck(error, "Building program"); char logFile[16384]; error = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(logFile), &logFile, 0); errorCheck(error, "Getting Program Info"); cout<<"\n--------------------Build Log--------------------\n\n"<<logFile<<"\n--------------------End Build Log--------------------\n\n"<<endl; //create kernel objects for all kernel functions in the program object cl_kernel kernel; cl_uint numKernels; error = clCreateKernelsInProgram(program, 1, &kernel, &numKernels); errorCheck(error, "Creating kernel in program"); #ifdef cpu size_t local[1] = {CPU_WORKGROUP_SIZE}; size_t global[1] = {CPU_WORKGROUP_SIZE}; if(NUM_COLS < CPU_WORKGROUP_SIZE) { local[0] = NUM_COLS; global[0] = NUM_COLS; } #else size_t local[1] = {GPU_WORKGROUP_SIZE}; size_t global[1] = {GPU_WORKGROUP_SIZE}; if(NUM_COLS < GPU_WORKGROUP_SIZE) { local[0] = NUM_COLS; global[0] = NUM_COLS; } #endif //zero the first row and column for(int j = 0; j < NUM_COLS; j++) gpu_results[j] = 0; for(int i = 0; i < NUM_ROWS; i++) gpu_results[i * NUM_COLS] = 0; cl_mem gpu_results_buffer; cl_mem gpu_subset_buffer; cl_mem weight_buffer; cl_mem profit_buffer; cl_event cmd_event; seq_knapsack(weight, profit); int rows = NUM_ROWS; int cols = NUM_COLS; int workgroup_size; #ifdef cpu workgroup_size = CPU_WORKGROUP_SIZE; if(NUM_COLS < CPU_WORKGROUP_SIZE) workgroup_size = NUM_COLS; #else workgroup_size = GPU_WORKGROUP_SIZE; if(NUM_COLS < GPU_WORKGROUP_SIZE) workgroup_size = NUM_COLS; #endif //current working row of the problem int gpu_subset[NUM_COLS]; //these can be set here since they won't be changing during execution weight_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(weight), NULL, &error); errorCheck(error, "Creating weight buffer"); profit_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(profit), NULL, &error); errorCheck(error, "Creating profit buffer"); gpu_results_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(gpu_results), NULL, &error); errorCheck(error, "Creating parallel buffer"); gpu_subset_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(gpu_subset), NULL, &error); errorCheck(error, "Creating subset buffer"); error = clEnqueueWriteBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_results), &gpu_results, 0, NULL, &cmd_event); errorCheck(error, "Enqueue gpu_results_buffer"); error = clEnqueueWriteBuffer(cmd_queue, weight_buffer, CL_TRUE, 0, sizeof(weight), &weight, 0, NULL, &cmd_event); errorCheck(error, "Enqueue weight buffer"); error = clEnqueueWriteBuffer(cmd_queue, profit_buffer, CL_TRUE, 0, sizeof(profit), &profit, 0, NULL, &cmd_event); errorCheck(error, "Enqueue profit buffer"); error = clEnqueueWriteBuffer(cmd_queue, gpu_subset_buffer, CL_TRUE, 0, sizeof(gpu_subset), &gpu_subset, 0, NULL, &cmd_event); errorCheck(error, "Enqueue profit buffer"); error = clSetKernelArg(kernel, 0, sizeof(gpu_results_buffer), &gpu_results_buffer); errorCheck(error, "Setting kernel arg [0]"); error = clSetKernelArg(kernel, 1, sizeof(weight_buffer), &weight_buffer); errorCheck(error, "Setting kernel arg [1]"); error = clSetKernelArg(kernel, 2, sizeof(profit_buffer), &profit_buffer); errorCheck(error, "Setting kernel arg [2]"); error = clSetKernelArg(kernel, 3, sizeof(int), &rows); errorCheck(error, "Setting kernel arg [3]"); error = clSetKernelArg(kernel, 4, sizeof(int), &cols); errorCheck(error, "Setting kernel arg [4]"); error = clSetKernelArg(kernel, 5, sizeof(gpu_subset_buffer), NULL); errorCheck(error, "Setting kernel arg [5]"); error = clSetKernelArg(kernel, 6, sizeof(int), &workgroup_size); //enqueue our kernel to execute on the device error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global, local, 0, NULL, NULL); errorCheck(error, "Enqueuing ND Range Kernel"); //wait for execution to finish clFinish(cmd_queue); //read the results from the device cl_event resultsEvent; error = clEnqueueReadBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_results), &gpu_results, 0, NULL, &resultsEvent); errorCheck(error, "Reading Results Buffer"); clReleaseMemObject(gpu_subset_buffer); clReleaseMemObject(gpu_results_buffer); clReleaseMemObject(weight_buffer); clReleaseMemObject(profit_buffer); verify(); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); system("pause"); return 0; } void seq_knapsack(int weight[], int profit[]) { int i, j; //zero the first row and column for(j = 0; j < NUM_COLS; j++) seq_results[j] = 0; for(i = 0; i < NUM_ROWS; i++) seq_results[i * NUM_COLS] = 0; for(i = 1; i < NUM_ROWS; i++) for(j = 1; j < NUM_COLS; j++) { if(j >= weight[i - 1] && (seq_results[(i - 1) * NUM_COLS + j - weight[i - 1]] + profit[i - 1]) > seq_results[(i - 1) * NUM_COLS + j]) seq_results[(i * NUM_COLS) + j] = (seq_results[(i - 1) * NUM_COLS + j - weight[i - 1]] + profit[i - 1]); else seq_results[(i * NUM_COLS) + j] = (seq_results[(i - 1) * NUM_COLS + j]); } #ifdef verbose //print array j = 0; for(i = 0; i < NUM_ELEMENTS; i++) { cout<<seq_results[i]<<" "; if(j == MAX_WEIGHT) { cout<<endl; j = 0; } else j++; } cout<<endl; #endif } void loadFromFile(string loadFile, int myArray[NUM_ITEMS]) { ifstream myfile (loadFile); int num = 0; int i = 0; if (myfile.is_open()) { while (!myfile.eof()) { myfile >> num; myArray[i] = num; i++; if(i == NUM_ITEMS) break; } } myfile.close(); } void generateRandomFile(int num_vars, string outFile) { int num; ifstream inputFile; ofstream outputFile; inputFile.open(outFile); if(inputFile.is_open()) { inputFile.close(); return; } else outputFile.open(outFile); srand((unsigned int)(time(NULL))); for(int i = 0; i < num_vars; i++) { num = rand() % 26; //num from 0-25 outputFile<<num<<" "; } outputFile.close(); } void verify() { bool error = false; cout<<endl; for(int i = 0; i < NUM_ELEMENTS; i++) if(seq_results[i] != gpu_results[i]) { cout<<"Sequential[i]="<<seq_results[i]<<"\tGPU[i]="<<gpu_results[i]<<".\t Error at i="<<i<<". Stopping."<<endl; error = true; break; } if(!error) cout<<"No difference between calculations."<<endl<<endl<<endl; } bool errorCheck(cl_int error, string dataPoint) { bool errorOccurred = true; if(error == CL_BUILD_PROGRAM_FAILURE) cout<<"CL_BUILD_PROGRAM_FAILURE"<<endl; else if(error == CL_COMPILER_NOT_AVAILABLE) cout<<"CL_COMPILER_NOT_AVAILABLE"<<endl; else if(error == CL_DEVICE_NOT_AVAILABLE) cout<<"CL_DEVICE_NOT_AVAILABLE"<<endl; else if(error == CL_DEVICE_NOT_FOUND) cout<<"CL_DEVICE_NOT_FOUND"<<endl; else if(error == CL_INVALID_ARG_INDEX) cout<<"CL_INVALID_ARG_INDEX"<<endl; else if(error == CL_INVALID_ARG_SIZE) cout<<"CL_INVALID_ARG_SIZE"<<endl; else if(error == CL_INVALID_ARG_VALUE) cout<<"CL_INVALID_ARG_VALUE"<<endl; else if(error == CL_INVALID_BINARY) cout<<"CL_INVALID_BINARY"<<endl; else if(error == CL_INVALID_BUFFER_SIZE) cout<<"CL_INVALID_BUFFER_SIZE"<<endl; else if(error == CL_INVALID_BUILD_OPTIONS) cout<<"CL_INVALID_BUILD_OPTIONS"<<endl; else if(error == CL_INVALID_COMMAND_QUEUE) cout<<"CL_INVALID_COMMAND_QUEUE"<<endl; else if(error == CL_INVALID_CONTEXT) cout<<"CL_INVALID_CONTEXT"<<endl; else if(error == CL_INVALID_DEVICE) cout<<"CL_INVALID_DEVICE"<<endl; else if(error == CL_INVALID_DEVICE_TYPE) cout<<"CL_INVALID_DEVICE_TYPE"<<endl; else if(error == CL_INVALID_EVENT) cout<<"CL_INVALID_EVENT"<<endl; else if(error == CL_INVALID_EVENT_WAIT_LIST) cout<<"CL_INVALID_EVENT_WAIT_LIST"<<endl; else if(error == CL_INVALID_GLOBAL_OFFSET) cout<<"CL_INVALID_GLOBAL_OFFSET"<<endl; else if(error == CL_INVALID_HOST_PTR) cout<<"CL_INVALID_HOST_PTR"<<endl; else if(error == CL_INVALID_KERNEL) cout<<"CL_INVALID_KERNEL"<<endl; else if(error == CL_INVALID_KERNEL_ARGS) cout<<"CL_INVALID_KERNEL_ARGS"<<endl; else if(error == CL_INVALID_MEM_OBJECT) cout<<"CL_INVALID_MEM_OBJECT"<<endl; else if(error == CL_INVALID_OPERATION) cout<<"CL_INVALID_OPERATION"<<endl; else if(error == CL_INVALID_PLATFORM) cout<<"CL_INVALID_PLATFORM"<<endl; else if(error == CL_INVALID_PROGRAM) cout<<"CL_INVALID_PROGRAM"<<endl; else if(error == CL_INVALID_PROGRAM_EXECUTABLE) cout<<"CL_INVALID_PROGRAM_EXECUTABLE"<<endl; else if(error == CL_INVALID_QUEUE_PROPERTIES) cout<<"CL_INVALID_QUEUE_PROPERTIES"<<endl; else if(error == CL_INVALID_SAMPLER) cout<<"CL_INVALID_SAMPLER"<<endl; else if(error == CL_INVALID_VALUE) cout<<"CL_INVALID_VALUE"<<endl; else if(error == CL_INVALID_WORK_DIMENSION) cout<<"CL_INVALID_WORK_DIMENSION"<<endl; else if(error == CL_INVALID_WORK_GROUP_SIZE) cout<<"CL_INVALID_WORK_GROUP_SIZE"<<endl; else if(error == CL_MEM_COPY_HOST_PTR) cout<<"CL_MEM_COPY_HOST_PTR"<<endl; else if(error == CL_MEM_OBJECT_ALLOCATION_FAILURE) cout<<"CL_MEM_OBJECT_ALLOCATION_FAILURE"<<endl; else if(error == CL_MEM_USE_HOST_PTR) cout<<"CL_MEM_USE_HOST_PTR"<<endl; else if(error == CL_OUT_OF_HOST_MEMORY) cout<<"CL_OUT_OF_HOST_MEMORY"<<endl; else if(error == CL_OUT_OF_RESOURCES) cout<<"CL_OUT_OF_RESOURCES"<<endl; else if(error != 0) cout<<"Unknown error"<<endl; else if(error == 0) { //cout<<"No error at: "+dataPoint<<endl<<endl; return false; } cout<<"Error at: "+dataPoint<<endl<<endl; return errorOccurred; }

Outcomes