Hi everyone,
I've been working on the 0-1 Knapsack problem for the past few days and I've got it running correctly.
I'm now just looking for ways to optimize the code I have. I'm fine if you don't want to bother editing my code, but if you have any resources you can point me to that would help with this, it's much appreciated. Also, if I've left anything out or you need more explanation, please let me know.
PS. I am currently forced to work on the CPU so if there is anything you can recommend specifically for CPU optimization (or GPUs for later), it would be beneficial.
Thanks for looking.
-Matt
knapsack.cl

/*
 * One work-item per knapsack capacity (column).  Row i-1 of the DP table
 * lives in gpu_subset[0 .. NUM_COLS-1]; the kernel writes row i into
 * gpu_subset[NUM_COLS .. 2*NUM_COLS-1].
 *
 * Recurrence: either keep the previous row's profit, or take item i-1
 * when it fits (local_id >= weight[i-1]) and improves the total.
 */
__kernel void knapsack(__global int *gpu_subset, __global int *weight,
                       __global int *profit, int i, int NUM_COLS)
{
    int local_id = get_local_id(0);

    if (local_id >= weight[i - 1] &&
        (gpu_subset[local_id - weight[i - 1]] + profit[i - 1]) > gpu_subset[local_id])
        gpu_subset[NUM_COLS + local_id] = gpu_subset[local_id - weight[i - 1]] + profit[i - 1];
    else
        gpu_subset[NUM_COLS + local_id] = gpu_subset[local_id];
}

main.cpp

for (int x = 0; x < NUM_RUNS; x++) {
    int cols = NUM_COLS;
    cl_mem gpu_results_buffer;
    cl_mem weight_buffer;
    cl_mem profit_buffer;
    cl_event cmd_event;

    // Two rows (previous and current working row).  The first row starts
    // zeroed; after each kernel launch we read the computed second row back
    // and reuse it as the first row of the next iteration.
    int gpu_subset[NUM_COLS * 2];

    weight_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(weight), NULL, &error);
    profit_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(profit), NULL, &error);
    gpu_results_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(gpu_subset), NULL, &error);

    // weight/profit never change, so they are uploaded once per run,
    // outside the row loop.
    error = clEnqueueWriteBuffer(cmd_queue, weight_buffer, CL_TRUE, 0, sizeof(weight), &weight, 0, NULL, &cmd_event);
    error = clEnqueueWriteBuffer(cmd_queue, profit_buffer, CL_TRUE, 0, sizeof(profit), &profit, 0, NULL, &cmd_event);

    error = clSetKernelArg(kernel, 0, sizeof(gpu_results_buffer), &gpu_results_buffer);
    error = clSetKernelArg(kernel, 1, sizeof(weight_buffer), &weight_buffer);
    error = clSetKernelArg(kernel, 2, sizeof(profit_buffer), &profit_buffer);
    error = clSetKernelArg(kernel, 4, sizeof(int), &cols);

    // BUG FIX: the original assigned to the array name (gpu_subset = 0;),
    // which does not compile -- the elements must be indexed.
    for (int i = 0; i < NUM_COLS * 2; i++)
        gpu_subset[i] = 0;

    // Zero the first row and first column of the results matrix
    // (BUG FIX: same array-name assignment problem as above).
    for (int j = 0; j < NUM_COLS; j++)
        gpu_results[j] = 0;
    for (int i = 0; i < NUM_ROWS; i++)
        gpu_results[i * NUM_COLS] = 0;

    int curr_pos = NUM_COLS;
    for (int i = 1; i < NUM_ROWS; i++) {
        error = clEnqueueWriteBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_subset), &gpu_subset, 0, NULL, &cmd_event);
        error = clSetKernelArg(kernel, 3, sizeof(int), &i);

        // enqueue our kernel to execute on the device
        error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global, local, 0, NULL, NULL);

        // wait for execution to finish (the blocking read below would also
        // serialize against the kernel, but keep this explicit for clarity)
        clFinish(cmd_queue);

        // read the results from the device since we need them for the next iteration
        cl_event resultsEvent;
        error = clEnqueueReadBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_subset), &gpu_subset, 0, NULL, &resultsEvent);

        // copy results into the final buffer
        #pragma omp parallel for
        for (int j = 0; j < NUM_COLS; j++) {
            gpu_results[curr_pos + j] = gpu_subset[NUM_COLS + j];
            // BUG FIX: copy the 2nd row into the 1st element-wise (the
            // original assigned to the array name) for the next cycle.
            gpu_subset[j] = gpu_subset[NUM_COLS + j];
        }
        curr_pos += NUM_COLS;
    }

    clReleaseMemObject(weight_buffer);
    clReleaseMemObject(profit_buffer);
    // BUG FIX: gpu_results_buffer was never released -- leaked one buffer
    // per run of the outer NUM_RUNS loop.
    clReleaseMemObject(gpu_results_buffer);
}
__kernel void knapsack(__global int *gpu_subset, __global int *weight, __global int *profit, int i, int NUM_COLS) { int local_id = get_local_id(0); int weightval = weight[i - 1]; int profitval = profit[i - 1]; int gpu_subset1 = gpu_subset[local_id]; int gpu_subset2 = gpu_subset[local_id - weightval - profixval]; int boolcheck = (local_id >= weightval) && (gpu_subset2 > gpu_subset1); int val = (boolcheck) ? gpu_subset2 : gpu_subset1; gpu_subset[NUM_COLS + local_id] = val; }
Thanks for the tip Micah.
So, basically, to improve performance, try to break if/else blocks down into a simple ?: expression (which compiles to better code, IIRC) that selects which value to use.
Is there anything else I can do for performance? I've been reading about global and local memory; now, is it possible to transfer directly to local memory, bypassing global entirely? I could see this being much faster for accessing the arrays instead of jumping all the way to global memory (especially for such small, simple arrays). Or do I need to transfer to global memory, then local, at which point I could perform all my work and then transfer that local array back?
PS. Do you have any documentation I could use to help with my optimization?
PPS. I had to make a small change to your kernel to correct it, now the correct code is attached (in case anyone else wants it).
__kernel void knapsack(__global int *gpu_subset, __global int *weight, __global int *profit, int i, int NUM_COLS) { int local_id = get_local_id(0); int weightval = weight[i - 1]; int profitval = profit[i - 1]; int gpu_subset1 = gpu_subset[local_id]; int gpu_subset2 = gpu_subset[local_id - weightval] + profitval; int boolcheck = (local_id >= weightval) && (gpu_subset2 > gpu_subset1); int val = (boolcheck) ? gpu_subset2 : gpu_subset1; gpu_subset[NUM_COLS + local_id] = val; }
For anyone looking for another way to implement this, here's my version where there is less transfer back and forth, but requires a barrier for synchronization.
I should note that, on the CPU, this version is less efficient because of the barrier; I'm not sure about the GPU, however, since I haven't yet been able to test the code there.
gpu_knapsack.cl: __kernel void knapsack(__global int *gpu_results, __global int *weight, __global int *profit, int NUM_ROWS, int NUM_COLS, __local int *gpu_subset) { int i = 0; int local_id = get_local_id(0); int weightval = 0; int profitval = 0; int gpu_subset1 = 0; int gpu_subset2 = 0; int boolcheck = 0; int val = 0; int offset = 0; /*start with a row of 0's, set to 0 so we don't need to read from global memory*/ gpu_subset[local_id] = 0; for(i = 1; i < NUM_ROWS; i++) { weightval = weight[i - 1]; profitval = profit[i - 1]; gpu_subset1 = gpu_subset[local_id]; /*results will be undefined if local_id < weightval but that is fine since they will be filtered out by the boolcheck later*/ gpu_subset2 = gpu_subset[local_id - weightval] + profitval; boolcheck = (local_id >= weightval) && (gpu_subset2 > gpu_subset1); val = (boolcheck) ? gpu_subset2 : gpu_subset1; /*offset for where in global memory we'll write to, perform global write*/ offset = (i * NUM_COLS) + local_id; gpu_results[offset] = val; /*force all threads to finish execution before writing our results back into the local array since a thread may be delayed and thus get an updated value instead of the "old" one*/ barrier(CLK_LOCAL_MEM_FENCE); /*now that all threads have finished execution update local memory with our new values for the next iteration*/ gpu_subset[local_id] = val; /*ensure the local changes have been made so we're working with the correct results next iteration this barrier may be necessary on the GPU, however, on the CPU it is not needed*/ /*barrier(CLK_LOCAL_MEM_FENCE);*/ } } main.cpp: for(int x = 0; x < NUM_RUNS; x++) { int rows = NUM_ROWS; int cols = NUM_COLS; cl_mem gpu_results_buffer; cl_mem gpu_subset_buffer; cl_mem weight_buffer; cl_mem profit_buffer; cl_event cmd_event; //current working row of the problem int gpu_subset[NUM_COLS]; weight_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(weight), NULL, &error); profit_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, 
sizeof(profit), NULL, &error); gpu_results_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(gpu_results), NULL, &error); gpu_subset_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(gpu_subset), NULL, &error); error = clEnqueueWriteBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_results), &gpu_results, 0, NULL, &cmd_event); error = clEnqueueWriteBuffer(cmd_queue, weight_buffer, CL_TRUE, 0, sizeof(weight), &weight, 0, NULL, &cmd_event); error = clEnqueueWriteBuffer(cmd_queue, profit_buffer, CL_TRUE, 0, sizeof(profit), &profit, 0, NULL, &cmd_event); error = clEnqueueWriteBuffer(cmd_queue, gpu_subset_buffer, CL_TRUE, 0, sizeof(gpu_subset), &gpu_subset, 0, NULL, &cmd_event); error = clSetKernelArg(kernel, 0, sizeof(gpu_results_buffer), &gpu_results_buffer); error = clSetKernelArg(kernel, 1, sizeof(weight_buffer), &weight_buffer); error = clSetKernelArg(kernel, 2, sizeof(profit_buffer), &profit_buffer); error = clSetKernelArg(kernel, 3, sizeof(int), &rows); error = clSetKernelArg(kernel, 4, sizeof(int), &cols); error = clSetKernelArg(kernel, 5, sizeof(gpu_subset_buffer), NULL); cl_start_time = omp_get_wtime(); //enqueue our kernel to execute on the device error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global, local, 0, NULL, NULL); //wait for execution to finish clFinish(cmd_queue); //read the results from the device cl_event resultsEvent; error = clEnqueueReadBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_results), &gpu_results, 0, NULL, &resultsEvent); cl_end_time = omp_get_wtime(); clReleaseMemObject(weight_buffer); clReleaseMemObject(profit_buffer); cl_run = (cl_end_time - cl_start_time); cl_elapsed += cl_run; }