
0-1 Knapsack Optimization

Discussion created by notyou on Aug 3, 2010
Latest reply on Aug 6, 2010 by notyou

Hi everyone,

I've been working on the 0-1 Knapsack problem for the past few days and I've got it running correctly.

I'm now looking for ways to optimize the code I have. I'm fine if you don't want to bother editing my code, but if you can point me to any resources that would help with this, it's much appreciated. Also, if I've left anything out or anything needs more explanation, please let me know.

P.S. I am currently forced to work on the CPU, so anything you can recommend specifically for CPU optimization (or for GPUs later) would be beneficial.

Thanks for looking.

-Matt

knapsack.cl

__kernel void knapsack(__global int *gpu_subset, __global int *weight,
                       __global int *profit, int i, int NUM_COLS)
{
    int local_id = get_local_id(0);

    // Each work-item computes one cell of the current DP row:
    // take item i-1 if it fits and improves the profit, otherwise
    // carry the value down from the previous row.
    if(local_id >= weight[i - 1] &&
       (gpu_subset[local_id - weight[i - 1]] + profit[i - 1]) > gpu_subset[local_id])
        gpu_subset[NUM_COLS + local_id] = gpu_subset[local_id - weight[i - 1]] + profit[i - 1];
    else
        gpu_subset[NUM_COLS + local_id] = gpu_subset[local_id];
}

main.cpp

for(int x = 0; x < NUM_RUNS; x++)
{
    int cols = NUM_COLS;
    cl_mem gpu_results_buffer;
    cl_mem weight_buffer;
    cl_mem profit_buffer;
    cl_event cmd_event;

    // Two rows (previous and current working row).
    // The first row is cleared; we compute the second row on the device,
    // read it back, and reuse it as the first row of the next iteration.
    int gpu_subset[NUM_COLS * 2];

    weight_buffer      = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(weight),     NULL, &error);
    profit_buffer      = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(profit),     NULL, &error);
    gpu_results_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(gpu_subset), NULL, &error);

    error = clEnqueueWriteBuffer(cmd_queue, weight_buffer, CL_TRUE, 0, sizeof(weight), &weight, 0, NULL, &cmd_event);
    error = clEnqueueWriteBuffer(cmd_queue, profit_buffer, CL_TRUE, 0, sizeof(profit), &profit, 0, NULL, &cmd_event);

    error = clSetKernelArg(kernel, 0, sizeof(gpu_results_buffer), &gpu_results_buffer);
    error = clSetKernelArg(kernel, 1, sizeof(weight_buffer), &weight_buffer);
    error = clSetKernelArg(kernel, 2, sizeof(profit_buffer), &profit_buffer);
    error = clSetKernelArg(kernel, 4, sizeof(int), &cols);

    for(int i = 0; i < NUM_COLS * 2; i++)
        gpu_subset[i] = 0;

    // Zero the first row and column of the results table.
    for(int j = 0; j < NUM_COLS; j++)
        gpu_results[j] = 0;
    for(int i = 0; i < NUM_ROWS; i++)
        gpu_results[i * NUM_COLS] = 0;

    int curr_pos = NUM_COLS;
    for(int i = 1; i < NUM_ROWS; i++)
    {
        error = clEnqueueWriteBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_subset), &gpu_subset, 0, NULL, &cmd_event);
        error = clSetKernelArg(kernel, 3, sizeof(int), &i);

        // Enqueue our kernel to execute on the device.
        error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global, local, 0, NULL, NULL);

        // Wait for execution to finish.
        clFinish(cmd_queue);

        // Read the results from the device, since we need them for the next iteration.
        cl_event resultsEvent;
        error = clEnqueueReadBuffer(cmd_queue, gpu_results_buffer, CL_TRUE, 0, sizeof(gpu_subset), &gpu_subset, 0, NULL, &resultsEvent);

        // Copy the new row into the final buffer.
        #pragma omp parallel for
        for(int j = 0; j < NUM_COLS; j++)
        {
            gpu_results[curr_pos + j] = gpu_subset[NUM_COLS + j];
            // Copy the 2nd row into the 1st for use in the next cycle.
            gpu_subset[j] = gpu_subset[j + NUM_COLS];
        }

        curr_pos += NUM_COLS;
    }

    clReleaseMemObject(weight_buffer);
    clReleaseMemObject(profit_buffer);
    clReleaseMemObject(gpu_results_buffer); // release the results buffer too, so it isn't leaked across runs
}
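In case it helps to see what the kernel is computing: each launch fills in one row of the standard 0-1 knapsack DP table, where subset[i][w] is the best profit using the first i items with capacity w. Below is a minimal serial sketch of that recurrence, assuming (as in my code) that NUM_ROWS is the number of items plus one and NUM_COLS is the capacity plus one; knapsackSerial is just a name I'm using here, not part of the code above.

#include <algorithm>
#include <vector>

// Serial reference for the recurrence the kernel evaluates one row at a time.
// subset is stored row-major, NUM_ROWS x NUM_COLS, matching gpu_results above.
std::vector<int> knapsackSerial(const std::vector<int>& weight,
                                const std::vector<int>& profit,
                                int NUM_ROWS, int NUM_COLS)
{
    std::vector<int> subset(NUM_ROWS * NUM_COLS, 0); // first row and column stay zero

    for (int i = 1; i < NUM_ROWS; i++)
        for (int w = 0; w < NUM_COLS; w++)           // the kernel runs this loop in parallel
        {
            int keep = subset[(i - 1) * NUM_COLS + w];          // skip item i-1
            if (w >= weight[i - 1])                             // item i-1 fits: maybe take it
                keep = std::max(keep,
                                subset[(i - 1) * NUM_COLS + (w - weight[i - 1])] + profit[i - 1]);
            subset[i * NUM_COLS + w] = keep;
        }

    return subset; // subset[(NUM_ROWS - 1) * NUM_COLS + (NUM_COLS - 1)] is the answer
}

Each pass of the inner loop corresponds to one kernel launch in the loop over i in main.cpp, which is why I only keep two rows (previous and current) on the device at a time.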
