1 Reply Latest reply on Jun 3, 2010 6:58 AM by Chocrates

    Help. Calculating running time

    zhngzjn

      Hello, every one. I'm a beginner of OpenCL.

      I downloaded a sample code , and I made a slight change to see if OpenCL works well with quad-core CPU .

      Strangely enough, it seems that the OpenCL code got the correct result 10 times faster than the corresponding serial code. I thought the speed-up ratio couldn't be greater than 4 .

      Did I make any mistake in calculating the times? Or anywhere else?

      #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> //#include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <CL/cl.h> #include <time.h> //////////////////////////////////////////////////////////////////////////////// // Use a static data size for simplicity // #define DATA_SIZE (102400) //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes results[i]=1+1+……+1=i // const char *KernelSource = "\n" \ "__kernel void test( \n" \ " __global float* input, \n" \ " __global float* output, \n" \ " const unsigned int count) \n" \ "{ \n" \ " int i = get_global_id(0); int j; \n" \ " output[i]=0; \n" \ " for (j=0;j<i;j++) \n" \ " output[i] = output[i]+1; \n" \ "} \n" \ "\n"; //////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { int err; // error code returned from api calls float *data=new float[DATA_SIZE]; // original data set given to device float *results=new float[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id[4]; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // cl_int status = 0; cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_CPU, 1, device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id[0], 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "test", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } int start=clock(); // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // global = count; local =1; cl_event *cev; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, cev); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0,NULL, NULL ); int end=clock(); printf("OPENCL time:%d\n",end-start); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; int ir; start=clock(); for(i = 0; i < count; i++) { for ( ir=0;ir<i;ir++) results[i]=results[i]-1 ; } end=clock(); // Print a brief summary detailing the results // printf("CPU time:%d\n",end-start); for(i = 0; i < count; i++) { if (fabs(results[i])<0.0000001) correct++; } printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); delete data; delete results; return 0; }