dstokac

cpu vs. gpu opencl performance

Discussion created by dstokac on Nov 18, 2009
Latest reply on Nov 19, 2009 by nou

To compare the performance of the CPU and the GPU, I wrote a small program with three versions of a somewhat elaborate copy kernel. The first kernel does a straightforward copy from one global buffer to another. The second kernel uses a vectorized (float -> float4) copy, and the third version additionally tries to make use of local memory. Each version of the kernel is executed on both the CPU and the GPU, within a single work-item. The source code of the program is attached at the end of the post.
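In short, the timed inner loops of the three variants look like this (condensed from the attached listing; variable names shortened, and each kernel repeats its copy loop N_iter times):

    // version 1: scalar copy, element by element
    for (int i = 0; i < N; i++)
        t2[i] = t1[i];

    // version 2: vectorized copy; t1 and t2 are __global float4* here, N is N_elem/4
    for (int i = 0; i < N; i++)
        t2[i] = t1[i];

    // version 3: like version 2, but the timed loop copies between two __local float4 buffers
    for (int i = 0; i < N; i++)
        p2_l[i] = p1_l[i];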

The results I get on my system (CPU = dual-core Pentium E5200, GPU = HD4770); execution times are in seconds:

Local memory:
cpu: 32768
gpu: 16384
Memory type:
cpu: 2
gpu: 2
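
(For reference: the sizes come from CL_DEVICE_LOCAL_MEM_SIZE and are in bytes; the type comes from CL_DEVICE_LOCAL_MEM_TYPE, where CL_LOCAL = 1 and CL_GLOBAL = 2 in cl.h. So both devices report __local memory that is emulated in global memory. A minimal query with the same cl.hpp bindings the program uses, where dev is e.g. cpu.devices[0]:

    cl_device_local_mem_type mt = dev.getInfo<CL_DEVICE_LOCAL_MEM_TYPE>();
    std::cout << (mt == CL_LOCAL ? "dedicated (CL_LOCAL)"
                                 : "emulated in global memory (CL_GLOBAL)") << std::endl;
)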
     cpu native - exec.time:       0.24 t2:          0          1          2          3
       cpu copy - exec.time:       0.07 t2:          0          1          2          3
      cpu copy4 - exec.time:       0.03 t2:          0          1          2          3
cpu copy4_local - exec.time:       0.03 t2:          0          1          2          3
       gpu copy - exec.time:       49.3 t2:          0          1          2          3
      gpu copy4 - exec.time:       15.6 t2:          0          1          2          3
gpu copy4_local - exec.time:       15.8 t2:          0          1          2          3


Conclusions:
1) The OpenCL CPU implementation substantially outperforms the native implementation, by about 3x. Since the CPU has only 2 cores, I'm not sure where this speedup comes from.
2) The vectorized version performs better, as expected: about 2.5-3x faster than the scalar version.
3) GPU performance is not comparable to the CPU's; the CPU is a few hundred times faster.


I'm posting these results to hear about other users' experiences. Furthermore, it would be nice to see results of the same program on other systems, particularly systems with dedicated (fast) local memory, which mine apparently lacks. Any comments on how to improve GPU performance relative to CPU performance are welcome, and I would be grateful to anyone who can give a good explanation of the posted results. It would also be nice to know why the CPU's __local memory is associated with CL_GLOBAL (an explanation for the GPU can be found in other threads).
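
(To be clear about what I am not asking: I know the usual advice is to give the GPU many work-items instead of one. Roughly, that would look like the following, with copy4_parallel as a hypothetical kernel name; see the P.S. below for why the benchmark nevertheless keeps the single-work-item structure.

    // one work-item per float4 element; the per-element loop moves into the NDRange
    __kernel void copy4_parallel(__global float4* t1, __global float4* t2, int N_iter)
    {
        int i = get_global_id(0);
        for (int i_iter = 0; i_iter < N_iter; i_iter++)
            t2[i] = t1[i];
    }

    // host side: launch one work-item per float4 element instead of a single one
    queue.enqueueNDRangeKernel(k, cl::NDRange(), cl::NDRange(N_elem/4), cl::NDRange(), NULL, NULL);
)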

P.S. The structure of the kernels is chosen deliberately: it reflects the more complex structure of the kernels I actually use. Of course, this simplified structure could be simplified further, but then it would no longer reflect the demands that the more complex kernels place on the hardware. (Performance gains from async copying don't count either; see the sketch below for what I mean.)
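
By async copying I mean the async_work_group_copy builtin. A rough sketch (copy_async is a hypothetical kernel name, not part of the attached program):

    __kernel void copy_async(__global float* t1, __global float* t2, int N)
    {
        __local float buf[N_max];
        // global -> local, then local -> global, each as an asynchronous group copy
        event_t e = async_work_group_copy(buf, t1, N, 0);
        wait_group_events(1, &e);
        e = async_work_group_copy(t2, buf, N, 0);
        wait_group_events(1, &e);
    }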

#include "CL/cl.hpp" #include <iomanip> #include <string> #include <fstream> #include <vector> #include <iostream> #include <stdlib.h> using namespace std; #define N_elem 4 #define N_iter 15000000 // #define N_iter 1500000 float t1[N_elem]; float t2[N_elem]; class OpCl{ public: cl::Context context; cl::Program::Sources source; cl::Program program; cl::CommandQueue queue; cl::Event last_event; std::vector<cl::Device> devices; OpCl(); OpCl(int context_DEVICE_TYPE, string prog_source_file); }; OpCl::OpCl( cl_int context_DEVICE_TYPE, string prog_source_file) { int err; devices.clear(); context=cl::Context(context_DEVICE_TYPE, NULL, NULL, NULL, &err); devices =context.getInfo<CL_CONTEXT_DEVICES>(); ifstream file(prog_source_file.c_str()); if (!file) { cerr<<"ERROR: file:"<<prog_source_file.c_str()<<" cannot be opened!"<<endl; exit(1); } string prog(istreambuf_iterator<char>(file), (istreambuf_iterator<char>())); source=cl::Program::Sources(1,make_pair(prog.c_str(), prog.length()+1)); program=cl::Program(context, source); err=program.build(devices,""); if (err != CL_SUCCESS) { string str=program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); cerr<<"ERROR in building Program:"<<endl; cerr<<"err:"<<err<<endl; cerr<<"str:"<<str<<endl; exit(EXIT_FAILURE); } queue=cl::CommandQueue(context, devices[0], 0, &err); } class timer{ clock_t t_start; clock_t t_end; bool b_start; public: timer(){t_start=t_end=clock();b_start=0;} void start(){t_start=clock();b_start=1;} void end(){t_end=clock();b_start=0;} float duration() { if (b_start) return (clock()-t_start)/(float)CLOCKS_PER_SEC; else return (t_end-t_start)/(float)CLOCKS_PER_SEC; } }; void calc_cpu_native() { for(int i_iter=0; i_iter<N_iter; i_iter++) for(int i=0; i<N_elem; i++) t2[i]=t1[i]; } void calc_OpCl(cl::Kernel& k, OpCl& o) { o.queue.enqueueNDRangeKernel(k, cl::NDRange(), cl::NDRange(1), cl::NDRange(1), NULL, NULL); o.queue.finish(); } int main() { timer v; OpCl gpu(CL_DEVICE_TYPE_GPU, "opencl_benchmark_v0.1_kernels.cl"); OpCl cpu(CL_DEVICE_TYPE_CPU, "opencl_benchmark_v0.1_kernels.cl"); cl::Kernel k_copy_cpu(cpu.program,"copy",NULL); cl::Kernel k_copy4_cpu(cpu.program,"copy4",NULL); cl::Kernel k_copy4_local_cpu(cpu.program,"copy4_local",NULL); cl::Kernel k_copy_gpu(gpu.program,"copy",NULL); cl::Kernel k_copy4_gpu(gpu.program,"copy4",NULL); cl::Kernel k_copy4_local_gpu(gpu.program,"copy4_local",NULL); cl::Buffer CL_t1_cpu=cl::Buffer(cpu.context, CL_MEM_READ_WRITE, N_elem*sizeof(float), NULL, NULL); cl::Buffer CL_t2_cpu=cl::Buffer(cpu.context, CL_MEM_READ_WRITE, N_elem*sizeof(float), NULL, NULL); cl::Buffer CL_t1_gpu=cl::Buffer(gpu.context, CL_MEM_READ_WRITE, N_elem*sizeof(float), NULL, NULL); cl::Buffer CL_t2_gpu=cl::Buffer(gpu.context, CL_MEM_READ_WRITE, N_elem*sizeof(float), NULL, NULL); k_copy_cpu.setArg(0,CL_t1_cpu); k_copy_cpu.setArg(1,CL_t2_cpu); k_copy_cpu.setArg(2,N_elem); k_copy_cpu.setArg(3,N_iter); k_copy4_cpu.setArg(0,CL_t1_cpu); k_copy4_cpu.setArg(1,CL_t2_cpu); k_copy4_cpu.setArg(2,N_elem/4); k_copy4_cpu.setArg(3,N_iter); k_copy4_local_cpu.setArg(0,CL_t1_cpu); k_copy4_local_cpu.setArg(1,CL_t2_cpu); k_copy4_local_cpu.setArg(2,N_elem/4); k_copy4_local_cpu.setArg(3,N_iter); k_copy_gpu.setArg(0,CL_t1_gpu); k_copy_gpu.setArg(1,CL_t2_gpu); k_copy_gpu.setArg(2,N_elem); k_copy_gpu.setArg(3,N_iter); k_copy4_gpu.setArg(0,CL_t1_gpu); k_copy4_gpu.setArg(1,CL_t2_gpu); k_copy4_gpu.setArg(2,N_elem/4); k_copy4_gpu.setArg(3,N_iter); k_copy4_local_gpu.setArg(0,CL_t1_gpu); k_copy4_local_gpu.setArg(1,CL_t2_gpu); k_copy4_local_gpu.setArg(2,N_elem/4); 
k_copy4_local_gpu.setArg(3,N_iter); cout<<"Local memory:"<<endl; cout<<"cpu: "<<cpu.devices[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()<<endl; cout<<"gpu: "<<gpu.devices[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()<<endl; cout<<"Memory type:"<<endl; cout<<"cpu: "<<cpu.devices[0].getInfo<CL_DEVICE_LOCAL_MEM_TYPE>()<<endl; cout<<"gpu: "<<gpu.devices[0].getInfo<CL_DEVICE_LOCAL_MEM_TYPE>()<<endl; for(int i=0; i<N_elem; i++) t1[i]=i; gpu.queue.enqueueWriteBuffer(CL_t1_gpu, CL_TRUE, 0, N_elem*sizeof(float), &t1[0]); cpu.queue.enqueueWriteBuffer(CL_t1_cpu, CL_TRUE, 0, N_elem*sizeof(float), &t1[0]); v.start(); calc_cpu_native(); cout<<setw(30)<<"cpu native - exec.time: "<<setw(10)<<v.duration(); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; v.start(); calc_OpCl(k_copy_cpu, cpu); cout<<setw(30)<<"cpu copy - exec.time: "<<setw(10)<<v.duration(); cpu.queue.enqueueReadBuffer(CL_t2_cpu, CL_TRUE, 0, N_elem*sizeof(float), &t2[0]); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; v.start(); calc_OpCl(k_copy4_cpu, cpu); cout<<setw(30)<<"cpu copy4 - exec.time: "<<setw(10)<<v.duration(); cpu.queue.enqueueReadBuffer(CL_t2_cpu, CL_TRUE, 0, N_elem*sizeof(float), &t2[0]); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; v.start(); calc_OpCl(k_copy4_local_cpu, cpu); cout<<setw(30)<<"cpu copy4_local - exec.time: "<<setw(10)<<v.duration(); cpu.queue.enqueueReadBuffer(CL_t2_cpu, CL_TRUE, 0, N_elem*sizeof(float), &t2[0]); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; v.start(); calc_OpCl(k_copy_gpu, gpu); cout<<setw(30)<<"gpu copy - exec.time: "<<setw(10)<<v.duration(); cpu.queue.enqueueReadBuffer(CL_t2_gpu, CL_TRUE, 0, N_elem*sizeof(float), &t2[0]); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; v.start(); calc_OpCl(k_copy4_gpu, gpu); cout<<setw(30)<<"gpu copy4 - exec.time: "<<setw(10)<<v.duration(); cpu.queue.enqueueReadBuffer(CL_t2_gpu, CL_TRUE, 0, N_elem*sizeof(float), &t2[0]); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; v.start(); calc_OpCl(k_copy4_local_gpu, gpu); cout<<setw(30)<<"gpu copy4_local - exec.time: "<<setw(10)<<v.duration(); cpu.queue.enqueueReadBuffer(CL_t2_gpu, CL_TRUE, 0, N_elem*sizeof(float), &t2[0]); cout<<" t2:"; cout.precision(3); for(int i=0; i<N_elem; i++) cout<<setw(11)<<t2[i]; cout<<endl; } #define N_max 10 __kernel void copy( __global float* t1, __global float* t2, int N, int N_iter ) { int i_thread=get_global_id(0); for(int i=0; i<N; i++) t2[i_thread*N+i]=0; for(int i_iter=0; i_iter<N_iter; i_iter++) for(int i=0; i<N; i++) t2[i_thread*N+i]=t1[i_thread*N+i]; } __kernel void copy4( __global float4* t1, __global float4* t2, int N, int N_iter ) { int i_thread=get_global_id(0); for(int i=0; i<N; i++) t2[i_thread*N+i]=0; for(int i_iter=0; i_iter<N_iter; i_iter++) for(int i=0; i<N; i++) t2[i_thread*N+i]=t1[i_thread*N+i]; } __kernel void copy_local( __global float* t1, __global float* t2, int N, int N_iter ) { int i_thread=get_global_id(0); __local float p1_l[N_max]; __local float p2_l[N_max]; for(int i=0; i<N; i++) t2[i_thread*N+i]=0; for(int i=0; i<N; i++) p1_l[i]=t1[i_thread*N+i]; for(int i_iter=0; i_iter<N_iter; i_iter++) for(int i=0; i<N; i++) p2_l[i]=p1_l[i]; for(int i=0; i<N; i++) t2[i_thread*N+i]=p2_l[i]; } __kernel void copy4_local( __global float4* t1, __global float4* t2, int N, int N_iter ) { int 
i_thread=get_global_id(0); __local float4 p1_l[N_max]; __local float4 p2_l[N_max]; for(int i=0; i<N; i++) t2[i_thread*N+i]=(float4)(0,0,0,0); for(int i=0; i<N; i++) p1_l[i_thread*N+i]=t1[i_thread*N+i]; for(int i_iter=0; i_iter<N_iter; i_iter++) for(int i=0; i<N; i++) p2_l[i_thread*N+i]=p1_l[i_thread*N+i]; for(int i=0; i<N; i++) t2[i_thread*N+i]=p2_l[i_thread*N+i]; }
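
To try it yourself: assuming the host code is saved as opencl_benchmark.cpp (a hypothetical filename) and the OpenCL headers and library from the Stream SDK are on the compiler's search paths, it should build with something like

    g++ opencl_benchmark.cpp -o opencl_benchmark -lOpenCL

and it expects opencl_benchmark_v0.1_kernels.cl in the working directory, since the kernels are loaded and built at run time.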
