Hello,
I took the vectorAdd example using HIP from github (https://github.com/ROCm-Developer-Tools/HIP-Examples/blob/master/vectorAdd/vectoradd_hip.cpp) and added C++ timers around compute only part. Aside from this, I also added timers around simple addition on CPU. I see that vadd on cpu is much faster than gpu. Why is that so?
Can someone help me explain what am i missing here?
hipcc -w -g -std=c++14 vectoradd_hip.cpp -o out.exe
./out.exe
System minor 0
System major 9
agent prop name Vega 10 [Radeon Instinct MI25]
hip Device prop succeeded
GPU duration : 104.737 ms
CPU duration : 1.521 ms
PASSED!
Kernel code :
#define WIDTH 1024
#define HEIGHT 1024
#define NUM (WIDTH*HEIGHT)
#define THREADS_PER_BLOCK_X 16
#define THREADS_PER_BLOCK_Y 16
#define THREADS_PER_BLOCK_Z 1
__global__ void
vectoradd_float(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = b[i] + c[i];
}
}
Host code :
GPU part :
{
auto start = std::chrono::steady_clock::now();
hipLaunchKernelGGL(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,deviceC ,WIDTH ,HEIGHT);
auto stop = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli> elapsed = stop - start;
printf("GPU duration : %0.3f ms\n", elapsed.count());
}
CPU part :
{
auto start = std::chrono::steady_clock::now();
for (i = 0; i < NUM; i++) {
cpu_out[i] = hostB[i] + hostC[i];
}
auto stop = std::chrono::steady_clock::now();
std::chrono::duration<double, std::milli> elapsed = stop - start;
printf("CPU duration : %0.3f ms\n", elapsed.count());
}