Answered / Assumed Answered

Poor kernel device enqueue performance

Question asked by doqtor on Sep 1, 2015
Latest reply on Oct 12, 2015 by doqtor

Hi guys,

 

Am I doing something wrong here? In the simple example below, device-side enqueue is 5 times slower than host-side enqueue:

 

Kernels:

 

/*
 * Element-wise addition of one work-group-sized slice: c = a + b.
 *
 * elem  index of the slice to process; the slice starts at
 *       elem * get_local_size(0)
 * a, b  input buffers
 * c     output buffer
 *
 * Each work-item handles the single element at its local id within the slice.
 */
__kernel __attribute__((reqd_work_group_size(256, 1, 1)))
void sum(int elem, __global int *a, __global int *b, __global int *c)
{
    // Same offset is used for both inputs and the output, so compute it once.
    const int idx = elem * (int)get_local_size(0) + (int)get_local_id(0);

    c[idx] = a[idx] + b[idx];
}

 

/*
 * Device-side launcher: enqueues `total` child launches of sum(), one per
 * slice index 0 .. total-1, onto the default device queue.
 *
 * total    number of child launches to enqueue
 * a, b, c  buffers forwarded unchanged to every sum() call
 */
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void sum2(int total, __global int *a, __global int *b, __global int *c)
{
    int elem = 0;
    while (elem < total)
    {
        // Child work for slice `elem`, wrapped in a block for enqueue_kernel.
        void (^child)(void) = ^{ sum(elem, a, b, c); };

        // NOTE(review): the status returned by enqueue_kernel is discarded,
        // so a failed enqueue would go unnoticed here.
        enqueue_kernel(get_default_queue(),
                       CLK_ENQUEUE_FLAGS_NO_WAIT,
                       ndrange_1D(256, 256),
                       child);

        ++elem;
    }
}

 

Host-side enqueue:

 

const size_t iterations = 2000;

std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();

for (size_t iter = 0; iter < iterations; ++iter)

{

    kernel.setArg(0, static_cast<int>(iter));

    queue.enqueueNDRangeKernel(*kernel, cl::NullRange, cl::NDRange(256, 1), cl::NDRange(256, 1));

}

queue.finish();

std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

std::cout << "kernel took " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms.\n";

std::cout << std::endl;

 

Device-side enqueue:

 

// Device-side variant: the host enqueues a single one-work-item launch and
// passes the iteration count as argument 0 (presumably the `sum2` kernel,
// which then enqueues the child launches itself — confirm which kernel is bound).
// NOTE(review): `kernel` is accessed with `.` for setArg but dereferenced with
// `*` for the enqueue — confirm its type against the surrounding code.
const size_t iterations = 2000;

const auto t_begin = std::chrono::steady_clock::now();
kernel.setArg(0, static_cast<int>(iterations));
queue.enqueueNDRangeKernel(*kernel, cl::NullRange, cl::NDRange(1, 1), cl::NDRange(1, 1));
queue.finish();  // the measurement includes waiting for the queue to finish
const auto t_end = std::chrono::steady_clock::now();

const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_begin).count();
std::cout << "kernel took " << elapsed_ms << "ms.\n" << std::endl;

 

My setup:

Ubuntu 14.04, R9 295, fglrx 15.20.3, AMD APP SDK 3.0

Outcomes