3 Replies Latest reply on Oct 12, 2015 2:02 PM by doqtor

    Poor kernel device enqueue performance

    doqtor

      Hi guys,

       

      Am I doing something wrong here? Device side enqueue is 5 times slower than host side enqueue in the below simple example:

       

      Kernels:

       

      __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
      void sum(int elem, __global int *a, __global int *b, __global int *c)
      {
          // Element-wise add of one work-group-sized (256-wide) slice:
          // c[i] = a[i] + b[i] for the chunk selected by 'elem'.
          const int idx = elem * (int)get_local_size(0) + (int)get_local_id(0);

          c[idx] = a[idx] + b[idx];
      }

       

      __kernel __attribute__((reqd_work_group_size(1, 1, 1)))
      void sum2(int total, __global int *a, __global int *b, __global int *c)
      {
          // Single-work-item launcher: enqueues 'total' child grids of
          // 256 work-items each onto the device's default queue.
          const queue_t  q     = get_default_queue();
          const ndrange_t grid = ndrange_1D(256, 256);  // global 256, local 256

          for (int chunk = 0; chunk < total; ++chunk)
          {
              // NOTE(review): the status returned by enqueue_kernel is
              // discarded here, matching the original snippet.
              enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT, grid,
                  ^{ sum(chunk, a, b, c); });
          }
      }

       

      Host side enqueue:

       

      // Host-side enqueue benchmark: launch one 256-work-item group per
      // iteration and time the full batch (enqueue overhead + execution,
      // bounded by the final finish()).
      const size_t iterations = 2000;
      std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
      for (size_t iter = 0; iter < iterations; ++iter)
      {
          kernel.setArg(0, static_cast<int>(iter));  // 'elem' argument of sum()
          // Fix: 'kernel' is used with dot-access in setArg above, so it is a
          // cl::Kernel value — '*kernel' was an inconsistent dereference.
          queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(256, 1), cl::NDRange(256, 1));
      }
      queue.finish();  // wait for all launches before stopping the clock
      std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
      std::cout << "kernel took " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms.\n";
      std::cout << std::endl;

       

      Device side enqueue:

       

      // Device-side enqueue benchmark: a single 1x1 launch of sum2, which
      // then enqueues all 'iterations' child kernels from the device.
      const size_t iterations = 2000;
      std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
      kernel.setArg(0, static_cast<int>(iterations));  // 'total' argument of sum2()
      // Fix: 'kernel' is used with dot-access in setArg above, so it is a
      // cl::Kernel value — '*kernel' was an inconsistent dereference.
      queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1, 1), cl::NDRange(1, 1));
      queue.finish();  // wait for parent and all device-enqueued children
      std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
      std::cout << "kernel took " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms.\n";
      std::cout << std::endl;

       

      My set up:

      Ubuntu 14.04, R9 295, fglrx 15.20.3, AMD APP SDK 3.0