Answered (Assumed Answered)

Asynchronous pinned transfers?

Question asked by dmyablonski on May 6, 2012
Latest reply on May 17, 2012 by dmyablonski

I'm trying to modify the BufferBandwidth sample from the AMD APP SDK so that I can run multiple threads that concurrently transfer data either to the same device or to a second device in the system.

 

Everything works fine with a single thread doing HOST->DEVICE transfers (this basically does the same thing as the SDK code).

 

When I use two threads, both sending data host to device, I seem to get full speed with one thread and half speed (twice as long to transfer) for the second thread.


Is there an issue with using two pinned buffers simultaneously? I have a tracing tool from my company, and with it I can see that the threads start within microseconds of each other, but one takes twice as long to finish. The expectation, of course, would be that both finish at around the same time. Are writes from a mapped (pinned) buffer not performed by DMA?

 

Each thread has its own context and queue.

Each thread does the following:

 ...
  // Create the buffer that the later clEnqueueWriteBuffer call writes INTO
  // (despite the "host" in its name, it is used as the write destination).
  // NOTE(review): host_ptr is passed without CL_MEM_USE_HOST_PTR or
  // CL_MEM_COPY_HOST_PTR; the OpenCL spec requires host_ptr to be NULL in that
  // case (otherwise CL_INVALID_HOST_PTR) -- confirm host_ptr is NULL here.
  // NOTE(review): ret is not checked after this call.
  mem_host = clCreateBuffer(context, CL_MEM_READ_ONLY, data_bytes, host_ptr, &ret);

  // Allocate the host-side backing store for the second buffer.
  void* memscratch;
  // Requests data_bytes bytes aligned to a 4096-byte boundary.
  // NOTE(review): the return value is not checked; on failure memscratch must
  // not be used.
  posix_memalign( &memscratch, 4096, data_bytes );


  // Create a buffer object that uses memscratch as its storage
  // (CL_MEM_USE_HOST_PTR). Its mapped pointer is later used as the SOURCE of
  // the timed write.
  // NOTE(review): ret is overwritten here without having been checked.
  mem_dev = clCreateBuffer( context,
                            CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
                            data_bytes,
                            memscratch, &ret);


  // Map the whole buffer (offset 0, data_bytes bytes) for reading and writing.
  // The author's intent is to pre-pin the memory; whether mapping pins it is
  // implementation-specific -- TODO confirm against the vendor's guide.
  // The map is non-blocking (CL_FALSE) and no event is requested, so dev_ptr
  // must not be used until the queue has been drained below.
  void* dev_ptr;
  dev_ptr=(void*)clEnqueueMapBuffer(  command_queue,
                                      mem_dev,
                                      CL_FALSE,
                                      CL_MAP_READ | CL_MAP_WRITE,
                                      0,
                                      data_bytes,
                                      0, NULL,
                                      NULL, &ret);
  // Submit the map command and block until everything queued so far has
  // completed; after clFinish returns, the mapping is done.
  // NOTE(review): return codes of both calls are ignored.
  clFlush(command_queue);
  clFinish(command_queue);


....


    start_tod_timer(&start_timer);
    clEnqueueWriteBuffer( command_queue,
                                mem_host,
                                CL_FALSE, 0,
                                data_bytes,
                                dev_ptr,
                                0, NULL, &ev);
    clFlush(command_queue);


    cl_int param_value;
    size_t param_value_size_ret;


    while(1)
    {
      ret |= clGetEventInfo( ev,
          CL_EVENT_COMMAND_EXECUTION_STATUS,
          sizeof( cl_int ),
          &param_value,
          &param_value_size_ret );


      if( param_value == CL_COMPLETE )
        break;
    }
    clReleaseEvent( ev );


    tod_res[i] = stop_tod_timer(&start_timer);

Outcomes