Archives Discussions

arvin99 · ‎12-05-2013

Hi,

I need your help. I create an OpenCL program using pinned buffer.

The duration to transfer buffer from device to host is fast as I expected.

My question is, is it normal if the transfer rate of pinned buffer (size is 256 MB) exceed the PCI-e bandwidth (PCI-e 2.0 x 16)??

The transfer rate of pinned buffer is 204582 GB/s ( iget the transfer rate from AMD APP Profiler)

arvin99 · ‎12-05-2013

Can someone help me??

I already search many times and I find that pinned buffer can increase transfer rate but it is limited to PCI-e bandwidth.

In my case, the transfer rate is exceed PCI-e bandwidth (8 GB in each direction)

himanshu_gautam · ‎12-06-2013

Ya true, pinned buffer can increase the transfer rate but it is limited to PCI-e bandwidth.

For more details check Types of Memory used by Runtime in the "AMD Accelerated Parallel Processing OpenCL Programming Guide" available in AMD APP SDK Reference materials.

http://developer.amd.com/wordpress/media/2013/07/AMD_Accelerated_Parallel_Processing_OpenCL_Programm...

arvin99 · ‎12-06-2013

Thanks for reply.

Is it applied in zero copy buffer too (zero copy buffer transfer rate is limited to PCI-e bandwidth)??

If it is true, why my pinned buffer transfer rate (device to host) exceed PCI-e bandwith (204582 GB/s > 8 GB/s)??

Is it a bug from AMD APP Profiler or there are mistakes in my program (Matrix Multiplication program using 2D work item) ??

I make a new test case with matrix 4096 x 4096 (64 MB) and the pinned buffer transfer rate is 93703 GB/s ( 93703 GB/s > 8GB/s).

Is it because of the asynchronous transfer ??

Here is the code:

#include <Windows.h>

#include <iostream>

#include <fstream>

#include <string>

using namespace std;

#ifdef __APPLE__

#include <OpenCL/opencl.h>

#else

#include <CL/cl.h>

#endif

#define MAX_SOURCE_SIZE (0x100000)

#define SIZE 4096

cl_context context = NULL;

cl_command_queue queue = NULL;

cl_program program = NULL;

//Function to read RandomNumber.txt and fill matrix

void fillMatrix(cl_float* data, int size)

{

//Get text file to read

ifstream myfile("RandomNumber.txt");

if(myfile.is_open())

{

for(int i = 0; i < size*size; ++i)

{

myfile >> data;

if(((i + 1) % size) == 0)

{

//ignore until the next line

myfile.ignore(INT_MAX, '\n');

}

myfile.close();

}

else cout << "Unable to open file";

}

//Function to print matrix

void printMatrix(cl_float* matrix, int totalsize, int size)

{

for(int i = 0; i < totalsize; i++)

{

cout<<matrix<<" ";

if(((i + 1) % size) == 0)

{

cout<<endl;

}

cout<<endl;

}

void MatrixMul(cl_mem d_A, cl_mem d_B, cl_mem d_C, int size)

{

cl_int err;

cl_kernel naive;

// Create Kernel Object Bound To Kernel Function

naive = clCreateKernel(program, "naiveAlgorithm", &err);

//Set size of global work item and work tem in each work goups

int globalsize = size;

int localsize;

if(globalsize >= 16)

{

localsize =16;

}else

{

localsize = globalsize;

}

size_t global_work_items [2] = {globalsize, globalsize};

size_t local_work_items [2] = {localsize, localsize};

// Setup Kernel Argument

err = clSetKernelArg(naive, 0, sizeof(cl_mem), (void *)&d_A);

err = clSetKernelArg(naive, 1, sizeof(cl_mem), (void *)&d_B);

err = clSetKernelArg(naive, 2, sizeof(cl_mem), (void *)&d_C);

err = clSetKernelArg(naive, 3, sizeof(cl_int), (void *)&size);

// Execute OpenCL kernel for Naive Algorithm

err = clEnqueueNDRangeKernel(queue, naive, 2, NULL, global_work_items, local_work_items, 0, NULL, NULL);

clFinish(queue);

//Release Kernel

err = clReleaseKernel(naive);

}

void Naive(cl_float* matrixA, cl_float* matrixB, cl_float* matrixC, int size)

{

int err;

// OpenCL device memory for matrices

cl_mem d_A;

cl_mem d_B;

cl_mem d_C;

// Allocate Device Memory For Input And Output

d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);

d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)*size*size, 0, &err);

d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR , sizeof(cl_float)*size*size, 0, &err);

// Copy Host Memory To Memory Device

err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixA, 0, NULL, NULL); //Copy matrixA to d_a

err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixB, 0, NULL, NULL); //Copy matrixB to d_b

MatrixMul(d_A, d_B, d_C, size);

// Copy Device Memory To Host Memory

err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);

err = clReleaseMemObject(d_A);

err = clReleaseMemObject(d_B);

err = clReleaseMemObject(d_C);

}

//Main Function

int main(int argc, char **argv)

{

cl_int size = 4096;

//Matrix for input and output

cl_float * matrixA;

cl_float * matrixB;

cl_float * matrixC;

//Allocate and init memory for the host

matrixA = (cl_float *) malloc(size*size*sizeof(cl_float));

matrixB = (cl_float *) malloc(size*size*sizeof(cl_float));

matrixC = (cl_float *) malloc(size*size*sizeof(cl_float));

//Fill matrix

fillMatrix(matrixA,size);

fillMatrix(matrixB,size);

//print input for matrix A and B

cout<<"Input for matrix A :"<<endl;

printMatrix(matrixA, size*size, size);

cout<<"Input for matrix B :"<<endl;

printMatrix(matrixB, size*size, size);

cl_int err; // error code

cl_platform_id* platforms;

cl_uint platformCount;

cl_device_id device;

int platformtype = 0; //if 0 using amd app sdk but if 1 using intel sdk

clGetPlatformIDs(0, NULL, &platformCount); //get number of platform

platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); //create memory allocation for each platform

clGetPlatformIDs(platformCount, platforms, NULL); //get list of platform

clGetDeviceIDs (platforms [platformtype], CL_DEVICE_TYPE_GPU, 1, &device, NULL); //get list of devices

const cl_context_properties contextProperties [] =

{

CL_CONTEXT_PLATFORM,

reinterpret_cast<cl_context_properties> (platforms [platformtype]),

0, 0

};

// Create OpenCL context

context = clCreateContext(contextProperties, 1, &device, NULL, NULL, &err);

// Create a command queue

queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);

//Load Kernel Source

FILE *fp;

const char fileName[] = "./MatMul_Kernel.cl";

size_t source_size;

char *source_str;

fp = fopen(fileName, "r");

if (!fp)

{

fprintf(stderr, "Failed to load kernel.\n");

exit(1);

}

source_str = (char *)malloc(MAX_SOURCE_SIZE);

source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);

fclose(fp);

// Create Program Object

program = clCreateProgramWithSource(context, 1, (const char **) &source_str, (const size_t *) &source_size, &err);

// Build Program

err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);

Naive(matrixA, matrixB, matrixC, size);

//Cleanup all memory

err = clFlush(queue);

err = clFinish(queue);

err = clReleaseProgram(program);

err = clReleaseCommandQueue(queue);

err = clReleaseContext(context);

// Display result of matrix multiplication

cout<<"Output for matrix C :"<<endl;

printMatrix(matrixC, size*size, size);

cout<<endl;

free(matrixA);

free(matrixB);

free(matrixC);

free(source_str);

return 0;

}

And here is the image from AMD APP Profiler:

arvin99 · ‎12-06-2013

I already read the AMD documentation. Looks like I am misunderstanding.

1. If I use standard buffer (Default Flag), the process of transfer happened two times :

pageable buffer (host) -> page-locked buffer / pinned buffer (host) -> buffer (device)

2. If I use pinned buffer d_C (CL_MEM_ALLOC_HOST_PTR), the transfer rate will increase but it is limited with PCIe bandwidth.

page-locked buffer / pinned buffer (host) -> buffer (device)

3. If I use zero copy, the transfer rate increase greatly since the GPU access kernel directly in the pinned host memory without do any transfer,

but the drawback is: the kernel execution time become slow especially for discrete GPU (because of slower access of GPU to read kernel).

Is my explanation correct??

So, if my explanation is correct, my pinned buffer transfer rate (CL_MEM_ALLOC_HOST) must not exceed PCI-e bandwith (204582 GB/s > 8 GB/s)

unless I use zero copy (CL_MEM_ALLOC_HOST + mapBuffer), is n't it??

Is OpenCL automatically use zero copy if the transfer rate faster than PCIe bandwidth??

arvin99 · ‎12-06-2013

Here is the image of GPU-Z that show my PCIe version:

Here is the PCIe speed test result from program that provided by ATI:

From the result, it seems the limit of my PCIe 2.0x16 bandwidth is ~5.8GB/sec (in theory is 8GB/sec) when I copy from GPU to CPU.

Looks like there is no problem in my hardware, so why the transfer rate from device (GPU) to host(CPU) become higher than PCI-e bandwidth

whenever I don't use clEnqueueMapBuffer?

nou · ‎12-07-2013

use of CL_MEM_ALLOC_HOST_PTR cause it to use zero copy buffer. so there are no transfer across PCIe which explains that high transfer rates. refer to table 5.2 in AMD APP OpenCL programming guide.

arvin99 · ‎12-07-2013

Thanks for reply, Nou.

I become confused because the table just explain the relation between "the flag in clCreateBuffer" with "clEnqueueMapBuffer".

The table does n't explain the relation between "the flag in clCreateBuffer" with "clEnqueueReadBuffer/WriteBuffer".

So,according to the table 5.2 (I already understand in this part):

CL_MEM_ALLOC_HOST_PTR + mapBuffer -> zero copy

DEFAULT FLAG + mapBuffer -> copy (if device CPU, it become zero copy buffer)

And from my experience:

DEFAULT FLAG + enqueueWriteBuffer/ReadBuffer -> copy

From your explanation:

CL_MEM_ALLOC_HOST_PTR + enqueueWriteBuffer/ReadBuffer -> zero copy (this part make me confuse)

In conclusion, what make a buffer is zero copy buffer is the parameter CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR although I use enqueueWriteBuffer/ReadBuffer.

Am I correct??

In documentation (5.6.2.3 Pre-pinned Buffers), to make a pinned buffer (it must not exceed PCIe bandwidth), we can use CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR.

So, how to make a pinned buffer (not zero copy buffer) without the use of enqueueReadBuffer/WriteBuffer??

If I use enqueueReadBuffer/WriteBuffer with CL_MEM_ALLOC_HOS_PTR, it will become zero copy buffer, is n't it??

Or pinned buffer (with flag CL_MEM_ALLOC_HOST_PTR) always become zero copy buffer??

nou · ‎12-08-2013

Oh I overlooked that you are talking about clEnqueueReadBuffer(). It may be a bug in profiler code. Try set it to unblocking read and call clFinish manually. only clEnqueuMap/Unmap can be zero copy so this is clearly a bug in profiler.

arvin99 · ‎12-08-2013

Thanks for your help but I already try that and the result is same,

using CL_MEM_ALLOC_HOST_PTR + enqueuereadbuffer = zero copy (transfer rate ~15000 GB/s)

	// Allocate Device Memory For Input And Output
	d_A = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)sizesize, 0, &err);
	d_B = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_float)sizesize, 0, &err);
	*d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY \| CL_MEM_ALLOC_HOST_PTR , sizeof(cl_float)sizesize, 0, &err);*

	// Copy Host Memory To Memory Device

	err = clEnqueueWriteBuffer(queue, d_A, CL_FALSE, 0, sizeof(cl_float)sizesize, matrixA, 0, NULL, NULL); //Copy matrixA to d_a
	clFinish(queue);
	err = clEnqueueWriteBuffer(queue, d_B, CL_FALSE, 0, sizeof(cl_float)sizesize, matrixB, 0, NULL, NULL); //Copy matrixB to d_b
	clFinish(queue);

err = clEnqueueReadBuffer(queue, d_C, CL_FALSE, 0, sizeof(cl_float)*size*size, matrixC, 0, NULL, NULL);

clFinish(queue);

Are there a relation with discrete GPU because my GPU support VM (virtual memory)?? Or are there a relation with buffer size??

I use AMD App SDK 2.6 + AMD App Profiler 2.4

arvin99 · ‎12-10-2013

Hey, I already use unblocking call in clEnqueueWriteBuffer and using clFinish(queue) but the result is zero copy.

And I already try to install AMD APP SDK 2.5 and 2.7 but using clEnqueueReadBuffer/WriteBuffer with CL_MEM_ALLOC_HOST_PTR still result in zero copy.

I already upgrade to AMD APP Profiler 2.5 and the result is zero copy too.

Can someone help me ?? I just want to get the result of pinned buffer, not zero copy buffer??

amd_support · ‎03-03-2014

Hi,

We tried to compile/run your code. However RandomNumbers.txt and MatMul_Kernel.cl are missing.

Could you please provide them?

Thanks,

AMD Support

youwei · ‎10-10-2015

It is not a bug. It is zero-copy according to the documentation.

cl_mem d_C in your code is a buffer on your host because your specify the flag.

When you pass it as a kernel argument, the doc says " If the buffer is used in a kernel, the runtime creates a cached copy on the device, and subsequent copies are not on the fast path." So possibly your kernel will write to cache in device memory and after the execution the memory area is implicitly copied back (not at paged locked rate).

Even if you do not enqueue the read buffer command, the result already reside in your buffer d_C.

Archives Discussions

Transfer Rate exceed PCI-e bandwith