Hi,
Could anyone help me? I have a matrix-vector multiplication program written in OpenCL/C. I call this function from Fortran to do a matrix-vector multiplication, e.g. Ax = b. The A matrix does not change; however, x is updated on successive calls.
How can I reuse A on successive calls without reinitialising it and copying it to the GPU again? That copy takes up most of the execution time.
salomonamd,
i did matrix multiplication using local memory from this code.
A and B are input matrices C is the output.
The dimensions of the matrices are the same, i.e. widthA = widthB = widthC.
You can refer to matrix multiplication sample which came with SDK.
Although that sample does not use local memory, the arrays are copied to device memory, which helps efficiency.
hope it helps.
/* Write MMM using local memory */ #pragma OPENCL EXTENSION cl_amd_printf:enable void __kernel mmmLocal(int widthA, int widthB, int widthC, __global float* A, __global float* B, __global float* C, int BLOCK_SIZE, __local float* A_local, __local float* B_local) { int xGlobal=get_global_id(0); int xLocal=get_local_id(0); int yGlobal=get_global_id(1); int yLocal=get_local_id(1); int BlockNum=get_global_size(0)/get_local_size(0); float CSum=0; //if(xGlobal<=5&&xGlobal>=4&& yGlobal>=4 && yGlobal<=5) //{ int NumBlock=widthC/BLOCK_SIZE; for(int j=0;j<NumBlock;j++) { //printf("Global=%d %d Local=%d%d\n",xGlobal,yGlobal,xLocal,yLocal); A_local[(yLocal)*BLOCK_SIZE+xLocal]=A[yGlobal*widthA + j*BLOCK_SIZE + xLocal]; //printf(" block vlaue fetched=%f\n",A_local[(yLocal)*BLOCK_SIZE+xLocal]); B_local[yLocal*BLOCK_SIZE+xLocal]=B[(yLocal+j*BLOCK_SIZE)*widthA+xGlobal]; //printf(" block vlaue fetched=%f\n",B_local[(yLocal)*BLOCK_SIZE+xLocal]); barrier(CLK_LOCAL_MEM_FENCE); for(int i=0;i<BLOCK_SIZE;i++) { CSum+=A_local[yLocal*BLOCK_SIZE+i]*B_local[ i*BLOCK_SIZE + xLocal]; //printf("%f X %f=%f\n",A_local[yLocal*BLOCK_SIZE+i],B_local[ i*BLOCK_SIZE + xLocal],CSum); } barrier(CLK_GLOBAL_MEM_FENCE); } //printf("%f\n",CSum); C[yGlobal*widthA+xGlobal]=CSum; //} }
Hi,
yes I also use a kernel that utilises local memory. I got the following kernel from http://www.bealto.com/gpu-gemv_v3.html
However, it is on the host where I need the optimisations, because copying large matrices to the GPU is time consuming.
Let me explain a little better. The gemv program works great, but because I make consecutive calls to gemv on the GPU, copying both A and x takes a lot of time. The A matrix stays the same, but x is updated at each iteration. Therefore I need some way to call my kernel and copy only x to the GPU at each iteration. This is all in the host code, e.g.:
enter subroutine( A, x, y, i )
create clcontext (setup/initialisation)
if ( i=1 )
clEnqueueWriteBuffer( A, x )
else
clEnqueueWriteBuffer( x )
endif
call gemv( A, x )
clEnqueueReadBuffer( y )
exit subroutine
I hope this is more clear? I basically need to save the context somehow because on exit of the subroutine the context gets deleted sometimes or on some systems.
// P threads per row compute 1/P-th of each dot product. // WORK has N/P entries. __kernel void gemv3(__global const scalar_t * a,__global const scalar_t * x, __global scalar_t * y, __local scalar_t * work, int m,int n) { // Load a slice of X in WORK, using all available threads int ncols = n / get_global_size(COL_DIM); // nb values to load int col0 = ncols * get_global_id(COL_DIM); // first value to load for (int k=0;k<ncols;k+=get_local_size(ROW_DIM)) { int col = k+get_local_id(ROW_DIM); if (col < ncols) work[col] = x[col0+col]; } barrier(CLK_LOCAL_MEM_FENCE); // sync group // Compute partial dot product scalar_t sum = (scalar_t)0; for (int k=0;k<ncols;k++) { sum += a[get_global_id(ROW_DIM)+m*(col0+k)] * work
; } // Store in Y (P columns per row) y[get_global_id(ROW_DIM)+m*get_global_id(COL_DIM)] = sum; } // Reduce M = get_global_size(0) rows of P values in matrix Y. // Stores the result in first column of Y. __kernel void reduce_rows(__global scalar_t * y,int m,int p) { int row = get_global_id(0); scalar_t sum = (scalar_t)0; for (int col=0;col<p;col++) sum += y[row + m*col]; y[row] = sum; }
salomonamd,
What is it that you are not able to do in order to implement your multiplication algorithm?
salomonamd
Come on, this is not an OpenCL question.
Either make the context a member variable or pass it as a function parameter.
Thanx, himanshu.gautam and redditisgreat,
I do have the context as part of a member variable already. I had some issues passing the structure of which cl_context forms a part of between Fortran and C.
I guess problem solved.