4 Replies Latest reply on Jun 7, 2011 1:16 PM by himanshu.gautam

    Kernel does not give correct output

    richeek.arya

      Hi all,

      I have written a kernel that produces correct output on GTX 260 but when I run the same code on ATI Radeon 5450 it gives incorrect output. I am not sure why this is happening.On the GTX side when I open resource monitor I see physical memory usage shooting up hence I am little concern about freeing up the memory too.

      I am interfacing this OpenCL code with Matlab. And I have written an interface function that makes the OpenCL context, initializes device variables and calls the kernel.  In the end it frees up the variable.

      Could someone kindly check if my variable initialization is correct? The output of the kernel is a a variable LLR that I am reading back using clEnqueueReadBuffer(), is doing that correct?

      Is there a better way of initializing variables?? Am I not freeing up the memory correctly?? Any help would be highy appreciated.

      #include <string.h> #include"cl_resources.h" #ifdef _CHAR16T #define CHAR16_T #endif #include "mex.h" #include <string.h> #include"cl_resources.h" #ifdef _CHAR16T #define CHAR16_T #endif #include "mex.h" void mexFunction (int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { cl_int status; /* Initialize OpenCL resoruces */ status = initializeCL(); if(status != CL_SUCCESS) return; // input variables float *R_re, *R_im, *dist_ZF,*s_re, *s_im , *symbol_alphabet_re, *symbol_alphabet_im; bool *bittable; int kk, nR, nT, nSym, *M, *symbols_ZF_i, total_bits,Sum_M,max_2_M; size_t size; cl_event events[2], ev; // output variables float *LLR; if (nrhs!=7) mexErrMsgTxt("7 input arguments required \n\n" "[LLR] = soft_sd(R,s,dist_ZF,symbols_ZF,symbol_alphabet,bittable) ... soft Sphere Decoder\n\n" " R ... upper triangular matrix obtained from the QR decomposition of the channel H (complex)\n" " s ... received symbol vector, s=Q^H*y (nR x nSym) (complex)\n" " dist_ZF ... Distance of the zero forcing solution (real)\n" " symbols_ZF_i ... indices to symbols of the ZF solution (nT x nSym) (real integer)\n" " M ... number of bits in the corresponding layer (1 x nR) (real)\n" " symbol_alphabet ... for the demapping (2^M_max x nT) (complex)\n" " bittable ... matrix containing the bits according to the symbol_alphabet (M x 2^M) (logical)\n" " LLR ... max-log-MAP approximation of the LLR values (M*nR) (real)\n\n"); if (nlhs>1) mexErrMsgTxt("One output lefthand argument required \n"); // check input variables if ( ! mxIsComplex(prhs[0]) ) mexErrMsgTxt("1st argument 'R' must be a complex-valued (nR x nT) matrix"); if ( ! mxIsComplex(prhs[1]) ) mexErrMsgTxt("2nd argument 's' must be a complex-valued (nR x nSym) matrix"); if ( mxIsComplex(prhs[2]) ) mexErrMsgTxt("3rd argument 'dist_ZF' must be a real-valued (1 x nSym) matrix"); if ( mxIsComplex(prhs[3]) ) mexErrMsgTxt("4th argument 'symbols_ZF_i' must be a real-valued (nT x nSym) integer matrix"); if ( mxIsComplex(prhs[4]) ) mexErrMsgTxt("5th argument 'M' must be a real-valued (1 x nT) integer matrix"); if ( ! mxIsComplex(prhs[5]) ) mexErrMsgTxt("6th argument 'symbol_alphabet' must be a complex-valued (2^M_max x nT) matrix"); if ( ! mxIsLogical(prhs[6]) ) mexErrMsgTxt("7th argument 'bittable' must be a logical (M x 2^M) matrix"); nR = mxGetM(prhs[0]); // number of receive antennas nT = mxGetN(prhs[0]); // number of transmit antennas nSym = mxGetN(prhs[1]); // Block size (number of transmitted symbol vectors) Sum_M = mxGetM(prhs[6]); // sum of the number of bits of M max_2_M = mxGetN(prhs[6]); // Maximum value of 2^M // fetch input variables R_re = (float *)(mxGetPr(prhs[0])); // fetch pointer to real part of R R_im = (float *)(mxGetPi(prhs[0])); // fetch pointer to imag part of R s_re = (float *)(mxGetPr(prhs[1])); // fetch pointer to real part of s s_im = (float *)(mxGetPi(prhs[1])); // fetch pointer to imag part of s dist_ZF = (float *)(mxGetPr(prhs[2])); // fetch ZF distance symbols_ZF_i = (int *)(mxGetPr(prhs[3])); // fetch pointer to imag part of ZF solution indices M = (int *)(mxGetPr(prhs[4])); // fetch pointer to number of bits vector symbol_alphabet_re = (float *)(mxGetPr(prhs[5])); // fetch pointer to real part of symbol alphabet symbol_alphabet_im = (float *)(mxGetPi(prhs[5])); // fetch pointer to imag part of symbol alphabet bittable = (bool *)(mxGetPr(prhs[6])); // fetch pointer to real part of bit mapping table // allocate memory for output variables total_bits = 0; for(kk=0; kk<nT; kk++) total_bits += M[kk]; size_t total_size = 0; // Allocate Variables on the Device Global Memory of GPU and Load the input variables to the device cl_mem R_re_d; size = nR * nT * sizeof(float); total_size += size; R_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return ; } float* R_re_p; R_re_p = (float *)clEnqueueMapBuffer(commandQueue,R_re_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(R_re_p, R_re, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,R_re_d,(void *)R_re_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed R_re_d\n"); return; } cl_mem s_re_d; size = nT * nSym * sizeof(float); total_size += size; s_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } float *s_re_p; s_re_p = (float *)clEnqueueMapBuffer(commandQueue,s_re_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(s_re_p, s_re, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,s_re_d,(void*)s_re_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed s_re_d\n"); return; } cl_mem s_im_d; size = nT * nSym * sizeof(float); total_size += size; s_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } float *s_im_p; s_im_p = (float *)clEnqueueMapBuffer(commandQueue,s_im_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(s_im_p, s_im, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,s_im_d,(void *)s_im_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() s_im failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed s_im_d\n"); return; } cl_mem R_im_d; size = nR * nT * sizeof(float); total_size += size; R_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return ; } float *R_im_p; R_im_p = (float *)clEnqueueMapBuffer(commandQueue,R_im_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(R_im_p, R_im, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,R_im_d,(void *)R_im_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed R_im with status %d\n", status); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed R_im_d\n"); return; } cl_mem dist_ZF_d; size = 1 * nSym * sizeof(float); dist_ZF_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); total_size += size; if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } float *dist_ZF_p; dist_ZF_p = (float *)clEnqueueMapBuffer(commandQueue,dist_ZF_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(dist_ZF_p, dist_ZF, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,dist_ZF_d,(void *)dist_ZF_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed dist_ZF_d\n"); return; } cl_mem symbols_ZF_index_d; size = nT * nSym * sizeof(int); symbols_ZF_index_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); total_size += size; if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } int *symbols_ZF_index_p; symbols_ZF_index_p = (int*)clEnqueueMapBuffer(commandQueue,symbols_ZF_index_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(symbols_ZF_index_p, symbols_ZF_i, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,symbols_ZF_index_d,(void*)symbols_ZF_index_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed symbols_ZF_index_p\n"); return; } cl_mem M_d; size = nT * 1 * sizeof(int); M_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); total_size += size; if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } int *M_p; M_p = (int*)clEnqueueMapBuffer(commandQueue,M_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(M_p, M, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue, M_d,(void *)M_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed M_d\n"); return; } cl_mem symbol_alphabet_re_d; size = nT * max_2_M * sizeof(float); total_size += size; symbol_alphabet_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } float *symbol_alphabet_re_p; symbol_alphabet_re_p = (float *)clEnqueueMapBuffer(commandQueue,symbol_alphabet_re_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(symbol_alphabet_re_p, symbol_alphabet_re, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,symbol_alphabet_re_d,(void *)symbol_alphabet_re_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed symbol_alphabet_re_d\n"); return; } cl_mem symbol_alphabet_im_d; size = nT * max_2_M * sizeof(float); symbol_alphabet_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); total_size += size; if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } float *symbol_alphabet_im_p; symbol_alphabet_im_p = (float *)clEnqueueMapBuffer(commandQueue,symbol_alphabet_im_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(symbol_alphabet_im_p, symbol_alphabet_im, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,symbol_alphabet_im_d,(void *)symbol_alphabet_im_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed symbol_alphabet_im_d\n"); return; } cl_mem bittable_d; size = Sum_M * max_2_M * sizeof(bool); bittable_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); total_size += size; if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); // return; } bool *bittable_p; bittable_p = (bool*)clEnqueueMapBuffer(commandQueue,bittable_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status); if(status != CL_SUCCESS) { mexPrintf("Error: clEnqueueMapBuffer \n"); return; } memcpy(bittable_p, bittable, size); /* Load the data back on the GPU */ status = clEnqueueUnmapMemObject(commandQueue,bittable_d,(void*)bittable_p,0,NULL,&ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() failed bittable\n"); return; } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("clEnqueueUnmapMemObject() Release failed bittable_d\n"); return; } /* This is the output */ cl_mem LLR_d; size = Sum_M *nSym * sizeof(float); total_size += size; LLR_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument. \n"); return; } //mexPrintf("Total allocated memory is %d\n", total_size); /* Set kernel Arguments */ /*** Set appropriate arguments to the kernel ***/ status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&R_re_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument.\n"); return; } status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&R_im_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&s_re_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&s_im_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&dist_ZF_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&symbols_ZF_index_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&symbol_alphabet_re_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&symbol_alphabet_im_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&bittable_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&LLR_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 10, sizeof(int), (void *)&nSym); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&M_d); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 12, sizeof(int), (void *)&nT); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 13, sizeof(int), (void *)&nR); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } status = clSetKernelArg(kernel, 14, sizeof(int), (void *)&total_bits); if(status != CL_SUCCESS) { mexPrintf("Error: Setting kernel argument\n"); return; } size_t localThreads[2] = {128, 1}; size_t globalThreads[2] = {nSym, 1}; status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(status != CL_SUCCESS) { mexPrintf("Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)\n"); return; } /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { mexPrintf("Error: Waiting for kernel run to finish.(clWaitForEvents)\n"); return; } // output variable with Sphere Decoder solution of single precision type plhs [0] = mxCreateNumericMatrix(total_bits,nSym,mxSINGLE_CLASS,mxREAL); if(plhs[0] == NULL) mexErrMsgTxt("mxCreateNumericMatrix failed(1)\n"); LLR = (float *) mxGetPr(plhs[0]); // fetch pointer for output variable if(LLR == NULL ) mexErrMsgTxt("mxCreateNumericMatrix failed(2)\n"); //Copy the data back from the GPU size = Sum_M *nSym * sizeof(float); status = clEnqueueReadBuffer(commandQueue,LLR_d,CL_TRUE,0,size,LLR,0,0,&ev); if(status != CL_SUCCESS) { mexPrintf("Error in reading LLR buffer Status is: %d size is(bytes) %d\n", status, size); } status = clWaitForEvents(1, &ev); if(status != CL_SUCCESS) { mexPrintf("Error: Waiting for LLR read to finish.(clWaitForEvents) status is \n", status); return; } //free up the host memory //free(R_re_p); //R_re_p = NULL; //free(R_im_p); //R_im_p = NULL; //free(s_re_p); //s_re_p = NULL; //free(s_im_p); //s_im_p = NULL; //free(symbols_ZF_index_p); //symbols_ZF_index_p = NULL; //free(M_p); //M_p = NULL; //free(symbol_alphabet_re_p); //symbol_alphabet_re_p = NULL; //free(symbol_alphabet_im_p); //symbol_alphabet_im_p = NULL; //free(bittable_p); //bittable_p = NULL; // Free device memory status = 0; status = clReleaseKernel(kernel); status += clReleaseProgram(program); status += clReleaseMemObject(R_re_d); status += clReleaseMemObject(R_im_d); status += clReleaseMemObject(s_re_d); status += clReleaseMemObject(s_im_d); status += clReleaseMemObject(dist_ZF_d); status += clReleaseMemObject(symbols_ZF_index_d); status += clReleaseMemObject(M_d); status += clReleaseMemObject(symbol_alphabet_re_d); status += clReleaseMemObject(symbol_alphabet_im_d); status += clReleaseMemObject(bittable_d); status += clReleaseMemObject(LLR_d); status += clReleaseCommandQueue(commandQueue); status += clReleaseContext(context); if(status != 0) mexPrintf("Error in freeing up the memory"); }