n.perrotta

Matrix calculation & Performances

Discussion created by n.perrotta on Oct 20, 2010
Latest reply on Nov 4, 2010 by n.perrotta
Matrix calculation & Performances

Hi.

I'm quite new to ATI Stream and OpenCL and I'm doing my first tests developing some routines on matrix computation.

I've developed the code (see attachments) to perform huge matrix multiplication but I'm experimenting some problems. In details:

1) for "long running" task (i.e.: more than 3 secs) my desktop freezes and become unresponsive...I cannot do anything except waiting for program completion. I had the same problem working on both Windows7 (a disaster) and Ubuntu10.04 (really better), with 10.9 drivers and ATI Stream 2.2.

2) I cannot select the second graphic card (I've two HD5770 in crossfire) from C++ code but I can do it from JAVA (using the corresponding program written with JOCL).

I think that's enough for the moment.

Please have a look at the attached code that is only a prototype therefore quality is poor. I started from ATI SDK samples template.

Thank you in advance for your help.

Bye,

NIKO.

// C++ Code (OpenCL kernel code is at the end of this program) // ==================== #include "matrix.hpp" #include <stdio.h> #include <stdlib.h> #include <time.h> #include <math.h> void magicFill(float *m, int rows, int cols) { int i; for(i = 0; i < (rows*cols); i++) { m[i] = i+1; //cos(i); } } void printMatrix(float *m, int rows, int cols) { int r,c; for(r = 0; r < rows; r++) { for(c = 0; c < cols; c++) { printf(" %2.2f", m[r*cols+c]); } printf("\n"); } } clock_t getMillisec(void) { return (clock() / (CLOCKS_PER_SEC/1000) ); } int initializeHost(int m0r, int m0c, int m1r, int m1c) { m0rows = m0r; m0cols = m0c; m1rows = m1r; m1cols = m1c; m0size[0] = m0rows; m0size[1] = m0cols; m1size[0] = m1rows; m1size[1] = m1cols; outputRows = m0rows; outputCols = m1cols; m0 = (float *)malloc(sizeof(float) * m0rows * m0cols); m1 = (float *)malloc(sizeof(float) * m1rows * m1cols); output = (float *)malloc(sizeof(float) * m0rows * m1cols); magicFill(m0, m0rows, m0cols); magicFill(m1, m1rows, m1cols); if (fprint) { printf("\n=======================================================\n"); printMatrix(m0, m0rows, m0cols); printf("\n=======================================================\n"); printMatrix(m1, m1rows, m1cols); printf("\n=======================================================\n"); } return 0; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } else { std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n"; exit(1); } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; size_t deviceListSize; cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n"; return 1; } if(numPlatforms > 0) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n"; return 1; } for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n"; return 1; } printf("\nPlatform: %s", pbuff); platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } if(NULL == platform) { std::cout << "NULL platform found so Exiting Application." << std::endl; return 1; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; if (!strcmp(pu,"gpu")) { context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); } else { context = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &status); } if(status != CL_SUCCESS) { std::cout<<"Error: Creating Context. (clCreateContextFromType)\n"; return 1; } status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { std::cout<< "Error: Getting Context Info \ (device list size, clGetContextInfo)\n"; return 1; } int numDevices = (int)deviceListSize / sizeof(cl_device_id); printf("\nNumber of devices: %d\n", numDevices); devices = (cl_device_id *)malloc(deviceListSize); if(devices == 0) { std::cout<<"Error: No devices found.\n"; return 1; } status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { std::cout<< "Error: Getting Context Info \ (device list, clGetContextInfo)\n"; return 1; } commandQueue = clCreateCommandQueue(context, devices[0], 0, &status); if(status != CL_SUCCESS) { std::cout<<"Creating Command Queue. (clCreateCommandQueue)\n"; return 1; } m0buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float)*m0rows*m0cols, m0, &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (inputBuffer)\n"; return 1; } m1buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float)*m1rows*m1cols, m1, &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (inputBuffer)\n"; return 1; } outputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float)*outputRows*outputCols, output, &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (outputBuffer)\n"; return 1; } const char * filename = "matrix_kernel.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { std::cout<< "Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"; return 1; } status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Building Program (clBuildProgram)\n"; return 1; } kernel = clCreateKernel(program, "mat_mult", &status); if(status != CL_SUCCESS) { std::cout<<"Error: Creating Kernel from program. (clCreateKernel)\n"; return 1; } return 0; } int runCLKernels(void) { cl_int status; cl_uint maxDims; cl_event events[2]; size_t globalThreads[2]; size_t localThreads[2]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); printf("\nMaxWorkGroupSize: %d", maxWorkGroupSize); if(status != CL_SUCCESS) { std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n"; return 1; } status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDims, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n"; return 1; } status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims, (void*)maxWorkItemSizes, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n"; return 1; } globalThreads[0] = outputRows; globalThreads[1] = outputCols; localThreads[0] = 8; localThreads[1] = 8; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&m0buffer); status = clSetKernelArg(kernel, 1, sizeof(cl_int2), (void *)&m0size); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&m1buffer); status = clSetKernelArg(kernel, 3, sizeof(cl_int2), (void *)&m1size); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&outputBuffer); if(status != CL_SUCCESS) { std::cout<< "Error: Setting kernel argument. (multiplier)\n"; return 1; } status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Enqueueing kernel onto command queue. \ (clEnqueueNDRangeKernel)\n"; return 1; } status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Waiting for kernel run to finish. \ (clWaitForEvents)\n"; return 1; } status = clReleaseEvent(events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Release event object. \ (clReleaseEvent)\n"; return 1; } status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_float)*m0rows*m1cols, output, 0, NULL, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueReadBuffer failed. \ (clEnqueueReadBuffer)\n"; return 1; } status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout<< "Error: Waiting for read buffer call to finish. \ (clWaitForEvents)\n"; return 1; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { std::cout<< "Error: Release event object. \ (clReleaseEvent)\n"; return 1; } printf("\noutput(0,0): %2.2f", output[0]); printf("\noutput(outputRows,outputCols): %2.2f\n", output[outputRows*outputCols-1]); return 0; } /* * \brief Release OpenCL resources (Context, Memory etc.) */ int cleanupCL(void) { cl_int status; status = clReleaseKernel(kernel); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseKernel \n"; return 1; } status = clReleaseProgram(program); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseProgram\n"; return 1; } status = clReleaseMemObject(m0buffer); status = clReleaseMemObject(m1buffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (inputBuffer)\n"; return 1; } status = clReleaseMemObject(outputBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (outputBuffer)\n"; return 1; } status = clReleaseCommandQueue(commandQueue); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseCommandQueue\n"; return 1; } status = clReleaseContext(context); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseContext\n"; return 1; } return 0; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(m0 != NULL) { free(m0); m0 = NULL; } if(m1 != NULL) { free(m1); m1 = NULL; } if(output != NULL) { free(output); output = NULL; } if(devices != NULL) { free(devices); devices = NULL; } } void print1DArray(const std::string arrayName, const unsigned int * arrayData, const unsigned int length) { cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData[i] << " "; } std::cout << std::endl; } int main(int argc, char * argv[]) { long start; long end; long elapsed; if(initializeHost(atoi(argv[1]), atoi(argv[2]), atoi(argv[3]), atoi(argv[4]))==1) return 1; int ftemp = atoi(argv[5]); if (ftemp == 0) { fprint = false; } else { fprint = true; } pu = argv[6]; start = getMillisec(); if(initializeCL()==1) return 1; end = getMillisec(); printf("\nInitialization elapsed time: %d", (end-start)); start = getMillisec(); if(runCLKernels()==1) return 1; end = getMillisec(); printf("\nExecution elapsed time: %d", (end-start)); if (fprint) { printf("\n=======================================================\n"); printMatrix(output, outputRows, outputCols); printf("\n======================================================="); } //print1DArray(std::string("Output"), output, width); start = getMillisec(); if(cleanupCL()==1) return 1; cleanupHost(); end = getMillisec(); printf("\nCleanup elapsed time: %d\n", (end-start)); return 0; } // kernel code //============== __kernel void mat_mult(__global const float *m0, const int2 m0size, __global const float *m1, const int2 m1size, __global float *res) { int row = get_global_id(0); int col = get_global_id(1); float row_total = 0; for(int k = 0; k < m0size.y; k++) { row_total = row_total + m0[row*m0size.y + k]*m1[k*m1size.y + col]; } res[row*m1size.y + col] = row_total; }

Outcomes