9 Replies Latest reply on Nov 4, 2010 7:42 PM by n.perrotta

    Matrix calculation & Performances

    n.perrotta
      Matrix calculation & Performances

      Hi.

      I'm quite new to ATI Stream and OpenCL and I'm doing my first tests developing some routines on matrix computation.

      I've developed the code (see attachments) to perform huge matrix multiplication but I'm experimenting some problems. In details:

      1) for "long running" task (i.e.: more than 3 secs) my desktop freezes and become unresponsive...I cannot do anything except waiting for program completion. I had the same problem working on both Windows7 (a disaster) and Ubuntu10.04 (really better), with 10.9 drivers and ATI Stream 2.2.

      2) I cannot select the second graphic card (I've two HD5770 in crossfire) from C++ code but I can do it from JAVA (using the corresponding program written with JOCL).

      I think that's enough for the moment.

      Please have a look at the attached code that is only a prototype therefore quality is poor. I started from ATI SDK samples template.

      Thank you in advance for your help.

      Bye,

      NIKO.

      // C++ Code (OpenCL kernel code is at the end of this program) // ==================== #include "matrix.hpp" #include <stdio.h> #include <stdlib.h> #include <time.h> #include <math.h> void magicFill(float *m, int rows, int cols) { int i; for(i = 0; i < (rows*cols); i++) { m[i] = i+1; //cos(i); } } void printMatrix(float *m, int rows, int cols) { int r,c; for(r = 0; r < rows; r++) { for(c = 0; c < cols; c++) { printf(" %2.2f", m[r*cols+c]); } printf("\n"); } } clock_t getMillisec(void) { return (clock() / (CLOCKS_PER_SEC/1000) ); } int initializeHost(int m0r, int m0c, int m1r, int m1c) { m0rows = m0r; m0cols = m0c; m1rows = m1r; m1cols = m1c; m0size[0] = m0rows; m0size[1] = m0cols; m1size[0] = m1rows; m1size[1] = m1cols; outputRows = m0rows; outputCols = m1cols; m0 = (float *)malloc(sizeof(float) * m0rows * m0cols); m1 = (float *)malloc(sizeof(float) * m1rows * m1cols); output = (float *)malloc(sizeof(float) * m0rows * m1cols); magicFill(m0, m0rows, m0cols); magicFill(m1, m1rows, m1cols); if (fprint) { printf("\n=======================================================\n"); printMatrix(m0, m0rows, m0cols); printf("\n=======================================================\n"); printMatrix(m1, m1rows, m1cols); printf("\n=======================================================\n"); } return 0; } /* * Converts the contents of a file into a string */ std::string convertToString(const char *filename) { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } else { std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n"; exit(1); } return NULL; } /* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */ int initializeCL(void) { cl_int status = 0; size_t deviceListSize; cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n"; return 1; } if(numPlatforms > 0) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n"; return 1; } for(unsigned int i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n"; return 1; } printf("\nPlatform: %s", pbuff); platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } if(NULL == platform) { std::cout << "NULL platform found so Exiting Application." << std::endl; return 1; } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; if (!strcmp(pu,"gpu")) { context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); } else { context = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &status); } if(status != CL_SUCCESS) { std::cout<<"Error: Creating Context. (clCreateContextFromType)\n"; return 1; } status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { std::cout<< "Error: Getting Context Info \ (device list size, clGetContextInfo)\n"; return 1; } int numDevices = (int)deviceListSize / sizeof(cl_device_id); printf("\nNumber of devices: %d\n", numDevices); devices = (cl_device_id *)malloc(deviceListSize); if(devices == 0) { std::cout<<"Error: No devices found.\n"; return 1; } status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { std::cout<< "Error: Getting Context Info \ (device list, clGetContextInfo)\n"; return 1; } commandQueue = clCreateCommandQueue(context, devices[0], 0, &status); if(status != CL_SUCCESS) { std::cout<<"Creating Command Queue. (clCreateCommandQueue)\n"; return 1; } m0buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float)*m0rows*m0cols, m0, &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (inputBuffer)\n"; return 1; } m1buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float)*m1rows*m1cols, m1, &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (inputBuffer)\n"; return 1; } outputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float)*outputRows*outputCols, output, &status); if(status != CL_SUCCESS) { std::cout<<"Error: clCreateBuffer (outputBuffer)\n"; return 1; } const char * filename = "matrix_kernel.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { std::cout<< "Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"; return 1; } status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Building Program (clBuildProgram)\n"; return 1; } kernel = clCreateKernel(program, "mat_mult", &status); if(status != CL_SUCCESS) { std::cout<<"Error: Creating Kernel from program. (clCreateKernel)\n"; return 1; } return 0; } int runCLKernels(void) { cl_int status; cl_uint maxDims; cl_event events[2]; size_t globalThreads[2]; size_t localThreads[2]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); printf("\nMaxWorkGroupSize: %d", maxWorkGroupSize); if(status != CL_SUCCESS) { std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n"; return 1; } status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDims, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n"; return 1; } status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims, (void*)maxWorkItemSizes, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: Getting Device Info. (clGetDeviceInfo)\n"; return 1; } globalThreads[0] = outputRows; globalThreads[1] = outputCols; localThreads[0] = 8; localThreads[1] = 8; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&m0buffer); status = clSetKernelArg(kernel, 1, sizeof(cl_int2), (void *)&m0size); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&m1buffer); status = clSetKernelArg(kernel, 3, sizeof(cl_int2), (void *)&m1size); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&outputBuffer); if(status != CL_SUCCESS) { std::cout<< "Error: Setting kernel argument. (multiplier)\n"; return 1; } status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Enqueueing kernel onto command queue. \ (clEnqueueNDRangeKernel)\n"; return 1; } status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Waiting for kernel run to finish. \ (clWaitForEvents)\n"; return 1; } status = clReleaseEvent(events[0]); if(status != CL_SUCCESS) { std::cout<< "Error: Release event object. \ (clReleaseEvent)\n"; return 1; } status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_float)*m0rows*m1cols, output, 0, NULL, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueReadBuffer failed. \ (clEnqueueReadBuffer)\n"; return 1; } status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout<< "Error: Waiting for read buffer call to finish. \ (clWaitForEvents)\n"; return 1; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { std::cout<< "Error: Release event object. \ (clReleaseEvent)\n"; return 1; } printf("\noutput(0,0): %2.2f", output[0]); printf("\noutput(outputRows,outputCols): %2.2f\n", output[outputRows*outputCols-1]); return 0; } /* * \brief Release OpenCL resources (Context, Memory etc.) */ int cleanupCL(void) { cl_int status; status = clReleaseKernel(kernel); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseKernel \n"; return 1; } status = clReleaseProgram(program); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseProgram\n"; return 1; } status = clReleaseMemObject(m0buffer); status = clReleaseMemObject(m1buffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (inputBuffer)\n"; return 1; } status = clReleaseMemObject(outputBuffer); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseMemObject (outputBuffer)\n"; return 1; } status = clReleaseCommandQueue(commandQueue); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseCommandQueue\n"; return 1; } status = clReleaseContext(context); if(status != CL_SUCCESS) { std::cout<<"Error: In clReleaseContext\n"; return 1; } return 0; } /* * \brief Releases program's resources */ void cleanupHost(void) { if(m0 != NULL) { free(m0); m0 = NULL; } if(m1 != NULL) { free(m1); m1 = NULL; } if(output != NULL) { free(output); output = NULL; } if(devices != NULL) { free(devices); devices = NULL; } } void print1DArray(const std::string arrayName, const unsigned int * arrayData, const unsigned int length) { cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData[i] << " "; } std::cout << std::endl; } int main(int argc, char * argv[]) { long start; long end; long elapsed; if(initializeHost(atoi(argv[1]), atoi(argv[2]), atoi(argv[3]), atoi(argv[4]))==1) return 1; int ftemp = atoi(argv[5]); if (ftemp == 0) { fprint = false; } else { fprint = true; } pu = argv[6]; start = getMillisec(); if(initializeCL()==1) return 1; end = getMillisec(); printf("\nInitialization elapsed time: %d", (end-start)); start = getMillisec(); if(runCLKernels()==1) return 1; end = getMillisec(); printf("\nExecution elapsed time: %d", (end-start)); if (fprint) { printf("\n=======================================================\n"); printMatrix(output, outputRows, outputCols); printf("\n======================================================="); } //print1DArray(std::string("Output"), output, width); start = getMillisec(); if(cleanupCL()==1) return 1; cleanupHost(); end = getMillisec(); printf("\nCleanup elapsed time: %d\n", (end-start)); return 0; } // kernel code //============== __kernel void mat_mult(__global const float *m0, const int2 m0size, __global const float *m1, const int2 m1size, __global float *res) { int row = get_global_id(0); int col = get_global_id(1); float row_total = 0; for(int k = 0; k < m0size.y; k++) { row_total = row_total + m0[row*m0size.y + k]*m1[k*m1size.y + col]; } res[row*m1size.y + col] = row_total; }

        • Matrix calculation & Performances
          nou

          GPU currently can't do two thing at the same time. so if it compute some calculation task it can't redraw your screen.

          on linux you must export DISPLAY=:0 to gain access to both cards.

            • Matrix calculation & Performances
              n.perrotta

              Hi.

              Thank you very much for your answers.

              About the first issue.....ok, I'll avoid to stress myself in case of freeze.

              About the second issue (export DISPLAY), are you sayng that I've to set the environment variable DISPLAY (export DISPLAY=:0) before launching my program ? In my code (as you can see in my previous email) everything is perfect if I select device[0] but as soon as I select device[1] I get an error on comman queue ?

              Any idea ?

               

              Thank you again.

               

              Bye,

              NIKO.

                • Matrix calculation & Performances
                  nou

                  ah sorry try disable crossfire. there is issue with it.

                  yes export DISPLAY is setting enviroment variable. it is linux thing.

                    • Matrix calculation & Performances
                      n.perrotta

                      Hi.

                      Thank you for the quick reply.

                      Unfortunately disabling crossfire and launching CLInfo I cannot see the second device HD5770 anymore.

                      Now re-enabling the crossfire and setting the DISPLAY I got an error from clEnqueueNDRangeKernel command....I moved one step forward. Using CLInfo I see 3 devices: 1 CPU and 2 HD5770.

                      One detail: the second HD5770 has no display connected.

                      Using the java binding JOCL I can select the device[1] instead of device[0] but from CPP I got the error above.

                      I do not have any other idea.

                      ?????

                       

                      Thank you in advance for your help.

                       

                      Bye,

                      NIKO.

                        • Matrix calculation & Performances
                          nou

                          do you have second card configured in xorg.conf? if not run aticonfig --initial --adapter=all

                            • Matrix calculation & Performances
                              n.perrotta

                              Hi.

                              I've done the following:

                              - disabled crossfire via software using ATI Catalyst Control Center (but I didn't remove the physical connection between the two graphic cards)

                              - reboot

                              - run aticonfig --initial --adapter=all (I can see two devices into xorg.conf)

                              - reboot

                              - set DISPLAY env variable by means of "export DISPLAY=:0"

                              - run the program selecting device[1]

                              - ..... getting the usual error when I try to enqueue the kernel into the command queue.

                               

                              Any other idea ? ;-))))

                               

                              I'm looking forward to experimenting some new options. It must work...I hope.

                               

                              Bye,

                              NIKO.

                              • Matrix calculation & Performances
                                n.perrotta

                                Hi.

                                Now I've disabled the crossfire also removing the ribbon between the 2 HD5770.

                                I've run:

                                - aticonfig --initial --adapter=all

                                - reboot

                                - export DISPLAY=:0

                                - run the program selecting device[1]

                                and I got always the same problem.

                                I've no more ideas....I'll go on NVIDIA, less powerful but working (already successfully tested) without Xorg.

                                I've to start the project !

                                It's a pity....so much computation power and so many limitations.

                                Thank you again for your support.

                                Bye,

                                NIKO.

                        • Matrix calculation & Performances
                          rosalita456

                          I am also agree with nou. He is saying right.