3 Replies Latest reply on May 19, 2010 1:53 PM by JanS

    performance issues & alternative kernel w/o function use crashes

    JanS

      hi,

      I am currently writing my bachelor thesis about porting AIDA (also known as the cube attack, http://en.wikipedia.org/wiki/Cube_attack) to OpenCL. To get an initial preview of the performance gain, I've implemented a short performance test, which on a 4870 is not much faster than my 3 GHz Intel quad-core (5 s vs. 13 s). I've read about automatic function inlining; to verify this, I commented out my function calls and used alternative code without functions — which crashes my PC as soon as I run it. Is this a bug?

      PS: please ask me before using parts of the code. should be no problem at all, but i want to know it

       

       

      ###################################################################### cpp: ######################################################################
/*
 * Host-side OpenCL performance test for the AIDA (cube) attack on Trivium.
 * Forum code dump — reformatted for readability only; no tokens changed.
 * Loads "aida_kernel.cl", builds it, runs kernel "aida" over `count`
 * work-items, and times the run with time(NULL) (1-second resolution).
 */
#define __CL_ENABLE_EXCEPTIONS
#define __NO_STD_VECTOR
#define __NO_STD_STRING
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <ctime>
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
#include "ComputationTest.hpp"
// Trivium's internal state is 288 bits.
const int TRIVIUM_SIZE = 288;
const size_t localsize = 64;                       // work-group size
const long count = TRIVIUM_SIZE * 64 * 32;         // total work-items (= 589824)
// NOTE(review): raw owning `new[]` with no matching delete[] — leaks at exit.
cl_ulong* data = new cl_ulong[count];
const char* filename = "aida_kernel.cl";
cl::vector<cl::Platform> platforms;
cl::vector<cl::Device> devices;
cl::Context context;
cl::Program program_;
int main(void)
{
    size_t memSize = sizeof(cl_ulong)*count;
    for(int i=0; i < count; ++i ) data[i] = i;
    std::ifstream file(filename, std::ifstream::in);
    if(!file.is_open())
    {
        std::cerr << "ERROR: Could not open kernel file " << filename << std::endl;
        return EXIT_FAILURE;
    }
    // Extra parentheses around the second iterator avoid the most-vexing-parse.
    std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
    try
    {
        cl::vector<cl::Platform>::iterator curPlatform = initPlatform();
        context = initContext(CL_DEVICE_TYPE_GPU, curPlatform);
        cl::vector<cl::Device>::iterator curDevice = initDevice();
        cl::Program::Sources source(1, std::make_pair(prog.c_str(),prog.size()));
        program_ = cl::Program(context, source);
        program_.build(devices);
        cl::Kernel kernel(program_, "aida");
        //localsize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(*devices);
        std::cout << "localsize: " << localsize << " count: " << count << std::endl;
        cl::CommandQueue queue(context, *curDevice);
        cl::Buffer membuff( context, CL_MEM_READ_WRITE, memSize );
        // Write data from host memory into GPU memory.
        // NOTE(review): only 20 bytes of the memSize-byte buffer are copied —
        // looks like a leftover debug size; presumably memSize was intended.
        queue.enqueueWriteBuffer( membuff, CL_TRUE, 0, 20, data );
        cl::KernelFunctor func = kernel.bind( queue, cl::NDRange(count), cl::NDRange(localsize) );
        time_t secondelapsed = time(NULL);
        std::cout << "\nStarting kernel..." << std::endl;
        func(membuff).wait();
        time_t secondsfinished = time(NULL);
        // Copy results back from GPU memory into host memory.
        // NOTE(review): same 20-byte size as the write above — confirm.
        queue.enqueueReadBuffer( membuff, CL_TRUE, 0, 20, data );
        /* unsigned int correct = 0; for( unsigned int i = 0; i < count; ++i ) { std::cout << data[i] << std::endl; // if( data[i] == i ) ++correct; } std::cout << "Computed " << correct << "/" << count; std::cout << " correct values." << std::endl;*/
        //std::cout << data[128] << std::endl;
        std::cout << "Finished after " << (secondsfinished - secondelapsed) << " seconds." << std::endl;
    }
    // NOTE(review): catching by value; `catch (const cl::Error& err)` is preferred.
    catch (cl::Error err)
    {
        std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl;
        // -11 == CL_BUILD_PROGRAM_FAILURE: dump the compiler's build log.
        if(err.err() == -11)
        {
            std::cout << "Build log:\n" << program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]).c_str() << std::endl;
        }
        return EXIT_FAILURE;
    }
    std::cin.get();
    return EXIT_SUCCESS;
}
// Enumerates and prints all OpenCL platforms; fills the global `platforms`.
// NOTE(review): despite printing "Using default platform[0]", the `--i` below
// makes this return an iterator to the LAST platform, not the first.
cl::vector<cl::Platform>::iterator initPlatform()
{
    std::cout << "Getting platform information...\n";
    cl::Platform::get(&platforms);
    cl::vector<cl::Platform>::iterator i;
    if (platforms.size() > 0)
    {
        int platformCount = 0;
        for (i = platforms.begin(); i != platforms.end(); ++i, ++platformCount)
        {
            std::cout << "Found platform[" << platformCount << "]\n"
                << "\tName: " << (*i).getInfo<CL_PLATFORM_NAME>().c_str() << "\n"
                << "\tVendor: " << (*i).getInfo< CL_PLATFORM_VENDOR>().c_str() << "\n"
                << "\tPlatform Version: " << (*i).getInfo< CL_PLATFORM_VERSION>().c_str() << std::endl;
        }
        --i; // get last known(workaround)
        std::cout << "Using default platform[0]\n";
    }
    // NOTE(review): if no platform was found, `i` is returned uninitialized.
    return i;
}
// Builds a context of the given device type on the platform the iterator
// points at. NOTE(review): the parameter `platforms` shadows the global
// of the same name; it is a single-platform iterator, not the collection.
cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms)
{
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platforms)(), 0 };
    return cl::Context(type, cps, NULL, NULL, NULL);
}
// Enumerates the context's devices into the global `devices` and prints them.
// Same pattern as initPlatform(): returns the LAST device despite the
// "Using default device[0]" message, and returns uninitialized if empty.
cl::vector<cl::Device>::iterator initDevice()
{
    devices = context.getInfo<CL_CONTEXT_DEVICES> ();
    cl::vector<cl::Device>::iterator i;
    std::cout << "\nGetting device information...\n";
    if (devices.size() > 0)
    {
        int deviceCount = 0;
        for (i = devices.begin(); i != devices.end(); ++i, ++deviceCount)
        {
            std::cout << "Found device[" << deviceCount << "]\n"
                << "\tName: " << (*i).getInfo<CL_DEVICE_NAME>().c_str()
                << "\n\tCompute units: " << (*i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n"
                // NOTE(review): divides bytes by 1000 but labels it "kb";
                // 1024 is the usual convention.
                << "\tGlobal memory size: " << (*i).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1000 << "kb\n"
                << "\tVendor: " << (*i).getInfo< CL_DEVICE_VENDOR>().c_str() << "\n"
                << "\tDevice Version: " << (*i).getInfo< CL_DEVICE_VERSION>().c_str() << std::endl;
        }
        --i; // get last known(workaround)
        std::cout << "Using default device[0]\n";
    }
    return i;
}
###################################################################### hpp: ######################################################################
// ComputationTest.hpp — declarations for the helpers defined in the .cpp above.
#pragma once
cl::vector<cl::Platform>::iterator initPlatform();
cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms);
cl::vector<cl::Device>::iterator initDevice();
###################################################################### working kernel: ######################################################################
/* AIDA ATTACK */
// Decrement an index, wrapping below 0 back to 287 (circular 288-slot buffer).
// NOTE(review): takes `val` by value and returns the new index — but every
// call site below discards the return value, so the qXXX indices never
// actually change. Presumably `q243 = dec(q243);` etc. was intended.
int dec(int val) { return --val < 0 ? 287 : val; }
__kernel void aida( __global unsigned long* membuff )
{
    const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1)
    /* const size_t max_g = get_global_size(0); // global size == membuff.count()
    const size_t work_dim = get_work_dim(); // should be 1-dimensional
    const size_t lid = get_local_id(0); // local id = current thread id(0 till 63, which are all parallel executed!)
    const size_t max_l = get_local_size(0); // local size = 64 threads
    const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset
    const size_t max_ng = get_num_groups(0); // membuff.count() / local size */
    // Tap positions into the 288-bit Trivium state (named after their start index).
    int q243 = 243, q286 = 287, q287 = 287, q288 = 0, q066 = 66, q069 = 69,
        q091 = 91, q092 = 92, q093 = 93, q162 = 162, q171 = 171, q175 = 175,
        q176 = 176, q177 = 177, q264 = 264;
    const int TMAX = 120000;
    const int TMIN = 119999;
    // NOTE(review): lk[] is read below before ever being written — the whole
    // computation runs on indeterminate values.
    unsigned long lk[288];
    for(int t = 1; t < TMAX; ++t)
    {
        // Emit one keystream bit once the warm-up rounds are done.
        if(t >= TMIN)
        {
            membuff[gl_id] = 1 & (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]);
        }
        // Trivium state-update feedback for the three shift registers.
        lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
        lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
        lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);
        // NOTE(review): return values discarded — see dec() comment above.
        dec(q243); dec(q286); dec(q287); dec(q288); dec(q066); dec(q069);
        dec(q091); dec(q092); dec(q093); dec(q162); dec(q264); dec(q171);
        dec(q175); dec(q176); dec(q177);
    }
}
###################################################################### crashing kernel w/o function use ######################################################################
/* AIDA ATTACK */
// Same kernel with the dec() calls manually inlined as decrement+wrap pairs.
// NOTE(review): this dec() is now unused, and its logic subtly differs from
// the working kernel's (it tests BEFORE decrementing, so it wraps at a
// different point) — it also differs from the inlined code below.
int dec(int val) { return val < 0 ? 287 : --val; }
__kernel void aida( __global unsigned long* membuff )
{
    const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1)
    /* const size_t max_g = get_global_size(0); // global size == membuff.count()
    const size_t work_dim = get_work_dim(); // should be 1-dimensional
    const size_t lid = get_local_id(0); // local id = current thread id(0 till 63, which are all parallel executed!)
    const size_t max_l = get_local_size(0); // local size = 64 threads
    const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset
    const size_t max_ng = get_num_groups(0); // membuff.count() / local size */
    int q243 = 243, q286 = 287, q287 = 287, q288 = 0, q066 = 66, q069 = 69,
        q091 = 91, q092 = 92, q093 = 93, q162 = 162, q171 = 171, q175 = 175,
        q176 = 176, q177 = 177, q264 = 264;
    const int TMAX = 120000;
    const int TMIN = 119999;
    // NOTE(review): still read before first write, as in the working kernel.
    unsigned long lk[288];
    for(int t = 1; t < TMAX; ++t)
    {
        if(t >= TMIN)
        {
            membuff[gl_id] = 1 & (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]);
        }
        lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
        lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
        lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);
        // Inlined decrement-with-wrap for all fifteen indices. Unlike the
        // working kernel's discarded dec() calls, these DO mutate the indices,
        // so this version's behavior differs, not just its code shape.
        --q243; if(q243 < 0) q243 = 287;
        --q286; if(q286 < 0) q286 = 287;
        --q287; if(q287 < 0) q287 = 287;
        --q288; if(q288 < 0) q288 = 287;
        --q066; if(q066 < 0) q066 = 287;
        --q069; if(q069 < 0) q069 = 287;
        --q091; if(q091 < 0) q091 = 287;
        --q092; if(q092 < 0) q092 = 287;
        --q093; if(q093 < 0) q093 = 287;
        --q162; if(q162 < 0) q162 = 287;
        --q264; if(q264 < 0) q264 = 287;
        --q171; if(q171 < 0) q171 = 287;
        --q175; if(q175 < 0) q175 = 287;
        --q176; if(q176 < 0) q176 = 287;
        --q177; if(q177 < 0) q177 = 287;
    }
}

          • performance issues & alternative kernel w/o function use crashes
            ryta1203

            I don't think the inlining should be a problem, though the ISA generated from SKA is quite different for the two versions.

            The non-inlined version produces a crazy amount of scratch-memory instructions for me in SKA. When you say "crash", what do you mean? Do you have VPU recover turned off? What is your watchdog timer set to? Is the program crashing, or the OS?

            Also, you'd probably see a much better gain from a 58xx since it has local memory.

              • performance issues & alternative kernel w/o function use crashes
                JanS

                my screen flashed to white or blue and I needed a hard reboot on Win7 x64. VPU recover should be on by default; I haven't changed it since the CCC installation.

                we apparently found a workaround to evade branches:

                increase the size of the local array lk from 288 to 288 + TMAX, so we trade memory for less branches.

                Don't know if this causes any memory-alignment performance issues — is this a good tradeoff? See the attached code.

                 

                old code snippet, given that TMIN = 1199 and TMAX = 1200
#################################################################################
// Original form: fifteen wrapping indices into a circular 288-slot state.
int dec(int val) { return --val < 0 ? 287 : val; }
unsigned long lk[288];
for(int t = 1; t < TMAX; ++t)
{
    if(t >= TMIN)
    {
        membuff[gl_id] = 1 & (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]);
    }
    lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
    lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
    lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);
    // NOTE(review): dec() return values are discarded here, so the qXXX
    // indices never actually change — presumably q243 = dec(q243); etc.
    dec(q243); dec(q286); dec(q287); dec(q288); dec(q066); dec(q069);
    dec(q091); dec(q092); dec(q093); dec(q162); dec(q264); dec(q171);
    dec(q175); dec(q176); dec(q177);
}
#####################################################################
// Branch-free rewrite: the array is enlarged by TMAX slots so indices can
// grow with t instead of wrapping — trading memory for fewer branches.
new code snippet:
unsigned long lk[288+1200]; // 1200 = TMAX, but use it due C
for(int t = 1; t < TMAX; ++t)
{
    if(t >= TMIN)
    {
        membuff[gl_id] = 1 & (lk[t+243] ^ lk[t+288] ^ lk[t+66] ^ lk[t+93] ^ lk[t+162] ^ lk[t+177]);
    }
    // NOTE(review): these three statements are bare expressions whose results
    // are discarded — the `lk[...] = ` left-hand sides present in the old
    // snippet appear to have been lost. As written, nothing is updated;
    // verify against the attached code.
    lk[t+243] ^ lk[t+288] ^ lk[t+69] ^ (lk[t+286] & lk[t+287]);
    lk[t+66] ^ lk[t+93] ^ lk[t+171] ^ (lk[t+91] & lk[t+92]);
    lk[t+162] ^ lk[t+177] ^ lk[t+264] ^ (lk[t+175] & lk[t+176]);
}