JanS

performance issues & alternative kernel w/o function use crashes

Discussion created by JanS on May 18, 2010
Latest reply on May 19, 2010 by JanS

hi,

I am currently writing my bachelor thesis about porting AIDA (also known as the cube attack, http://en.wikipedia.org/wiki/Cube_attack) to OpenCL. To get an initial preview of the performance gain, I've implemented a short performance test, which on a 4870 is not much faster than my 3 GHz Intel quad-core (5 s vs. 13 s). I've read about automatic function inlining; to verify this, I commented out my function calls and used alternative code without functions — which crashes my PC as soon as I run it. Is this a bug?

PS: Please ask me before using parts of the code. It should be no problem at all, but I would like to know about it.

 

 

###################################################################### cpp: ###################################################################### #define __CL_ENABLE_EXCEPTIONS #define __NO_STD_VECTOR #define __NO_STD_STRING #include <cstdio> #include <cstdlib> #include <iostream> #include <fstream> #include <ctime> #if defined(__APPLE__) || defined(__MACOSX) #include <OpenCL/cl.hpp> #else #include <CL/cl.hpp> #endif #include "ComputationTest.hpp" const int TRIVIUM_SIZE = 288; const size_t localsize = 64; const long count = TRIVIUM_SIZE * 64 * 32; cl_ulong* data = new cl_ulong[count]; const char* filename = "aida_kernel.cl"; cl::vector<cl::Platform> platforms; cl::vector<cl::Device> devices; cl::Context context; cl::Program program_; int main(void) { size_t memSize = sizeof(cl_ulong)*count; for(int i=0; i < count; ++i ) data[i] = i; std::ifstream file(filename, std::ifstream::in); if(!file.is_open()) { std::cerr << "ERROR: Could not open kernel file " << filename << std::endl; return EXIT_FAILURE; } std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>())); try { cl::vector<cl::Platform>::iterator curPlatform = initPlatform(); context = initContext(CL_DEVICE_TYPE_GPU, curPlatform); cl::vector<cl::Device>::iterator curDevice = initDevice(); cl::Program::Sources source(1, std::make_pair(prog.c_str(),prog.size())); program_ = cl::Program(context, source); program_.build(devices); cl::Kernel kernel(program_, "aida"); //localsize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(*devices); std::cout << "localsize: " << localsize << " count: " << count << std::endl; cl::CommandQueue queue(context, *curDevice); cl::Buffer membuff( context, CL_MEM_READ_WRITE, memSize ); // Daten vom Hauptspeicher in den GPU Speicher schreiben queue.enqueueWriteBuffer( membuff, CL_TRUE, 0, 20, data ); cl::KernelFunctor func = kernel.bind( queue, cl::NDRange(count), cl::NDRange(localsize) ); time_t secondelapsed = time(NULL); std::cout << "\nStarting 
kernel..." << std::endl; func(membuff).wait(); time_t secondsfinished = time(NULL); // Daten vom GPU Speicher in den Hauptspeicher zurückschreiben queue.enqueueReadBuffer( membuff, CL_TRUE, 0, 20, data ); /* unsigned int correct = 0; for( unsigned int i = 0; i < count; ++i ) { std::cout << data[i] << std::endl; // if( data[i] == i ) ++correct; } std::cout << "Computed " << correct << "/" << count; std::cout << " correct values." << std::endl;*/ //std::cout << data[128] << std::endl; std::cout << "Finished after " << (secondsfinished - secondelapsed) << " seconds." << std::endl; } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; if(err.err() == -11) { std::cout << "Build log:\n" << program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]).c_str() << std::endl; } return EXIT_FAILURE; } std::cin.get(); return EXIT_SUCCESS; } cl::vector<cl::Platform>::iterator initPlatform() { std::cout << "Getting platform information...\n"; cl::Platform::get(&platforms); cl::vector<cl::Platform>::iterator i; if (platforms.size() > 0) { int platformCount = 0; for (i = platforms.begin(); i != platforms.end(); ++i, ++platformCount) { std::cout << "Found platform[" << platformCount << "]\n" << "\tName: " << (*i).getInfo<CL_PLATFORM_NAME>().c_str() << "\n" << "\tVendor: " << (*i).getInfo< CL_PLATFORM_VENDOR>().c_str() << "\n" << "\tPlatform Version: " << (*i).getInfo< CL_PLATFORM_VERSION>().c_str() << std::endl; } --i; // get last known(workaround) std::cout << "Using default platform[0]\n"; } return i; } cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms) { cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platforms)(), 0 }; return cl::Context(type, cps, NULL, NULL, NULL); } cl::vector<cl::Device>::iterator initDevice() { devices = context.getInfo<CL_CONTEXT_DEVICES> (); cl::vector<cl::Device>::iterator i; std::cout << "\nGetting device information...\n"; if 
(devices.size() > 0) { int deviceCount = 0; for (i = devices.begin(); i != devices.end(); ++i, ++deviceCount) { std::cout << "Found device[" << deviceCount << "]\n" << "\tName: " << (*i).getInfo<CL_DEVICE_NAME>().c_str() << "\n\tCompute units: " << (*i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n" << "\tGlobal memory size: " << (*i).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1000 << "kb\n" << "\tVendor: " << (*i).getInfo< CL_DEVICE_VENDOR>().c_str() << "\n" << "\tDevice Version: " << (*i).getInfo< CL_DEVICE_VERSION>().c_str() << std::endl; } --i; // get last known(workaround) std::cout << "Using default device[0]\n"; } return i; } ###################################################################### hpp: ###################################################################### #pragma once cl::vector<cl::Platform>::iterator initPlatform(); cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms); cl::vector<cl::Device>::iterator initDevice(); ###################################################################### working kernel: ###################################################################### /* AIDA ATTACK */ int dec(int val) { return --val < 0 ? 287 : val; } __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) /* const size_t max_g = get_global_size(0); // global size == membuff.count() const size_t work_dim = get_work_dim(); // should be 1-dimensional const size_t lid = get_local_id(0); // local id = current thread id(0 till 63, which are all parallel executed!) 
const size_t max_l = get_local_size(0); // local size = 64 threads const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset const size_t max_ng = get_num_groups(0); // membuff.count() / local size */ int q243 = 243, q286 = 287, q287 = 287, q288 = 0, q066 = 66, q069 = 69, q091 = 91, q092 = 92, q093 = 93, q162 = 162, q171 = 171, q175 = 175, q176 = 176, q177 = 177, q264 = 264; const int TMAX = 120000; const int TMIN = 119999; unsigned long lk[288]; for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { membuff[gl_id] = 1 & (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]); } lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]); lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]); lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]); dec(q243); dec(q286); dec(q287); dec(q288); dec(q066); dec(q069); dec(q091); dec(q092); dec(q093); dec(q162); dec(q264); dec(q171); dec(q175); dec(q176); dec(q177); } } ###################################################################### crashing kernel w/o function use ###################################################################### /* AIDA ATTACK */ int dec(int val) { return val < 0 ? 287 : --val; } __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) /* const size_t max_g = get_global_size(0); // global size == membuff.count() const size_t work_dim = get_work_dim(); // should be 1-dimensional const size_t lid = get_local_id(0); // local id = current thread id(0 till 63, which are all parallel executed!) 
const size_t max_l = get_local_size(0); // local size = 64 threads const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset const size_t max_ng = get_num_groups(0); // membuff.count() / local size */ int q243 = 243, q286 = 287, q287 = 287, q288 = 0, q066 = 66, q069 = 69, q091 = 91, q092 = 92, q093 = 93, q162 = 162, q171 = 171, q175 = 175, q176 = 176, q177 = 177, q264 = 264; const int TMAX = 120000; const int TMIN = 119999; unsigned long lk[288]; for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { membuff[gl_id] = 1 & (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]); } lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]); lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]); lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]); --q243; if(q243 < 0) q243 = 287; --q286; if(q286 < 0) q286 = 287; --q287; if(q287 < 0) q287 = 287; --q288; if(q288 < 0) q288 = 287; --q066; if(q066 < 0) q066 = 287; --q069; if(q069 < 0) q069 = 287; --q091; if(q091 < 0) q091 = 287; --q092; if(q092 < 0) q092 = 287; --q093; if(q093 < 0) q093 = 287; --q162; if(q162 < 0) q162 = 287; --q264; if(q264 < 0) q264 = 287; --q171; if(q171 < 0) q171 = 287; --q175; if(q175 < 0) q175 = 287; --q176; if(q176 < 0) q176 = 287; --q177; if(q177 < 0) q177 = 287; } }

Outcomes