JanS

Kernel uses way too many scratch registers, execution fails

Discussion created by JanS on Jul 26, 2010
Latest reply on Sep 15, 2010 by JanS

Hi,

I'm just learning how to use OpenCL and GPGPU. For my bachelor thesis, I am converting an existing cryptographic attack called "AIDA" (or the "CUBE" attack) to GPGPU. Because the attack uses an array of 288 x 64-bit longs plus other state variables, many registers are used. Execution simply fails on my dedicated workstation (Phenom 1055T, 16 GB RAM, Radeon 5970, 6 TB of HD space to store the results): the code just "hangs" after I start the kernel. Currently I'm using Ubuntu 10.04 x64 and Catalyst 10.6 with only one GPU device. The problem persists under Windows 7 x64.

I know that maybe the outer loop

for(int IVLK = 0; IVLK < (1LL<<IV_CNT); ++IVLK)

could be too "large" for a single work-item, but splitting it does not change the scratch register count.

I would be very grateful if you could help me. I need to finish my thesis in the next 3 weeks, and I'm currently stuck on this problem.

 

 

Kernel: #########################################################################################################################################################
/* AIDA ATTACK */

/* Decrement a Trivium register index, wrapping 0 -> 287 (288-entry ring).
   NOTE(review): this helper is never called below -- the round loops repeat
   the same decrement-and-wrap inline for every index. */
int dec(int val) { return --val < 0 ? 287 : val; }
//#define dec(val) if( --val < 0 ) val = 287;

/* Number of IV bit positions the outer cube loop sums over (2^IV_COUNT corners). */
#define IV_COUNT 23

/* AIDA/cube attack on Trivium.
   membuff : global buffer of 64-bit words; each work-item XORs the keystream
             words for clocks TMIN..TMAX-1 into membuff starting at its global id.
   NOTE(review): the cube loop below does not depend on gl_id, so every
   work-item appears to compute the identical 2^IV_COUNT summation -- confirm
   whether the cube was meant to be partitioned across work-items. */
__kernel void aida( __global unsigned long* membuff ) {
    const size_t gl_id = get_global_id(0); // global id = unique id (0 .. membuff.count - 1)
    /* const size_t max_g = get_global_size(0); // global size == membuff.count()
       const size_t work_dim = get_work_dim(); // should be 1-dimensional
       const size_t lid = get_local_id(0); // local id = current thread id (0 .. 63, all executed in parallel)
       const size_t max_l = get_local_size(0); // local size = 64 threads
       const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset
       const size_t max_ng = get_num_groups(0); // membuff.count() / local size */
    const int TMAX = 750; // last recorded Trivium clock (exclusive)
    const int TMIN = 700; // first recorded Trivium clock
    const int IV_CNT = IV_COUNT;
    /* IV bit offsets; IV bit p is stored at lk[93 + p] in the state ring. */
    const int IVPOS[IV_COUNT] = {2,4,6,8,10,13,19,23,31,32,34,39,44,46,53,55,62,69,71,73,74,76,79};
    int i,t;
    /* 288-bit Trivium state, bit-sliced 64 ways (one ulong lane set per state bit).
       NOTE(review): a 288-element private array addressed through the
       runtime-variable q* indices cannot be kept in registers -- presumably
       this is what drives the scratch-register count; verify with the
       Stream KernelAnalyzer / ISA dump. */
    unsigned long lk[288];
    /* Rotating positions into lk[]; qNNN starts at index NNN (q288 wraps to 0). */
    int q243, q286, q287, q288, q066, q069, q091, q092, q093, q162, q171, q175, q176, q177, q264;
    /* Outer cube loop: one iteration per IV corner, 2^IV_CNT in total. */
    for(int IVLK = 0; IVLK < (1LL<<IV_CNT); ++IVLK) {
        /* reset the ring positions for a fresh cipher run */
        q066 = 66; q069 = 69; q091 = 91; q092 = 92; q093 = 93; q162 = 162; q171 = 171; q175 = 175; q176 = 176; q177 = 177; q243 = 243; q264 = 264; q286 = 286; q287 = 287; q288 = 0;
        // set the key bits
        for(i = 1; i <= 285; ++i) { lk[i] = 0; }
        /* load the IV corner: bit i of IVLK selects IV position IVPOS[i] */
        for(i = 0; i < IV_CNT; i++) { if ((IVLK & (1<<(i))) != 0) { lk[93 + IVPOS[i]] = 0xffffffffffffffffLL; } }
        /* Test */
        /*lk[1] = 0xffffffffffffffffLL; lk[5] = 0xffffffffffffffffLL; lk[10] = 0xffffffffffffffffLL; lk[15] = 0xffffffffffffffffLL; lk[20] = 0xffffffffffffffffLL; lk[25] = 0xffffffffffffffffLL; lk[30] = 0xffffffffffffffffLL; lk[35] = 0xffffffffffffffffLL;*/
        lk[10] = 0xffffffffffffffffLL; // fixed key bit under test
        lk[0] = 0xffffffffffffffffLL;  // fixed key bit under test
        /* Trivium initialization: the last three state bits start as 1 */
        lk[286] = 0xffffffffffffffffLL;
        lk[287] = 0xffffffffffffffffLL;
        /* Warm-up clocks 1..TMIN-1: run Trivium without emitting keystream.
           Each clock updates the three nonlinear feedback bits, then rotates
           all fifteen ring positions one step backwards (the state "shifts"
           by moving the indices instead of the data). */
        for(t = 1; t < TMIN; ++t) {
            lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
            lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
            lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);
            /* decrement every ring index, wrapping -1 -> 287 */
            --q066; if(q066 < 0) q066 = 287;
            --q069; if(q069 < 0) q069 = 287;
            --q091; if(q091 < 0) q091 = 287;
            --q092; if(q092 < 0) q092 = 287;
            --q093; if(q093 < 0) q093 = 287;
            --q162; if(q162 < 0) q162 = 287;
            --q171; if(q171 < 0) q171 = 287;
            --q175; if(q175 < 0) q175 = 287;
            --q176; if(q176 < 0) q176 = 287;
            --q177; if(q177 < 0) q177 = 287;
            --q243; if(q243 < 0) q243 = 287;
            --q264; if(q264 < 0) q264 = 287;
            --q286; if(q286 < 0) q286 = 287;
            --q287; if(q287 < 0) q287 = 287;
            --q288; if(q288 < 0) q288 = 287;
        }
        /* Output clocks TMIN..TMAX-1: XOR the keystream word into membuff, then clock.
           NOTE(review): membuff[gl_id + (t - TMIN)] makes work-item g write indices
           g .. g+(TMAX-TMIN-1), which overlap the ranges of neighbouring work-items;
           the ^= read-modify-write then races across work-items. Verify the intended
           layout (e.g. gl_id*(TMAX-TMIN) + (t - TMIN)). */
        for(t = TMIN; t < TMAX; ++t) {
            membuff[gl_id + (t - TMIN)] ^= (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]);
            lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]);
            lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]);
            lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]);
            /* decrement every ring index, wrapping -1 -> 287 */
            --q066; if(q066 < 0) q066 = 287;
            --q069; if(q069 < 0) q069 = 287;
            --q091; if(q091 < 0) q091 = 287;
            --q092; if(q092 < 0) q092 = 287;
            --q093; if(q093 < 0) q093 = 287;
            --q162; if(q162 < 0) q162 = 287;
            --q171; if(q171 < 0) q171 = 287;
            --q175; if(q175 < 0) q175 = 287;
            --q176; if(q176 < 0) q176 = 287;
            --q177; if(q177 < 0) q177 = 287;
            --q243; if(q243 < 0) q243 = 287;
            --q264; if(q264 < 0) q264 = 287;
            --q286; if(q286 < 0) q286 = 287;
            --q287; if(q287 < 0) q287 = 287;
            --q288; if(q288 < 0) q288 = 287;
        }
    }
}
.cpp: #########################################################################################################################################################
#define __CL_ENABLE_EXCEPTIONS // cl.hpp throws cl::Error instead of returning status codes
#define __NO_STD_VECTOR        // cl.hpp: use cl::vector instead of std::vector
#define __NO_STD_STRING        // cl.hpp: use cl::string instead of std::string
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <ctime>
#include <stdlib.h>
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include
<CL/cl.hpp>
#endif
#include "ComputationTest.hpp"

const char* filename = "aida_kernel.cl";  // OpenCL source file loaded at run time
const int TRIVIUM_SIZE = 288;             // NOTE(review): unused in the code shown here
const size_t localsize = 64;              // work-group size
const long globalsize = 1024;             // total number of work-items
const int data_size = 1500*64*320;        // number of cl_ulong result slots (~24.5 MB)
cl_ulong* data = new cl_ulong[data_size]; // host-side result buffer, freed at end of main
cl::vector<cl::Platform> platforms;       // filled by initPlatform()
cl::vector<cl::Device> devices;           // filled by initDevice()
cl::Context context;
cl::Program program_;

/* Host driver: read the kernel source, build it for the GPU, run `aida`
   once over globalsize work-items and print the first 128 result words. */
int main(void) {
    size_t memSize = sizeof(cl_ulong)*(data_size);
    for(int i=0; i < data_size; ++i ) data[i] = 0; // zero the host buffer before upload
    std::ifstream file(filename, std::ifstream::in);
    if(!file.is_open()) {
        std::cerr << "ERROR: Could not open kernel file " << filename << std::endl;
        return EXIT_FAILURE;
    }
    /* slurp the whole kernel source into one string */
    std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>()));
    try {
        cl::vector<cl::Platform>::iterator curPlatform = initPlatform();
        context = initContext(CL_DEVICE_TYPE_GPU, curPlatform);
        cl::vector<cl::Device>::iterator curDevice = initDevice();
        cl::Program::Sources source(1, std::make_pair(prog.c_str(),prog.size()));
        program_ = cl::Program(context, source);
        program_.build(devices); // on failure throws; build log is printed in the catch below
        cl::Kernel kernel(program_, "aida");
        //localsize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(*devices);
        std::cout << "localsize: " << localsize << " globalsize: " << globalsize << std::endl;
        cl::CommandQueue queue(context, *curDevice);
        cl::Buffer membuff( context, CL_MEM_READ_WRITE, memSize );
        // copy the zeroed host buffer into GPU memory (blocking write)
        queue.enqueueWriteBuffer( membuff, CL_TRUE, 0, memSize, data );
        /* bind queue + NDRanges so the kernel can be launched as func(args) */
        cl::KernelFunctor func = kernel.bind( queue, cl::NDRange(globalsize), cl::NDRange(localsize) );
        time_t secondelapsed = time(NULL); // wall-clock start (1 s resolution)
        std::cout << "\nStarting kernel..."
        << std::endl;
        func(membuff).wait(); // enqueue the kernel and block until it finishes
        time_t secondsfinished = time(NULL); // wall-clock end
        // copy the results back from GPU memory into host memory (blocking read)
        queue.enqueueReadBuffer( membuff, CL_TRUE, 0, memSize, data );
        cl_ulong sum = 0; // NOTE(review): the accumulation below is commented out, so "Sum" always prints 0
        for(int i = 0; i < 128; ++i ) {
            /*if(data[i] != 0) sum += data[i];*/
            std::cout << "[" << i << "] " << std::hex << data[i] << std::dec << std::endl;
        }
        std::cout << "Sum: " << sum << std::endl;
        //std::cout << data[128] << std::endl;
        std::cout << "Finished after " << (secondsfinished - secondelapsed) << " seconds." << std::endl;
    } catch (cl::Error err) { // NOTE(review): catches by value; prefer catch (const cl::Error&)
        std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl;
        if(err.err() == -11) { // -11 == CL_BUILD_PROGRAM_FAILURE: dump the compiler log
            std::cout << "Build log:\n" << program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]).c_str() << std::endl;
        }
        delete[] data;
        return EXIT_FAILURE;
    }
    //std::cin.get();
    delete[] data;
    return EXIT_SUCCESS;
}

/* Enumerate all OpenCL platforms into the global `platforms`, print each one,
   and return an iterator to the LAST platform found.
   NOTE(review): the log claims "Using default platform[0]" but `--i` selects
   the last platform -- message and behaviour disagree. The returned iterator
   is uninitialized if no platform exists. */
cl::vector<cl::Platform>::iterator initPlatform() {
    std::cout << "Getting platform information...\n";
    cl::Platform::get(&platforms);
    cl::vector<cl::Platform>::iterator i;
    if (platforms.size() > 0) {
        int platformCount = 0;
        for (i = platforms.begin(); i != platforms.end(); ++i, ++platformCount) {
            std::cout << "Found platform[" << platformCount << "]\n"
                << "\tName: " << (*i).getInfo<CL_PLATFORM_NAME>().c_str() << "\n"
                << "\tVendor: " << (*i).getInfo< CL_PLATFORM_VENDOR>().c_str() << "\n"
                << "\tPlatform Version: " << (*i).getInfo< CL_PLATFORM_VERSION>().c_str() << std::endl;
        }
        --i; // step back from end() to the last platform (workaround)
        std::cout << "Using default platform[0]\n";
    }
    return i;
}

/* Build an OpenCL context of the requested device type on the platform the
   iterator refers to. */
cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms) {
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platforms)(), 0 };
    return cl::Context(type, cps, NULL, NULL, NULL);
}

/* Query the context's devices into the global `devices`, print each one,
   and return an iterator to the last device (continued below). */
cl::vector<cl::Device>::iterator initDevice() {
    devices = context.getInfo<CL_CONTEXT_DEVICES> ();
    cl::vector<cl::Device>::iterator i;
    std::cout << "\nGetting device information...\n";
    if (devices.size() >
0) {
        int deviceCount = 0;
        for (i = devices.begin(); i != devices.end(); ++i, ++deviceCount) {
            std::cout << "Found device[" << deviceCount << "]\n"
                << "\tName: " << (*i).getInfo<CL_DEVICE_NAME>().c_str()
                << "\n\tCompute units: " << (*i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n"
                << "\tGlobal memory size: " << (*i).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1000 << "kb\n"
                << "\tVendor: " << (*i).getInfo< CL_DEVICE_VENDOR>().c_str() << "\n"
                << "\tDevice Version: " << (*i).getInfo< CL_DEVICE_VERSION>().c_str() << std::endl;
        }
        --i; // step back from end() to the last device (workaround)
        std::cout << "Using device[" << (deviceCount - 1) << "]\n";
    }
    /* NOTE(review): uninitialized iterator is returned when no device was found */
    return i;
}
.hpp: #########################################################################################################################################################
#pragma once

/* Convenience aliases pairing a container with an iterator into it.
   NOTE(review): declared but not used in the .cpp shown above. */
typedef std::pair<cl::vector<cl::Platform>, cl::vector<cl::Platform>::iterator> PlatformPair;
typedef std::pair<cl::vector<cl::Device>, cl::vector<cl::Device>::iterator> DevicePair;

/* Enumerate platforms into the global list; returns the chosen platform. */
cl::vector<cl::Platform>::iterator initPlatform();
/* Create a context of `type` on the platform referenced by `platforms`. */
cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms);
/* Query the global context's devices; returns the chosen device. */
cl::vector<cl::Device>::iterator initDevice();

Outcomes