4 Replies Latest reply on Sep 15, 2010 11:52 AM by JanS

    Kernel uses way too many scratch registers, execution fails

    JanS

      Hi,

      i'm just learning to how to use OpenCL and GPGPU. In my bachelor thesis, i convert an existing cryptoattack called "AIDA" or the "CUBE" attack to using GPGPU. Because the attack uses an array of 288 x 64-bit long and other state variables, many registers are used. Execution just fails on my dedicated workstation ( Phenom 1055T, 16GB Ram, Radeon 5970, 6 TB HD space to store the results ), the code just "hangs" after i start the kernel. Currently im using Ubuntu 10.04 x64 & Catalyst 10.6 and only one GPU device. The problem persists under Windows 7 x64.

      I know that maybe the outer loop

      for(int IVLK = 0; IVLK < (1LL<<IV_CNT); ++IVLK)

      could be too "large" for one work-item, but it doesn't change the Scratch register count.

      I would be very grateful if you could help me. Need to finish my thesis in the next 3 weeks and currently im stuck at this problem.

       

       

      Kernel: ######################################################################################################################################################### /* AIDA ATTACK */ int dec(int val) { return --val < 0 ? 287 : val; } //#define dec(val) if( --val < 0 ) val = 287; #define IV_COUNT 23 __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) /* const size_t max_g = get_global_size(0); // global size == membuff.count() const size_t work_dim = get_work_dim(); // should be 1-dimensional const size_t lid = get_local_id(0); // local id = current thread id(0 till 63, which are all parallel executed!) const size_t max_l = get_local_size(0); // local size = 64 threads const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset const size_t max_ng = get_num_groups(0); // membuff.count() / local size */ const int TMAX = 750; const int TMIN = 700; const int IV_CNT = IV_COUNT; const int IVPOS[IV_COUNT] = {2,4,6,8,10,13,19,23,31,32,34,39,44,46,53,55,62,69,71,73,74,76,79}; int i,t; unsigned long lk[288]; int q243, q286, q287, q288, q066, q069, q091, q092, q093, q162, q171, q175, q176, q177, q264; for(int IVLK = 0; IVLK < (1LL<<IV_CNT); ++IVLK) { q066 = 66; q069 = 69; q091 = 91; q092 = 92; q093 = 93; q162 = 162; q171 = 171; q175 = 175; q176 = 176; q177 = 177; q243 = 243; q264 = 264; q286 = 286; q287 = 287; q288 = 0; // Keybits setzen for(i = 1; i <= 285; ++i) { lk[i] = 0; } for(i = 0; i < IV_CNT; i++) { if ((IVLK & (1<<(i))) != 0) { lk[93 + IVPOS[i]] = 0xffffffffffffffffLL; } } /* Test */ /*lk[1] = 0xffffffffffffffffLL; lk[5] = 0xffffffffffffffffLL; lk[10] = 0xffffffffffffffffLL; lk[15] = 0xffffffffffffffffLL; lk[20] = 0xffffffffffffffffLL; lk[25] = 0xffffffffffffffffLL; lk[30] = 0xffffffffffffffffLL; lk[35] = 0xffffffffffffffffLL;*/ lk[10] = 0xffffffffffffffffLL; lk[0] = 0xffffffffffffffffLL; lk[286] = 0xffffffffffffffffLL; lk[287] = 0xffffffffffffffffLL; for(t = 1; t < TMIN; ++t) { lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]); lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]); lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]); --q066; if(q066 < 0) q066 = 287; --q069; if(q069 < 0) q069 = 287; --q091; if(q091 < 0) q091 = 287; --q092; if(q092 < 0) q092 = 287; --q093; if(q093 < 0) q093 = 287; --q162; if(q162 < 0) q162 = 287; --q171; if(q171 < 0) q171 = 287; --q175; if(q175 < 0) q175 = 287; --q176; if(q176 < 0) q176 = 287; --q177; if(q177 < 0) q177 = 287; --q243; if(q243 < 0) q243 = 287; --q264; if(q264 < 0) q264 = 287; --q286; if(q286 < 0) q286 = 287; --q287; if(q287 < 0) q287 = 287; --q288; if(q288 < 0) q288 = 287; } for(t = TMIN; t < TMAX; ++t) { membuff[gl_id + (t - TMIN)] ^= (lk[q243] ^ lk[q288] ^ lk[q066] ^ lk[q093] ^ lk[q162] ^ lk[q177]); lk[q288] = lk[q243] ^ lk[q288] ^ lk[q069] ^ (lk[q286] & lk[q287]); lk[q093] = lk[q066] ^ lk[q093] ^ lk[q171] ^ (lk[q091] & lk[q092]); lk[q177] = lk[q162] ^ lk[q177] ^ lk[q264] ^ (lk[q175] & lk[q176]); --q066; if(q066 < 0) q066 = 287; --q069; if(q069 < 0) q069 = 287; --q091; if(q091 < 0) q091 = 287; --q092; if(q092 < 0) q092 = 287; --q093; if(q093 < 0) q093 = 287; --q162; if(q162 < 0) q162 = 287; --q171; if(q171 < 0) q171 = 287; --q175; if(q175 < 0) q175 = 287; --q176; if(q176 < 0) q176 = 287; --q177; if(q177 < 0) q177 = 287; --q243; if(q243 < 0) q243 = 287; --q264; if(q264 < 0) q264 = 287; --q286; if(q286 < 0) q286 = 287; --q287; if(q287 < 0) q287 = 287; --q288; if(q288 < 0) q288 = 287; } } } .cpp: ######################################################################################################################################################### #define __CL_ENABLE_EXCEPTIONS #define __NO_STD_VECTOR #define __NO_STD_STRING #include <cstdio> #include <cstdlib> #include <iostream> #include <fstream> #include <ctime> #include <stdlib.h> #if defined(__APPLE__) || defined(__MACOSX) #include <OpenCL/cl.hpp> #else #include <CL/cl.hpp> #endif #include "ComputationTest.hpp" const char* filename = "aida_kernel.cl"; const int TRIVIUM_SIZE = 288; const size_t localsize = 64; const long globalsize = 1024; const int data_size = 1500*64*320; cl_ulong* data = new cl_ulong[data_size]; cl::vector<cl::Platform> platforms; cl::vector<cl::Device> devices; cl::Context context; cl::Program program_; int main(void) { size_t memSize = sizeof(cl_ulong)*(data_size); for(int i=0; i < data_size; ++i ) data[i] = 0; std::ifstream file(filename, std::ifstream::in); if(!file.is_open()) { std::cerr << "ERROR: Could not open kernel file " << filename << std::endl; return EXIT_FAILURE; } std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>())); try { cl::vector<cl::Platform>::iterator curPlatform = initPlatform(); context = initContext(CL_DEVICE_TYPE_GPU, curPlatform); cl::vector<cl::Device>::iterator curDevice = initDevice(); cl::Program::Sources source(1, std::make_pair(prog.c_str(),prog.size())); program_ = cl::Program(context, source); program_.build(devices); cl::Kernel kernel(program_, "aida"); //localsize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(*devices); std::cout << "localsize: " << localsize << " globalsize: " << globalsize << std::endl; cl::CommandQueue queue(context, *curDevice); cl::Buffer membuff( context, CL_MEM_READ_WRITE, memSize ); // Daten vom Hauptspeicher in den GPU Speicher schreiben queue.enqueueWriteBuffer( membuff, CL_TRUE, 0, memSize, data ); cl::KernelFunctor func = kernel.bind( queue, cl::NDRange(globalsize), cl::NDRange(localsize) ); time_t secondelapsed = time(NULL); std::cout << "\nStarting kernel..." << std::endl; func(membuff).wait(); time_t secondsfinished = time(NULL); // Daten vom GPU Speicher in den Hauptspeicher zurückschreiben queue.enqueueReadBuffer( membuff, CL_TRUE, 0, memSize, data ); cl_ulong sum = 0; for(int i = 0; i < 128; ++i ) { /*if(data[i] != 0) sum += data[i];*/ std::cout << "[" << i << "] " << std::hex << data[i] << std::dec << std::endl; } std::cout << "Sum: " << sum << std::endl; //std::cout << data[128] << std::endl; std::cout << "Finished after " << (secondsfinished - secondelapsed) << " seconds." << std::endl; } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; if(err.err() == -11) { std::cout << "Build log:\n" << program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]).c_str() << std::endl; } delete[] data; return EXIT_FAILURE; } //std::cin.get(); delete[] data; return EXIT_SUCCESS; } cl::vector<cl::Platform>::iterator initPlatform() { std::cout << "Getting platform information...\n"; cl::Platform::get(&platforms); cl::vector<cl::Platform>::iterator i; if (platforms.size() > 0) { int platformCount = 0; for (i = platforms.begin(); i != platforms.end(); ++i, ++platformCount) { std::cout << "Found platform[" << platformCount << "]\n" << "\tName: " << (*i).getInfo<CL_PLATFORM_NAME>().c_str() << "\n" << "\tVendor: " << (*i).getInfo< CL_PLATFORM_VENDOR>().c_str() << "\n" << "\tPlatform Version: " << (*i).getInfo< CL_PLATFORM_VERSION>().c_str() << std::endl; } --i; // get last known(workaround) std::cout << "Using default platform[0]\n"; } return i; } cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms) { cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platforms)(), 0 }; return cl::Context(type, cps, NULL, NULL, NULL); } cl::vector<cl::Device>::iterator initDevice() { devices = context.getInfo<CL_CONTEXT_DEVICES> (); cl::vector<cl::Device>::iterator i; std::cout << "\nGetting device information...\n"; if (devices.size() > 0) { int deviceCount = 0; for (i = devices.begin(); i != devices.end(); ++i, ++deviceCount) { std::cout << "Found device[" << deviceCount << "]\n" << "\tName: " << (*i).getInfo<CL_DEVICE_NAME>().c_str() << "\n\tCompute units: " << (*i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n" << "\tGlobal memory size: " << (*i).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1000 << "kb\n" << "\tVendor: " << (*i).getInfo< CL_DEVICE_VENDOR>().c_str() << "\n" << "\tDevice Version: " << (*i).getInfo< CL_DEVICE_VERSION>().c_str() << std::endl; } --i; // get last known(workaround) std::cout << "Using device[" << (deviceCount - 1) << "]\n"; } return i; } .hpp: ######################################################################################################################################################### #pragma once typedef std::pair<cl::vector<cl::Platform>, cl::vector<cl::Platform>::iterator> PlatformPair; typedef std::pair<cl::vector<cl::Device>, cl::vector<cl::Device>::iterator> DevicePair; cl::vector<cl::Platform>::iterator initPlatform(); cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms); cl::vector<cl::Device>::iterator initDevice();

        • Kernel uses way too many scratch registers, execution fails
          MicahVillmow
          JanS,
          Instead of using a private array, use an array in the local address space. This is an order of magnitude faster and should not cause TDR's/hangs on the system for running so long. Using private arrays are 'bad' on a GPU. You can turn them into local arrays be prefixing __local on the array.
            • Kernel uses way too many scratch registers, execution fails
              JanS

              This sounds good. But if all work-items share the same array(like they should according to the opencl spec), i need to increase its size and calculate an offset, right?

                • Kernel uses way too many scratch registers, execution fails
                  Jawed

                  Since the array lk[] is so huge, you can only fit 14 instances of it into local memory per SIMD core on the GPU.

                  One approach is to have a single instance of lk[] shared by all work items in a work group. Make the work group size just 64 and you will get 8 instances of lk[] into local memory per SIMD core. (Since up to 8 work groups can share a SIMD core at one time).

                  This would allow you to code the algorithm without messing about with offsets. Simply test that the global work item ID is a multiple of 64. If it is, then do the computation. If not, then do nothing.

                  It means that performance is 1/64th what you might hope for.

                  But off-chip memory is ~1/200th or worse.

                  An advantage of using a work group size of 64 is that on your graphics card the work group barrier instructions between writing to lk[] and reading from lk[] take zero execution time.