4 Replies Latest reply on May 19, 2010 6:48 PM by JanS

    segment fault / memory error

    JanS

      hi,

      the following code crashes my program and i dont know why.

      if i use local arrays (of size ~ 1400) and write to an external buffer -> CRASH

      if i use local arrays and dont write to an external buffer -> NO CRASH

      if i write to an external buffer and dont use local arrays -> NO CRASH

      i write the global id to the externel buffer and just 5 to the local arrays, so they shouldnt depend on each other.

      edit: this only happens if i use CL_DEVICE_TYPE_CPU, GPU just works fine... ?

       

       

       

      works, most code outcommented __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) /* const int TMAX = 1200; const int TMIN = 1199; unsigned long lk[288+1200]; unsigned long summe = 0; for (int count2 = 0; count2 < 256; count2++) { for(int t = 0; t < 288; ++t) { lk[t] = count2; } for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { //summe = 1 & (lk[t+243] ^ lk[t+288] ^ lk[t+66] ^ lk[t+93] ^ lk[t+162] ^ lk[t+177]); } lk[287] = 5;//lk[t+243] ^ lk[t+288] ^ lk[t+69] ^ (lk[t+286] & lk[t+287]); lk[93] = 5;//lk[t+66] ^ lk[t+93] ^ lk[t+171] ^ (lk[t+91] & lk[t+92]); lk[177] = 5;//lk[t+162] ^ lk[t+177] ^ lk[t+264] ^ (lk[t+175] & lk[t+176]); } //summe ^= lk[1400]; } // count2 */ membuff[gl_id] = gl_id; } works, no crashes.... __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) const int TMAX = 1200; const int TMIN = 1199; unsigned long lk[288+1200]; unsigned long summe = 0; for (int count2 = 0; count2 < 256; count2++) { for(int t = 0; t < 288; ++t) { lk[t] = count2; } for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { //summe = 1 & (lk[t+243] ^ lk[t+288] ^ lk[t+66] ^ lk[t+93] ^ lk[t+162] ^ lk[t+177]); } //lk[287] = 5;//lk[t+243] ^ lk[t+288] ^ lk[t+69] ^ (lk[t+286] & lk[t+287]); //lk[93] = 5;//lk[t+66] ^ lk[t+93] ^ lk[t+171] ^ (lk[t+91] & lk[t+92]); //lk[177] = 5;//lk[t+162] ^ lk[t+177] ^ lk[t+264] ^ (lk[t+175] & lk[t+176]); } //summe ^= lk[1400]; } // count2*/ membuff[gl_id] = gl_id; } same as above, but lk[287] = 5; lk[93] = 5; lk[177] = 5; are not commented out. CRASHES. __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) const int TMAX = 1200; const int TMIN = 1199; unsigned long lk[288+1200]; unsigned long summe = 0; for (int count2 = 0; count2 < 256; count2++) { for(int t = 0; t < 288; ++t) { lk[t] = count2; } for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { //summe = 1 & (lk[t+243] ^ lk[t+288] ^ lk[t+66] ^ lk[t+93] ^ lk[t+162] ^ lk[t+177]); } lk[287] = 5;//lk[t+243] ^ lk[t+288] ^ lk[t+69] ^ (lk[t+286] & lk[t+287]); lk[93] = 5;//lk[t+66] ^ lk[t+93] ^ lk[t+171] ^ (lk[t+91] & lk[t+92]); lk[177] = 5;//lk[t+162] ^ lk[t+177] ^ lk[t+264] ^ (lk[t+175] & lk[t+176]); } //summe ^= lk[1400]; } // count2*/ membuff[gl_id] = gl_id; } and last, but not least: membuff[gl_id] = gl_id; is commented out, otherwise same as above: CRASHES. __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) const int TMAX = 1200; const int TMIN = 1199; unsigned long lk[288+1200]; unsigned long summe = 0; for (int count2 = 0; count2 < 256; count2++) { for(int t = 0; t < 288; ++t) { lk[t] = count2; } for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { //summe = 1 & (lk[t+243] ^ lk[t+288] ^ lk[t+66] ^ lk[t+93] ^ lk[t+162] ^ lk[t+177]); } lk[287] = 5;//lk[t+243] ^ lk[t+288] ^ lk[t+69] ^ (lk[t+286] & lk[t+287]); lk[93] = 5;//lk[t+66] ^ lk[t+93] ^ lk[t+171] ^ (lk[t+91] & lk[t+92]); lk[177] = 5;//lk[t+162] ^ lk[t+177] ^ lk[t+264] ^ (lk[t+175] & lk[t+176]); } //summe ^= lk[1400]; } // count2*/ //membuff[gl_id] = gl_id; }

        • segment fault / memory error
          omkaranathan

          I tried compiling all of them with SKA and there were no issues.

          What is your system config? OS/GPU/SDK Version/Driver?

            • segment fault / memory error
              JanS

              tested with ubuntu 10.04 32bit / 4650 / sdk 2.1 / 10.4

              segmention faults appear to be random.

              if i use local arrays and dont write to an external buffer -> 20/20 runs without a segmention fault

              if i write to an external buffer and dont use local arrays -> 20/20 runs without a segmention fault

              if i use local arrays (of size ~ 1400) and write to an external buffer -> 15/20 runs exit with an segmention fault

              if i use local arrays and dont write to an external buffer -> 15/20 runs exit with an segmention fault

              i really dont know what to do... tomorrow i coult post a complete source code, but i dont think that this has to do something with this specific problem.

              edit: wait a second, i'll give it a try on my workstation @ home

               

                • segment fault / memory error
                  omkaranathan

                   

                   tomorrow i coult post a complete source code, but i dont think that this has to do something with this specific problem


                  That will help to reproduce and identify the problem easily.

                    • segment fault / memory error
                      JanS

                      ok, got a VPU recover here on win7 x64:

                      globalsize = 102400 works

                      globalsize = 1024000 VPU recover

                      with CPU:

                      globalsize = 102400 application crash, even with globalsize 1024

                      globalsize = 1024000 application crash

                       

                      .cpp ###################################################################### #define __CL_ENABLE_EXCEPTIONS #define __NO_STD_VECTOR #define __NO_STD_STRING #include <cstdio> #include <cstdlib> #include <iostream> #include <fstream> #include <ctime> #if defined(__APPLE__) || defined(__MACOSX) #include <OpenCL/cl.hpp> #else #include <CL/cl.hpp> #endif #include "ComputationTest.hpp" const int TRIVIUM_SIZE = 288; const size_t localsize = 64; const long globalsize = 102400; // works, 1024000 = crash const int data_size = 1500*64; cl_ulong* data = new cl_ulong[data_size]; const char* filename = "aida_kernel.cl"; cl::vector<cl::Platform> platforms; cl::vector<cl::Device> devices; cl::Context context; cl::Program program_; int main(void) { size_t memSize = sizeof(cl_ulong)*(data_size); for(int i=0; i < data_size; ++i ) data[i] = 0; std::ifstream file(filename, std::ifstream::in); if(!file.is_open()) { std::cerr << "ERROR: Could not open kernel file " << filename << std::endl; return EXIT_FAILURE; } std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>())); try { cl::vector<cl::Platform>::iterator curPlatform = initPlatform(); context = initContext(CL_DEVICE_TYPE_CPU, curPlatform); cl::vector<cl::Device>::iterator curDevice = initDevice(); cl::Program::Sources source(1, std::make_pair(prog.c_str(),prog.size())); program_ = cl::Program(context, source); program_.build(devices); cl::Kernel kernel(program_, "aida"); //localsize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(*devices); std::cout << "localsize: " << localsize << " globalsize: " << globalsize << std::endl; cl::CommandQueue queue(context, *curDevice); cl::Buffer membuff( context, CL_MEM_READ_WRITE, memSize ); // Daten vom Hauptspeicher in den GPU Speicher schreiben queue.enqueueWriteBuffer( membuff, CL_TRUE, 0, memSize, data ); cl::KernelFunctor func = kernel.bind( queue, cl::NDRange(globalsize), cl::NDRange(localsize) ); time_t secondelapsed = time(NULL); std::cout << "\nStarting kernel..." << std::endl; func(membuff).wait(); time_t secondsfinished = time(NULL); // Daten vom GPU Speicher in den Hauptspeicher zurückschreiben queue.enqueueReadBuffer( membuff, CL_TRUE, 0, memSize, data ); for(int i = 0; i < 10; ++i ) { std::cout << data[i] << std::endl; } //std::cout << data[128] << std::endl; std::cout << "Finished after " << (secondsfinished - secondelapsed) << " seconds." << std::endl; } catch (cl::Error err) { std::cerr << "ERROR: " << err.what() << "(" << err.err() << ")" << std::endl; if(err.err() == -11) { std::cout << "Build log:\n" << program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]).c_str() << std::endl; } delete[] data; return EXIT_FAILURE; } std::cin.get(); delete[] data; return EXIT_SUCCESS; } cl::vector<cl::Platform>::iterator initPlatform() { std::cout << "Getting platform information...\n"; cl::Platform::get(&platforms); cl::vector<cl::Platform>::iterator i; if (platforms.size() > 0) { int platformCount = 0; for (i = platforms.begin(); i != platforms.end(); ++i, ++platformCount) { std::cout << "Found platform[" << platformCount << "]\n" << "\tName: " << (*i).getInfo<CL_PLATFORM_NAME>().c_str() << "\n" << "\tVendor: " << (*i).getInfo< CL_PLATFORM_VENDOR>().c_str() << "\n" << "\tPlatform Version: " << (*i).getInfo< CL_PLATFORM_VERSION>().c_str() << std::endl; } --i; // get last known(workaround) std::cout << "Using default platform[0]\n"; } return i; } cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms) { cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platforms)(), 0 }; return cl::Context(type, cps, NULL, NULL, NULL); } cl::vector<cl::Device>::iterator initDevice() { devices = context.getInfo<CL_CONTEXT_DEVICES> (); cl::vector<cl::Device>::iterator i; std::cout << "\nGetting device information...\n"; if (devices.size() > 0) { int deviceCount = 0; for (i = devices.begin(); i != devices.end(); ++i, ++deviceCount) { std::cout << "Found device[" << deviceCount << "]\n" << "\tName: " << (*i).getInfo<CL_DEVICE_NAME>().c_str() << "\n\tCompute units: " << (*i).getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n" << "\tGlobal memory size: " << (*i).getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1000 << "kb\n" << "\tVendor: " << (*i).getInfo< CL_DEVICE_VENDOR>().c_str() << "\n" << "\tDevice Version: " << (*i).getInfo< CL_DEVICE_VERSION>().c_str() << std::endl; } --i; // get last known(workaround) std::cout << "Using default device[0]\n"; } return i; } .hpp ###################################################################### #pragma once cl::vector<cl::Platform>::iterator initPlatform(); cl::Context initContext(cl_device_type type, cl::vector<cl::Platform>::iterator& platforms); cl::vector<cl::Device>::iterator initDevice(); kernel ###################################################################### __kernel void aida( __global unsigned long* membuff ) { const size_t gl_id = get_global_id(0); // global id = unique id(0 till membuff.count - 1) /* const size_t max_g = get_global_size(0); // global size == membuff.count() const size_t work_dim = get_work_dim(); // should be 1-dimensional const size_t lid = get_local_id(0); // local id = current thread id(0 till 63, which are all parallel executed!) const size_t max_l = get_local_size(0); // local size = 64 threads const size_t gr_id = get_group_id(0); // current group id, needed to calculate offset const size_t max_ng = get_num_groups(0); // membuff.count() / local size */ const int TMAX = 1200; const int TMIN = 1199; unsigned long lk[288+1200]; unsigned long summe = 0; for (int count2 = 0; count2 < 256; count2++) { for(int t = 0; t < 288; ++t) { lk[t] = count2; } for(int t = 1; t < TMAX; ++t) { if(t >= TMIN) { summe = 1 & (lk[t+243] ^ lk[t+288] ^ lk[t+66] ^ lk[t+93] ^ lk[t+162] ^ lk[t+177]); } lk[t+243] ^ lk[t+288] ^ lk[t+69] ^ (lk[t+286] & lk[t+287]); lk[t+66] ^ lk[t+93] ^ lk[t+171] ^ (lk[t+91] & lk[t+92]); lk[t+162] ^ lk[t+177] ^ lk[t+264] ^ (lk[t+175] & lk[t+176]); } summe ^= lk[1400]; } // count2*/ membuff[gl_id] ^= summe; }