8 Replies Latest reply on Jan 30, 2013 1:33 PM by dmyablonski

    calling clAmdFftEnqueueTransform a second time crashes on CPU

    cipoint

      If I try to run clAmdFftEnqueueTransform a second time I get a segmentation fault when using the CPU (Intel i7). There is no problem doing so when using a HD6770. This problem only occurs for fftSize=1e4 or bigger. For smaller FFT sizes both devices (GPU and CPU) run just fine with multiple FFTs in series (i.e. in a for-loop).

       

      I can't figure out the problem.

       

      #include <complex>
      #include <iostream>
      #include <algorithm>
      #include <time.h>
      #include <CL/opencl.h>
      #include <clAmdFft.h>

       

      using namespace std;
      typedef complex<float> cfloat;

       

      int main(void) {
        cl_platform_id *cl_platformIDs = NULL;     // IDs of OpenCL platforms 
        cl_uint cl_platformsN = 0;                 // Platform count
        cl_uint cl_deviceCount = 0;                // Device count         
        cl_device_id *cl_devices = NULL;           // Device IDs
        cl_context cl_dev_context;                 // Context 
        cl_command_queue cl_queue;                 // Queue
        clAmdFftSetupData fftSetupData;            // FFT setup data 
        clAmdFftPlanHandle fftPlan;                // FFT plan 
        clAmdFftDim fftDim = CLFFT_1D;             // FFT dimension 
        size_t fftSize[1];                         // FFT size
        fftSize[0] = 1e4;
        cl_mem d_data;                             // Device level data 
        cfloat *h_src;                    // Host level input data 
        cfloat *h_res;                    // Host level output data
        time_t start, end;
       
        // Allocate host memory 
        h_src = (cfloat*)malloc(fftSize[0]*sizeof(cfloat));
        h_res = (cfloat*)malloc(fftSize[0]*sizeof(cfloat));
       
        // Get FFT version 
        clAmdFftInitSetupData(&fftSetupData); 
        cout << "Using clAmdFft " << fftSetupData.major << "." << fftSetupData.minor << "." << fftSetupData.patch << endl; 
       
        // Get available platforms 
        clGetPlatformIDs(0, NULL, &cl_platformsN); 
        cl_platformIDs = (cl_platform_id*) malloc( cl_platformsN * sizeof(cl_platform_id)); 
        clGetPlatformIDs(cl_platformsN, cl_platformIDs, NULL);
       
        // Select platform
        cl_uint i = 0;
       
        // Get number of available devices for this platform 
        clGetDeviceIDs(cl_platformIDs[i], CL_DEVICE_TYPE_ALL, NULL, NULL, &cl_deviceCount); 
       
        // Get available device IDs for this platform 
        cl_devices = (cl_device_id*) malloc( cl_deviceCount * sizeof(cl_device_id));  
        clGetDeviceIDs(cl_platformIDs[i], CL_DEVICE_TYPE_ALL, cl_deviceCount, cl_devices, NULL); 
       
        // Print platform name 
        char platform_name[1024]; 
        clGetPlatformInfo(cl_platformIDs[i], CL_PLATFORM_NAME, 1024, &platform_name, NULL); 
        cout << "Compute using OpenCl platfrom " << i << " [" << platform_name << "]" << endl; 
       
        // Select device 
        cl_uint j = 1;

        // Print device name and type 
        cl_device_type device_type; 
        char device_name[1024]; 
        clGetDeviceInfo(cl_devices[j], CL_DEVICE_NAME, 1024, &device_name, NULL); 
        clGetDeviceInfo( cl_devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); 
        cout << "Using OpenCl device " << j << " [" << device_name << "]" << endl;

       

        // Create source data
        cout << "input data:" << endl;
        for(int i = 0; i < min(int (fftSize[0]), 10); i++) {
          h_src[i] = cfloat (sin(i), 0);
          h_res[i] = cfloat (0, 0);
          cout << h_src[i] << endl;
        }
       
        // Create OpenCL context 
        cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cl_platformIDs[i], 0};
        cl_dev_context = clCreateContext(cps, cl_deviceCount, cl_devices, NULL, NULL, NULL); 
       
        // Create command queue 
        cl_queue = clCreateCommandQueue(cl_dev_context, cl_devices[j], CL_QUEUE_PROFILING_ENABLE, NULL);

       

        // Create device buffer 
        d_data = clCreateBuffer(cl_dev_context, CL_MEM_READ_WRITE, fftSize[0]*sizeof(cfloat), NULL, NULL);

       

        // Setup FFT 
        clAmdFftSetup(&fftSetupData);

       

        // Create FFT plan
        clAmdFftCreateDefaultPlan(&fftPlan, cl_dev_context, fftDim, fftSize);

       

        // Set plan precision
        clAmdFftSetPlanPrecision(fftPlan, CLFFT_SINGLE);
        time(&start);

       

          // Copy data from host to device
          clEnqueueWriteBuffer(cl_queue, d_data, CL_TRUE, 0, fftSize[0]*sizeof(cfloat), h_src, 0, NULL, NULL); 
          clFinish(cl_queue);
         
          // Execute FFT
          clAmdFftEnqueueTransform(fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL);
          clFinish(cl_queue);
         
          // Copy result from device to host 
          clEnqueueReadBuffer(cl_queue, d_data, CL_TRUE, 0, fftSize[0]*sizeof(cfloat), h_res, 0, NULL, NULL);
          clFinish(cl_queue);
         
          // Copy data from host to device
          clEnqueueWriteBuffer(cl_queue, d_data, CL_TRUE, 0, fftSize[0]*sizeof(cfloat), h_src, 0, NULL, NULL); 
          clFinish(cl_queue);
         
          // Execute FFT
          clAmdFftEnqueueTransform(fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL);
          clFinish(cl_queue);

        time(&end);
        cout << "total exec time for 100 ffts: " << end - start << "s" << endl;
       
        // Print result
        cout << "output data:" << endl;
        for(int i = 0; i < min(int (fftSize[0]), 10); i++) {
          cout << h_res[i] << endl;
        }
       
        // Free FFT plan 
        clAmdFftDestroyPlan(&fftPlan); 
       
        // Free FFT 
        clAmdFftTeardown(); 
       
        // Free device memory 
        clReleaseMemObject(d_data); 
       
        // Release OpenCL context and queue 
        clReleaseCommandQueue(cl_queue); 
        clReleaseContext(cl_dev_context); 
       
        // Free objects
        free(cl_devices);  
        free(h_src); 
        free(h_res);
        return 0;
      }
        • Re: calling clAmdFftEnqueueTransform a second time crashes on CPU
          bragadeesh

          Are you using the AMD APP SDK OpenCL platform for Intel I7? Or is it Intel's OpenCL platform?

          Our libraries are very well tested on AMD GPUs, but not thoroughly tested on CPUs as it is not our primary target. We have some known issues and failures on CPU targets with the 1.8 version. And work is in progress to address those. You are probably running into those same issues.

           

          Is CPU device important for your application?

          1 of 1 people found this helpful
          • Re: calling clAmdFftEnqueueTransform a second time crashes on CPU
            gugi

            I had a similar problem with a backwards complex-to-real transformation: The first call to clAmdFftEnqueueTransform() was successful, the second caused a crash.

            After I supplied a temporary buffer to clAmdFftEnqueueTransform() (i.e. its last argument wasn't NULL), everything worked fine.

              • Re: calling clAmdFftEnqueueTransform a second time crashes on CPU
                cipoint

                Thank you both for replying. I can't test it right now ... How big should this temporary buffer be? I couldn't find it in the documentation.

                • Re: calling clAmdFftEnqueueTransform a second time crashes on CPU
                  dmyablonski

                  I ran into this same problem the past day or so.

                   

                  Would it be possible to note this somewhere in the reference manual, besides the description of the function that gets the size of the temporary buffer?

                   

                  Here is the wording from the temporary buffer parameter description of enqueuetransform...

                  A cl_mem object that is reserved as a temporary buffer for FFT

                  processing. Only OpenCL buffer objects are supported; OpenCL

                  image objects return an error code. If clTmpBuffers is NULL or

                  nullptr, and the runtime needs temporary storage, an internal

                  temporary buffer is created on the fly and managed by the runtime.

                   

                  I read this and believed that, since I have plenty of free memory, the runtime would handle it well enough. That was not the case and I received errors. I was doing the same thing, iterating over the same FFT for a benchmark. I was running the library on an Intel host as well, FYI. It mainly seemed to affect batched transforms. It would just stall for a very long time before reporting a segmentation fault.