Answered (Assumed Answered)

calling clAmdFftEnqueueTransform a second time crashes on CPU

Question asked by cipoint on Dec 13, 2012
Latest reply on Jan 30, 2013 by dmyablonski

If I call clAmdFftEnqueueTransform a second time, I get a segmentation fault when using the CPU (Intel i7). The same code runs without problems on an HD 6770. The problem only occurs for fftSize = 1e4 or bigger; for smaller FFT sizes both devices (GPU and CPU) run just fine with multiple FFTs in series (i.e. in a for-loop).

 

I can't figure out the problem.

 

#include <algorithm>
#include <cmath>
#include <complex>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <CL/opencl.h>
#include <clAmdFft.h>

 

using namespace std;
typedef complex<float> cfloat;

 

int main(void) {
  cl_platform_id *cl_platformIDs = NULL;     // IDs of OpenCL platforms 
  cl_uint cl_platformsN = 0;                 // Platform count
  cl_uint cl_deviceCount = 0;                // Device count         
  cl_device_id *cl_devices = NULL;           // Device IDs
  cl_context cl_dev_context;                 // Context 
  cl_command_queue cl_queue;                 // Queue
  clAmdFftSetupData fftSetupData;            // FFT setup data 
  clAmdFftPlanHandle fftPlan;                // FFT plan 
  clAmdFftDim fftDim = CLFFT_1D;             // FFT dimension 
  size_t fftSize[1];                         // FFT size
  fftSize[0] = 1e4;
  cl_mem d_data;                             // Device level data 
  cfloat *h_src;                    // Host level input data 
  cfloat *h_res;                    // Host level output data
  time_t start, end;
 
  // Allocate host memory 
  h_src = (cfloat*)malloc(fftSize[0]*sizeof(cfloat));
  h_res = (cfloat*)malloc(fftSize[0]*sizeof(cfloat));
 
  // Get FFT version 
  clAmdFftInitSetupData(&fftSetupData); 
  cout << "Using clAmdFft " << fftSetupData.major << "." << fftSetupData.minor << "." << fftSetupData.patch << endl; 
 
  // Get available platforms 
  clGetPlatformIDs(0, NULL, &cl_platformsN); 
  cl_platformIDs = (cl_platform_id*) malloc( cl_platformsN * sizeof(cl_platform_id)); 
  clGetPlatformIDs(cl_platformsN, cl_platformIDs, NULL);
 
  // Select platform
  cl_uint i = 0;
 
  // Get number of available devices for this platform 
  clGetDeviceIDs(cl_platformIDs[i], CL_DEVICE_TYPE_ALL, NULL, NULL, &cl_deviceCount); 
 
  // Get available device IDs for this platform 
  cl_devices = (cl_device_id*) malloc( cl_deviceCount * sizeof(cl_device_id));  
  clGetDeviceIDs(cl_platformIDs[i], CL_DEVICE_TYPE_ALL, cl_deviceCount, cl_devices, NULL); 
 
  // Print platform name 
  char platform_name[1024]; 
  clGetPlatformInfo(cl_platformIDs[i], CL_PLATFORM_NAME, 1024, &platform_name, NULL); 
  cout << "Compute using OpenCl platfrom " << i << " [" << platform_name << "]" << endl; 
 
  // Select device 
  cl_uint j = 1;

  // Print device name and type 
  cl_device_type device_type; 
  char device_name[1024]; 
  clGetDeviceInfo(cl_devices[j], CL_DEVICE_NAME, 1024, &device_name, NULL); 
  clGetDeviceInfo( cl_devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); 
  cout << "Using OpenCl device " << j << " [" << device_name << "]" << endl;

 

  // Create source data
  cout << "input data:" << endl;
  for(int i = 0; i < min(int (fftSize[0]), 10); i++) {
    h_src[i] = cfloat (sin(i), 0);
    h_res[i] = cfloat (0, 0);
    cout << h_src[i] << endl;
  }
 
  // Create OpenCL context 
  cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)cl_platformIDs[i], 0};
  cl_dev_context = clCreateContext(cps, cl_deviceCount, cl_devices, NULL, NULL, NULL); 
 
  // Create command queue 
  cl_queue = clCreateCommandQueue(cl_dev_context, cl_devices[j], CL_QUEUE_PROFILING_ENABLE, NULL);

 

  // Create device buffer 
  d_data = clCreateBuffer(cl_dev_context, CL_MEM_READ_WRITE, fftSize[0]*sizeof(cfloat), NULL, NULL);

 

  // Setup FFT 
  clAmdFftSetup(&fftSetupData);

 

  // Create FFT plan
  clAmdFftCreateDefaultPlan(&fftPlan, cl_dev_context, fftDim, fftSize);

 

  // Set plan precision
  clAmdFftSetPlanPrecision(fftPlan, CLFFT_SINGLE);
  time(&start);

 

    // Copy data from host to device
    clEnqueueWriteBuffer(cl_queue, d_data, CL_TRUE, 0, fftSize[0]*sizeof(cfloat), h_src, 0, NULL, NULL); 
    clFinish(cl_queue);
   
    // Execute FFT
    clAmdFftEnqueueTransform(fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL);
    clFinish(cl_queue);
   
    // Copy result from device to host 
    clEnqueueReadBuffer(cl_queue, d_data, CL_TRUE, 0, fftSize[0]*sizeof(cfloat), h_res, 0, NULL, NULL);
    clFinish(cl_queue);
   
    // Copy data from host to device
    clEnqueueWriteBuffer(cl_queue, d_data, CL_TRUE, 0, fftSize[0]*sizeof(cfloat), h_src, 0, NULL, NULL); 
    clFinish(cl_queue);
   
    // Execute FFT
    clAmdFftEnqueueTransform(fftPlan, CLFFT_FORWARD, 1, &cl_queue, 0, NULL, NULL, &d_data, NULL, NULL);
    clFinish(cl_queue);

  time(&end);
  cout << "total exec time for 100 ffts: " << end - start << "s" << endl;
 
  // Print result
  cout << "output data:" << endl;
  for(int i = 0; i < min(int (fftSize[0]), 10); i++) {
    cout << h_res[i] << endl;
  }
 
  // Free FFT plan 
  clAmdFftDestroyPlan(&fftPlan); 
 
  // Free FFT 
  clAmdFftTeardown(); 
 
  // Free device memory 
  clReleaseMemObject(d_data); 
 
  // Release OpenCL context and queue 
  clReleaseCommandQueue(cl_queue); 
  clReleaseContext(cl_dev_context); 
 
  // Free objects
  free(cl_devices);  
  free(h_src); 
  free(h_res);
  return 0;
}

Outcomes