Hi,
I have isolated what seems like a bug in the clAmdFFT. Test code is below.
I run a batched FFT over two 1D real vectors: one is all zeros and the other contains a short pulse. After the FFT, the all-zero vector ends up with non-zero values.
Input:
0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 |
0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 5.00E-01 | 1.00E+00 | 5.00E-01 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 |
Output:
0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 1.49E-08 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | -2.98E-08 | -1.49E-08 | -2.98E-08 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 | 0.00E+00 |
2.00E+00 | 0.00E+00 | -1.98E+00 | 1.49E-08 | 1.92E+00 | 0.00E+00 | -1.83E+00 | 1.49E-08 | 1.71E+00 | 0.00E+00 | -1.56E+00 | 0.00E+00 | 1.38E+00 | 0.00E+00 | -1.20E+00 | 1.00E+00 | 0.00E+00 | -8.05E-01 | 0.00E+00 | 6.17E-01 | 0.00E+00 | -4.44E-01 | 0.00E+00 | 2.93E-01 | 0.00E+00 | -1.69E-01 | 1.49E-08 | 7.61E-02 | 0.00E+00 | -1.92E-02 | 1.49E-08 | 0.00E+00 | 0.00E+00 | 0.00E+00 |
#include <stdio.h>
#include <math.h>
#include <string.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
#include "clAmdFft.h"
int main(int argc, char *argv[])
{
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_mem mem;
cl_command_queue queue;
clAmdFftPlanHandle plan;
clAmdFftSetupData setupData;
float data[2][48];
size_t distance = &data[1][0] - &data[0][0];
size_t lengths[3] = {32, 1, 1};
size_t strideIn[4] = {1, distance, distance, distance};
size_t strideOut[4] = {1, distance / 2, distance / 2, distance / 2};
memset(data, 0, sizeof(data));
data[1][15] = 0.5f;
data[1][16] = 1.0f;
data[1][17] = 0.5f;
clGetPlatformIDs(1, &platform_id, NULL);
clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, NULL);
queue = clCreateCommandQueue(context, device_id, 0, NULL);
mem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 2 * distance, NULL, NULL);
clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0L, sizeof(float) * 2 * distance, data[0], 0, NULL, NULL);
setupData.major = clAmdFftVersionMajor;
setupData.minor = clAmdFftVersionMinor;
setupData.patch = clAmdFftVersionPatch;
setupData.debugFlags = 0;
clAmdFftSetup(&setupData);
clAmdFftCreateDefaultPlan(&plan, context, CLFFT_1D, lengths);
clAmdFftSetPlanPrecision(plan, CLFFT_SINGLE);
clAmdFftSetLayout(plan, CLFFT_REAL, CLFFT_HERMITIAN_INTERLEAVED);
clAmdFftSetPlanDistance(plan, distance, distance / 2);
clAmdFftSetPlanInStride(plan, CLFFT_1D, strideIn);
clAmdFftSetPlanOutStride(plan, CLFFT_1D, strideOut);
clAmdFftSetResultLocation(plan, CLFFT_INPLACE);
clAmdFftSetPlanBatchSize(plan, 2);
clAmdFftBakePlan(plan, 1, &queue, NULL, NULL);
clAmdFftEnqueueTransform(plan, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &mem, &mem, NULL);
for (int j = 0; j < 2; j++) {
for (int i = 0; i < 34; i++)
printf("%e\t", data);
printf("\n");
}
printf("\n");
clEnqueueReadBuffer(queue, mem, CL_TRUE, 0L, sizeof(float) * 2 * distance, data[0], 0, NULL, NULL);
for (int j = 0; j < 2; j++) {
for (int i = 0; i < 34; i++)
printf("%e\t", data);
printf("\n");
}
printf("\n");
clAmdFftDestroyPlan(&plan);
clAmdFftTeardown();
clReleaseMemObject(mem);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseDevice(device_id);
}