Hi,
I have been trying to execute a simple kernel, but it returns garbage values and I am unable to figure out why. I want to find the closest set of planes to a given plane set using the angles between the planes. So, the criterion is to find the minimum of the square of the difference of the corresponding angles. In this case, the correct answer should be the planes which have nearly the same orientation. I am getting the desired answer on the CPU. But when I run it in the kernel, it produces a different answer that is not consistent with my calculations.
/*
 * For each work-item, look up the reference angle pair dot1[permut1[gid]],
 * scan all size2 entries of dot2 for the one with the smallest squared
 * Euclidean distance to it, and write the matching combo2 entry (converted
 * to float4) to trans[gid].
 *
 * Parameters:
 *   permut1 - per-work-item index into dot1
 *   dot1    - angle pairs of the first plane set
 *   combo1  - not read by this kernel (kept for interface compatibility)
 *   dot2    - angle pairs of the second plane set (size2 entries are scanned)
 *   combo2  - plane combinations parallel to dot2
 *   size1   - not read by this kernel (kept for interface compatibility)
 *   size2   - number of dot2/combo2 entries to scan
 *   trans   - output, one float4 per work-item
 *
 * Ties keep the first (lowest-index) minimum because the comparison is strict.
 * If size2 <= 0 the loop does not run and combo2[0] is read.
 */
__kernel void getTransformation( __global uint* permut1, __global float2* dot1,__global int4* combo1, __global float2* dot2,__global int4* combo2, int size1, int size2, __global float4* trans)
{
    int gid = get_global_id(0);

    /* The reference pair does not depend on the loop index: load it once. */
    float2 ref = dot1[permut1[gid]];

    float min_dot = FLT_MAX;
    int ind = 0;

    for (int i = 0; i < size2; i++)
    {
        /* Component-wise difference to the i-th candidate. */
        float2 diff = dot2[i] - ref;
        float dist_sq = diff.x * diff.x + diff.y * diff.y;

        if (dist_sq < min_dot)
        {
            min_dot = dist_sq;
            ind = i;
        }
    }

    /* OpenCL C has no implicit vector conversions: convert int4 -> float4 explicitly. */
    trans[gid] = convert_float4_rtp(combo2[ind]);
}
Hi,
The problem may be because the vector data arrays you are passing to the kernel are not properly byte aligned. Could you tell us how you are setting arguments of the kernel and calling it from the host?
Thanks,
AMD Support
/*
 * Host-side setup for the getTransformation kernel: create the device
 * buffers, upload the inputs, bind the kernel arguments and launch
 * limit_size work-items in groups of 64.
 *
 * NOTE(review): d_trans, limit_size, err, context, commands, transKernel and
 * the host arrays are not declared in this snippet; presumably they are
 * declared earlier in the caller.
 */
cl_mem d_permut1,d_combo1,d_combo2,d_dot1,d_dot2;
// use about 1024 different combinations
limit_size = 1024;
unsigned int* temp_permut = (unsigned int*)malloc(sizeof(unsigned int)*limit_size);
d_permut1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned int)*limit_size, NULL, &err);
d_combo1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint4)*size1, NULL, &err);
d_dot1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2)*size1, NULL, &err);
d_combo2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint4)*size2, NULL, &err);
d_dot2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2)*size2, NULL, &err);
d_trans = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*limit_size*4, NULL, &err);
int j=0;
// Transferring data starting from end
// NOTE(review): the subscripts were lost in the original post; [j]/[i] is the
// reconstruction implied by the loop counters -- confirm against the real source.
// NOTE(review): i becomes negative when size1 < limit_size*6 -- confirm size1 is large enough.
for(int i=size1-1;i>=size1-limit_size*6;i-=6,j++)
    temp_permut[j] = permut1[i];
err |= clEnqueueWriteBuffer(commands,d_permut1, CL_TRUE, 0, sizeof(unsigned int)*limit_size, temp_permut,0,NULL,NULL);
err |= clEnqueueWriteBuffer(commands,d_combo1 , CL_TRUE, 0, sizeof(cl_uint4)*size1, combo1, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(commands,d_dot1 , CL_TRUE, 0, sizeof(cl_float2)*size1, dot1, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(commands,d_combo2 , CL_TRUE, 0, sizeof(cl_uint4)*size2, combo2, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(commands,d_dot2 , CL_TRUE, 0, sizeof(cl_float2)*size2, dot2, 0, NULL, NULL);
clFinish(commands);
// Argument indices must follow the kernel's declared parameter order:
// (permut1, dot1, combo1, dot2, combo2, size1, size2, trans).
// Binding combo/dot buffers to the wrong slots makes the kernel reinterpret
// int4 data as float2 (and vice versa), which yields garbage results.
err |= clSetKernelArg(transKernel, 0, sizeof(cl_mem), (void*)&d_permut1);
err |= clSetKernelArg(transKernel, 1, sizeof(cl_mem), (void*)&d_dot1);
err |= clSetKernelArg(transKernel, 2, sizeof(cl_mem), (void*)&d_combo1);
err |= clSetKernelArg(transKernel, 3, sizeof(cl_mem), (void*)&d_dot2);
err |= clSetKernelArg(transKernel, 4, sizeof(cl_mem), (void*)&d_combo2);
// NOTE(review): pl1_size/pl2_size are passed here while the buffers are sized
// with size1/size2 -- confirm they hold the same element counts.
err |= clSetKernelArg(transKernel, 5, sizeof(unsigned int), (void*)&pl1_size);
err |= clSetKernelArg(transKernel, 6, sizeof(unsigned int), (void*)&pl2_size);
err |= clSetKernelArg(transKernel, 7, sizeof(cl_mem), (void*)&d_trans);
size_t global_size[1] = {limit_size};
size_t local_size[1] = {64};
// Keep the launch status so that a failed enqueue is visible in err.
err |= clEnqueueNDRangeKernel(commands,transKernel, 1, NULL, global_size, local_size, 0, NULL, NULL);
clFinish(commands);
I am populating the dot and combo arrays in a previous kernel. I have found out that they are populating properly.
Hi,
Can you try rewriting your code without using vector data types and see if the same problem persists? This problem may be occurring due to misalignment of vector data types.