3 Replies Latest reply on Apr 4, 2014 1:25 AM by sudarshan

    Finding minimum of square of difference between two arrays



      I have been trying to execute a simple kernel but it returns garbage values and I am unable to figure why. I want to find the closest set of planes from a given plane set using the angles between the planes. So, the criteria is to find the minimum of the square of the difference of the corresponding angles. In this case, the correct answer should be given as the planes which have near similar orientation. I am getting the desirable answer in CPU. But when I am sending it to kernel, it sends out a different answer not consistent with my calculations.


      __kernel void getTransformation( __global uint* permut1, __global float2* dot1,__global int4* combo1, __global float2* dot2,__global int4* combo2, , int size1, int size2, __global float4* trans)
        int gid = get_global_id(0);
        float2 temp_dot;
        float min_dot = FLT_MAX;
        int ind = 0;
        for(int i=0;i<size2;i++)
        temp_dot = (dot2[i].x - dot1[permut1[gid]].x,dot2[i].y - dot1[permut1[gid]].y);
        if((temp_dot.x*temp_dot.x + temp_dot.y*temp_dot.y) < min_dot)
        min_dot = temp_dot.x*temp_dot.x + temp_dot.y*temp_dot.y;
        ind = i;
        float4 num_pl2 = combo2[ind];
        trans[gid] = convert_float4_rtp(num_pl2);
        • Re: Finding minimum of square of difference between two arrays


             The problem may be because the vector data arrays you are passing to the kernel are not properly byte aligned. Could you tell us how you are setting arguments of the kernel and calling it from the host?



          AMD Support

            • Re: Re: Finding minimum of square of difference between two arrays
              cl_mem d_permut1,d_combo1,d_combo2,d_dot1,d_dot2;
                // use about 1024 different combinations
                limit_size = 1024;
                unsigned int* temp_permut = (unsigned int*)malloc(sizeof(unsigned int)*limit_size);
                d_permut1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned int)*limit_size, NULL, &err);
                d_combo1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint4)*size1, NULL, &err);
                d_dot1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2)*size1, NULL, &err);
                d_combo2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint4)*size2, NULL, &err);
                d_dot2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2)*size2, NULL, &err);
                d_trans   = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*limit_size*4, NULL, &err);
                int j=0;
                // Transferring data starting from end
                for(int i=size1-1;i>=size1-limit_size*6;i-=6,j++)
                temp_permut[j] = permut1[i];
                err |= clEnqueueWriteBuffer(commands,d_permut1, CL_TRUE, 0, sizeof(unsigned int)*limit_size, temp_permut,0,NULL,NULL);
                err |= clEnqueueWriteBuffer(commands,d_combo1 , CL_TRUE, 0, sizeof(cl_uint4)*size1, combo1, 0, NULL, NULL);
                err |= clEnqueueWriteBuffer(commands,d_dot1   , CL_TRUE, 0, sizeof(cl_float2)*size1, dot1, 0, NULL, NULL);
                err |= clEnqueueWriteBuffer(commands,d_combo2 , CL_TRUE, 0, sizeof(cl_uint4)*size2, combo2, 0, NULL, NULL);
                err |= clEnqueueWriteBuffer(commands,d_dot2   , CL_TRUE, 0, sizeof(cl_float2)*size2, dot2, 0, NULL, NULL);
                err |= clSetKernelArg(transKernel, 0, sizeof(cl_mem), (void*)&d_permut1);
                err |= clSetKernelArg(transKernel, 1, sizeof(cl_mem), (void*)&d_combo1);
                err |= clSetKernelArg(transKernel, 2, sizeof(cl_mem), (void*)&d_dot1);
                err |= clSetKernelArg(transKernel, 3, sizeof(cl_mem), (void*)&d_combo2);
                err |= clSetKernelArg(transKernel, 4, sizeof(cl_mem), (void*)&d_dot2);
                err |= clSetKernelArg(transKernel, 5, sizeof(unsigned int), (void*)&pl1_size);
                err |= clSetKernelArg(transKernel, 6, sizeof(unsigned int), (void*)&pl2_size);
                err |= clSetKernelArg(transKernel, 7, sizeof(cl_mem), (void*)&d_trans);
                size_t global_size[1] = {limit_size};
                size_t local_size[1] = {64};
                clEnqueueNDRangeKernel(commands,transKernel, 1, NULL, global_size, local_size, 0, NULL, NULL);


              I am populating the dot and combo arrays in a previous kernel. I have found out that they are populating properly.