AnsweredAssumed Answered

OpenCL GPU compiler crash

Question asked by trlulu on Aug 19, 2014
Latest reply on Aug 21, 2014 by maxdz8

Hi,

 

I am working on a kernel that find intersections between ray and a triangle list, but (there is always a "but" ) i got some trouble using my opencl compiler indeed it crashes when I try to compile it.

I try to compile it on my CPU compiler and it compile well, but with my GPU compiler it crashes...

 

//-----------------------------------------------------------------------------
//---------------------------------DEFINES-------------------------------------
//-----------------------------------------------------------------------------

#define RAYON_SORTANT -1000
#define RAYON_ENTRANT 1000

#define MIN_LONGUEUR_RT 1.E-6f

//-----------------------------------------------------------------------------
//---------------------------------CONTENT-------------------------------------
//-----------------------------------------------------------------------------

typedef struct s_CDPoint
{
    float x;
    float y;
    float z;
} CDPoint;

typedef struct s_TTriangle
{
    CDPoint            triangle_[3];
    CDPoint            normal_;
} TTriangle;

typedef struct s_GridIntersection
{
    CDPoint         pos_;
    float             distance_;
    int                sensNormale_;
    unsigned int    idTriangle_;
} TGridIntersection;

//-----------------------------------------------------------------------------
//---------------------------------MUTEX---------------------------------------
//-----------------------------------------------------------------------------

void GetSemaphor(__global int * semaphor)
{
   int occupied = atomic_xchg(semaphor, 1);
   while(occupied > 0)
   {
     occupied = atomic_xchg(semaphor, 1);
   }
}

void ReleaseSemaphor(__global int * semaphor)
{
   int prevVal = atomic_xchg(semaphor, 0);
}

//-----------------------------------------------------------------------------
//---------------------------------GEOMETRIE-----------------------------------
//-----------------------------------------------------------------------------

float dotProduct(const CDPoint* pA, const CDPoint* pB)
{
    return (pA->x * pB->x + pA->y * pB->y + pA->z * pB->z);
}

CDPoint crossProduct(const CDPoint* pA, const CDPoint* pB)
{
    CDPoint res;

    res.x = pA->y * pB->z - pB->y * pA->z;
    res.y = pA->z * pB->x - pB->z * pA->x;
    res.z = pA->x * pB->y - pB->x * pA->y;

    return res;
}

CDPoint soustraction(const CDPoint* pA, const CDPoint* pB)
{
    CDPoint res;
    res.x = pA->x - pB->x;
    res.y = pA->y - pB->y;
    res.z = pA->z - pB->z;
    return res;
}

CDPoint addition(const CDPoint* pA, const CDPoint* pB)
{
    CDPoint res;
    res.x = pA->x + pB->x;
    res.y = pA->y + pB->y;
    res.z = pA->z + pB->z;
    return res;
}

CDPoint homothetie(const CDPoint* pA, float val)
{
    CDPoint pnt;
    pnt.x = pA->x * val;
    pnt.y = pA->y * val;
    pnt.z = pA->z * val;
    return pnt;
}

//-----------------------------------------------------------------------------
//---------------------------------KERNEL--------------------------------------
//-----------------------------------------------------------------------------

__kernel void IntersectionTriangle(    __global const TTriangle* pTriangleListe,
                            const unsigned int idxDebutTriangle,
                            const unsigned int idxFin,
                            __constant const CDPoint* pPointOrigine,
                            __constant const CDPoint* pDir,
                            __global int *nbInter,
                            __global TGridIntersection* pResults    )
{
    __private unsigned int index = get_global_id(0) + idxDebutTriangle;
   
    if (index > idxFin) return;
   
    __global const TTriangle *pTriangle = &pTriangleListe[index];
    __private float distance = 0.f;
   
    // Côté du triangle et normale au plan
    __private CDPoint edge1 = soustraction(&pTriangle->triangle_[1], &pTriangle->triangle_[0]);
    __private CDPoint edge2 = soustraction(&pTriangle->triangle_[2], &pTriangle->triangle_[0]);
    __private CDPoint pvec = crossProduct(pDir, &edge2);            // produit vectoriel

    // Le rayon et le triangle sont il parallèle ?
    __private float det = dotProduct(&edge1, &pvec);
    if (det == 0.f)
    {
        return ;
    }
    __private float inv_det = 1.f / det;

    // Distance origin t0
    __private CDPoint tvec = soustraction(pPointOrigine, &pTriangle->triangle_[0]);

    //Calculate u parameter and test bound
    __private float u = (dotProduct(&tvec, &pvec)) * inv_det;
    //The intersection lies outside of the triangle
    if (u < -MIN_LONGUEUR_RT
        || u > 1.f + MIN_LONGUEUR_RT)
    {
        return ;
    }

    u = max(u, 0.f);

    //Prepare to test v parameter
    __private CDPoint qvec = crossProduct(&tvec, &edge1);

    //Calculate V parameter and test bound
    __private float v = dotProduct(pDir, &qvec) * inv_det;

    //The intersection lies outside of the triangle
    if (v < -MIN_LONGUEUR_RT
        || u + v  > 1.f + MIN_LONGUEUR_RT)
    {
        return ;
    }

    // Get distance
    distance = dotProduct(&edge2, &qvec) * inv_det;

    if (distance > -MIN_LONGUEUR_RT)
    {
        // We are using nbInter as semaphor index
        GetSemaphor(nbInter);
       
        __private int idxInter = *nbInter;

        pResults[idxInter].distance_ = max(distance, 0.f);

        // Intersection point
        __private CDPoint vDir = homothetie(pDir, distance);
        pResults[idxInter].pos_ = addition(pPointOrigine, &vDir);

        // Get ray way
        pResults[idxInter].sensNormale_ = dotProduct(&pTriangle->normal_, pDir) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;

        // Triangle id
        pResults[idxInter].idTriangle_ = index - idxDebutTriangle;
       
        // inc nb inter
        *nbInter = *nbInter + 1;
       
        ReleaseSemaphor(nbInter);
    }
   
}

 

I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want !

 

Here it is my openCL computer configuration :

 

Platform [0]
   id    = 5339E7D8
   profile = FULL_PROFILE
   version = OpenCL 1.2 AMD-APP (1445.5)
   name  = AMD Accelerated Parallel Processing
   vendor = Advanced Micro Devices, Inc.
   extensions = cl_khr_icd
                cl_khr_d3d10_sharing
                cl_khr_d3d11_sharing
                cl_khr_dx9_media_sharing
                cl_amd_event_callback
                cl_amd_offline_devices
                cl_amd_hsa

 

 

   2 Devices detected
       Device [0]
           id                       = 010DFA00
           type                     = CL_DEVICE_TYPE_GPU
           name                     = Cedar
           vendor                   = Advanced Micro Devices, Inc.
           driver version           = 1445.5 (VM)
           device version           = OpenCL 1.2 AMD-APP (1445.5)
           profile                  = FULL_PROFILE
           max compute units        = 2
           max work items dimensions = 3
           max work item sizes      = 128 / 128 / 128
           max work group size      = 128
           max clock frequency      = 650 MHz
           address_bits             = 32
           max mem alloc size       = 512 MB
           global mem size          = 1024 MB
           image support            = CL_TRUE
           max read image args      = 128
           max write image args     = 8
           2D image max size        = 16384 x 16384
           3D image max size        = 2048 x 2048 x 2048
           max samplers             = 16
           max parameter size       = 1024
           mem base addr align      = 2048
           min data type align size = 128
           single fp config         = CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
           global mem cache type    = CL_NONE
           max constant buffer size = 64 KB
           max constant args        = 8
           local mem type           = CL_LOCAL
           local mem size           = 32 KB
           error correction support = CL_FALSE
           profiling timer resolution = 1 ns
           endian little            = CL_TRUE
           available                = CL_TRUE
           compiler available       = CL_TRUE
           execution capabilities   = CL_EXEC_KERNEL
           queue properties         = CL_QUEUE_PROFILING_ENABLE
           extensions               = cl_khr_global_int32_base_atomics
                                           cl_khr_global_int32_extended_atomics
                                           cl_khr_local_int32_base_atomics
                                           cl_khr_local_int32_extended_atomics
                                           cl_khr_3d_image_writes
                                           cl_khr_byte_addressable_store
                                           cl_khr_gl_sharing
                                           cl_ext_atomic_counters_32
                                           cl_amd_device_attribute_query
                                           cl_amd_vec3
                                           cl_amd_printf
                                           cl_amd_media_ops
                                           cl_amd_media_ops2
                                           cl_amd_popcnt
                                           cl_khr_d3d10_sharing
                                           cl_khr_d3d11_sharing
                                           cl_khr_dx9_media_sharing
                                           cl_amd_image2d_from_buffer_read_only
                                           cl_khr_spir
                                           cl_khr_gl_event

 

       Device [1]
           id                       = 03501CD0
           type                     = CL_DEVICE_TYPE_CPU
           name                     =     Intel(R) Core(TM) i3-2130 CPU @ 3.40GHz
           vendor                   = GenuineIntel
           driver version           = 1445.5 (sse2,avx)
           device version           = OpenCL 1.2 AMD-APP (1445.5)
           profile                  = FULL_PROFILE
           max compute units        = 4
           max work items dimensions = 3
           max work item sizes      = 1024 / 1024 / 1024
           max work group size      = 1024
           max clock frequency      = 3392 MHz
           address_bits             = 32
           max mem alloc size       = 1024 MB
           global mem size          = 2048 MB
           image support            = CL_TRUE
           max read image args      = 128
           max write image args     = 8
           2D image max size        = 8192 x 8192
           3D image max size        = 2048 x 2048 x 2048
           max samplers             = 16
           max parameter size       = 4096
           mem base addr align      = 1024
           min data type align size = 128
           single fp config         = CL_FP_DENORM CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
           global mem cache type    = CL_READ_WRITE_CACHE
           global mem cacheline size = 64
           global mem cache size    = 32768
           max constant buffer size = 64 KB
           max constant args        = 8
           local mem type           = CL_GLOBAL
           local mem size           = 32 KB
           error correction support = CL_FALSE
           profiling timer resolution = 301 ns
           endian little            = CL_TRUE
           available                = CL_TRUE
           compiler available       = CL_TRUE
           execution capabilities   = CL_EXEC_KERNEL CL_EXEC_NATIVE_KERNEL
           queue properties         = CL_QUEUE_PROFILING_ENABLE
           extensions               = cl_khr_fp64
                                           cl_amd_fp64
                                           cl_khr_global_int32_base_atomics
                                           cl_khr_global_int32_extended_atomics
                                           cl_khr_local_int32_base_atomics
                                           cl_khr_local_int32_extended_atomics
                                           cl_khr_3d_image_writes
                                           cl_khr_byte_addressable_store
                                           cl_khr_gl_sharing
                                           cl_ext_device_fission
                                           cl_amd_device_attribute_query
                                           cl_amd_vec3
                                           cl_amd_printf
                                           cl_amd_media_ops
                                           cl_amd_media_ops2
                                           cl_amd_popcnt
                                           cl_khr_d3d10_sharing
                                           cl_khr_spir
                                           cl_amd_svm
                                           cl_khr_gl_event

 

 

 

I hope you will be able to help me, I am on this issue since days and i don't see what i am doing wrong !

Thank you and sorry about my poor english

Outcomes