Archives Discussions

trlulu · ‎08-19-2014

Hi,

I am working on a kernel that find intersections between ray and a triangle list, but (there is always a "but" ) i got some trouble using my opencl compiler indeed it crashes when I try to compile it.

I try to compile it on my CPU compiler and it compile well, but with my GPU compiler it crashes...


//-----------------------------------------------------------------------------

//---------------------------------DEFINES-------------------------------------

//-----------------------------------------------------------------------------


#define RAYON_SORTANT -1000

#define RAYON_ENTRANT 1000


#define MIN_LONGUEUR_RT 1.E-6f


//-----------------------------------------------------------------------------

//---------------------------------CONTENT-------------------------------------

//-----------------------------------------------------------------------------


typedef struct s_CDPoint

{

    float x;

    float y;

    float z;

} CDPoint;


typedef struct s_TTriangle

{

    CDPoint            triangle_[3];

    CDPoint            normal_;

} TTriangle;


typedef struct s_GridIntersection

{

    CDPoint         pos_;

    float             distance_;

    int                sensNormale_;

    unsigned int    idTriangle_;

} TGridIntersection;


//-----------------------------------------------------------------------------

//---------------------------------MUTEX---------------------------------------

//-----------------------------------------------------------------------------


void GetSemaphor(__global int * semaphor)

{

   int occupied = atomic_xchg(semaphor, 1);

   while(occupied > 0)

   {

     occupied = atomic_xchg(semaphor, 1);

   }

}


void ReleaseSemaphor(__global int * semaphor)

{

   int prevVal = atomic_xchg(semaphor, 0);

}


//-----------------------------------------------------------------------------

//---------------------------------GEOMETRIE-----------------------------------

//-----------------------------------------------------------------------------


float dotProduct(const CDPoint* pA, const CDPoint* pB)

{

    return (pA->x * pB->x + pA->y * pB->y + pA->z * pB->z);

}


CDPoint crossProduct(const CDPoint* pA, const CDPoint* pB)

{

    CDPoint res;


    res.x = pA->y * pB->z - pB->y * pA->z;

    res.y = pA->z * pB->x - pB->z * pA->x;

    res.z = pA->x * pB->y - pB->x * pA->y;


    return res;

}


CDPoint soustraction(const CDPoint* pA, const CDPoint* pB)

{

    CDPoint res;

    res.x = pA->x - pB->x;

    res.y = pA->y - pB->y;

    res.z = pA->z - pB->z;

    return res;

}


CDPoint addition(const CDPoint* pA, const CDPoint* pB)

{

    CDPoint res;

    res.x = pA->x + pB->x;

    res.y = pA->y + pB->y;

    res.z = pA->z + pB->z;

    return res;

}


CDPoint homothetie(const CDPoint* pA, float val)

{

    CDPoint pnt;

    pnt.x = pA->x * val;

    pnt.y = pA->y * val;

    pnt.z = pA->z * val;

    return pnt;

}


//-----------------------------------------------------------------------------

//---------------------------------KERNEL--------------------------------------

//-----------------------------------------------------------------------------


__kernel void IntersectionTriangle(    __global const TTriangle* pTriangleListe,

                            const unsigned int idxDebutTriangle,

                            const unsigned int idxFin,

                            __constant const CDPoint* pPointOrigine,

                            __constant const CDPoint* pDir,

                            __global int *nbInter,

                            __global TGridIntersection* pResults    )

{

    __private unsigned int index = get_global_id(0) + idxDebutTriangle;

   

    if (index > idxFin) return;

   

    __global const TTriangle *pTriangle = &pTriangleListe[index];

    __private float distance = 0.f;

   

    // Côté du triangle et normale au plan

    __private CDPoint edge1 = soustraction(&pTriangle->triangle_[1], &pTriangle->triangle_[0]);

    __private CDPoint edge2 = soustraction(&pTriangle->triangle_[2], &pTriangle->triangle_[0]);

    __private CDPoint pvec = crossProduct(pDir, &edge2);            // produit vectoriel


    // Le rayon et le triangle sont il parallèle ?

    __private float det = dotProduct(&edge1, &pvec);

    if (det == 0.f)

    {

        return ;

    }

    __private float inv_det = 1.f / det;


    // Distance origin t0

    __private CDPoint tvec = soustraction(pPointOrigine, &pTriangle->triangle_[0]);


    //Calculate u parameter and test bound

    __private float u = (dotProduct(&tvec, &pvec)) * inv_det;

    //The intersection lies outside of the triangle

    if (u < -MIN_LONGUEUR_RT

        || u > 1.f + MIN_LONGUEUR_RT)

    {

        return ;

    }


    u = max(u, 0.f);


    //Prepare to test v parameter

    __private CDPoint qvec = crossProduct(&tvec, &edge1);


    //Calculate V parameter and test bound

    __private float v = dotProduct(pDir, &qvec) * inv_det;


    //The intersection lies outside of the triangle

    if (v < -MIN_LONGUEUR_RT

        || u + v  > 1.f + MIN_LONGUEUR_RT)

    {

        return ;

    }


    // Get distance

    distance = dotProduct(&edge2, &qvec) * inv_det;


    if (distance > -MIN_LONGUEUR_RT)

    {

        // We are using nbInter as semaphor index

        GetSemaphor(nbInter);

       

        __private int idxInter = *nbInter;


        pResults[idxInter].distance_ = max(distance, 0.f);


        // Intersection point

        __private CDPoint vDir = homothetie(pDir, distance);

        pResults[idxInter].pos_ = addition(pPointOrigine, &vDir);


        // Get ray way

        pResults[idxInter].sensNormale_ = dotProduct(&pTriangle->normal_, pDir) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;


        // Triangle id

        pResults[idxInter].idTriangle_ = index - idxDebutTriangle;

       

        // inc nb inter

        *nbInter = *nbInter + 1;

       

        ReleaseSemaphor(nbInter);

    }

   

}

I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want !

Here it is my openCL computer configuration :

Platform [0]
   id    = 5339E7D8
   profile = FULL_PROFILE
   version = OpenCL 1.2 AMD-APP (1445.5)
   name = AMD Accelerated Parallel Processing
   vendor = Advanced Micro Devices, Inc.
   extensions = cl_khr_icd
                cl_khr_d3d10_sharing
                cl_khr_d3d11_sharing
                cl_khr_dx9_media_sharing
                cl_amd_event_callback
                cl_amd_offline_devices
                cl_amd_hsa

   2 Devices detected
       Device [0]
           id                       = 010DFA00
           type                     = CL_DEVICE_TYPE_GPU
           name                     = Cedar
           vendor                   = Advanced Micro Devices, Inc.
           driver version           = 1445.5 (VM)
           device version           = OpenCL 1.2 AMD-APP (1445.5)
           profile                  = FULL_PROFILE
           max compute units        = 2
           max work items dimensions = 3
           max work item sizes      = 128 / 128 / 128
           max work group size      = 128
           max clock frequency      = 650 MHz
           address_bits             = 32
           max mem alloc size       = 512 MB
           global mem size          = 1024 MB
           image support            = CL_TRUE
           max read image args      = 128
           max write image args     = 8
           2D image max size        = 16384 x 16384
           3D image max size        = 2048 x 2048 x 2048
           max samplers             = 16
           max parameter size       = 1024
           mem base addr align      = 2048
           min data type align size = 128
           single fp config         = CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
           global mem cache type    = CL_NONE
           max constant buffer size = 64 KB
           max constant args        = 8
           local mem type           = CL_LOCAL
           local mem size           = 32 KB
           error correction support = CL_FALSE
           profiling timer resolution = 1 ns
           endian little            = CL_TRUE
           available                = CL_TRUE
           compiler available       = CL_TRUE
           execution capabilities   = CL_EXEC_KERNEL
           queue properties         = CL_QUEUE_PROFILING_ENABLE
           extensions               = cl_khr_global_int32_base_atomics
                                           cl_khr_global_int32_extended_atomics
                                           cl_khr_local_int32_base_atomics
                                           cl_khr_local_int32_extended_atomics
                                           cl_khr_3d_image_writes
                                           cl_khr_byte_addressable_store
                                           cl_khr_gl_sharing
                                           cl_ext_atomic_counters_32
                                           cl_amd_device_attribute_query
                                           cl_amd_vec3
                                           cl_amd_printf
                                           cl_amd_media_ops
                                           cl_amd_media_ops2
                                           cl_amd_popcnt
                                           cl_khr_d3d10_sharing
                                           cl_khr_d3d11_sharing
                                           cl_khr_dx9_media_sharing
                                           cl_amd_image2d_from_buffer_read_only
                                           cl_khr_spir
                                           cl_khr_gl_event

       Device [1]
           id                       = 03501CD0
           type                     = CL_DEVICE_TYPE_CPU
           name                     =     Intel(R) Core(TM) i3-2130 CPU @ 3.40GHz
           vendor                   = GenuineIntel
           driver version           = 1445.5 (sse2,avx)
           device version           = OpenCL 1.2 AMD-APP (1445.5)
           profile                  = FULL_PROFILE
           max compute units        = 4
           max work items dimensions = 3
           max work item sizes      = 1024 / 1024 / 1024
           max work group size      = 1024
           max clock frequency      = 3392 MHz
           address_bits             = 32
           max mem alloc size       = 1024 MB
           global mem size          = 2048 MB
           image support            = CL_TRUE
           max read image args      = 128
           max write image args     = 8
           2D image max size        = 8192 x 8192
           3D image max size        = 2048 x 2048 x 2048
           max samplers             = 16
           max parameter size       = 4096
           mem base addr align      = 1024
           min data type align size = 128
           single fp config         = CL_FP_DENORM CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
           global mem cache type    = CL_READ_WRITE_CACHE
           global mem cacheline size = 64
           global mem cache size    = 32768
           max constant buffer size = 64 KB
           max constant args        = 8
           local mem type           = CL_GLOBAL
           local mem size           = 32 KB
           error correction support = CL_FALSE
           profiling timer resolution = 301 ns
           endian little            = CL_TRUE
           available                = CL_TRUE
           compiler available       = CL_TRUE
           execution capabilities   = CL_EXEC_KERNEL CL_EXEC_NATIVE_KERNEL
           queue properties         = CL_QUEUE_PROFILING_ENABLE
           extensions               = cl_khr_fp64
                                           cl_amd_fp64
                                           cl_khr_global_int32_base_atomics
                                           cl_khr_global_int32_extended_atomics
                                           cl_khr_local_int32_base_atomics
                                           cl_khr_local_int32_extended_atomics
                                           cl_khr_3d_image_writes
                                           cl_khr_byte_addressable_store
                                           cl_khr_gl_sharing
                                           cl_ext_device_fission
                                           cl_amd_device_attribute_query
                                           cl_amd_vec3
                                           cl_amd_printf
                                           cl_amd_media_ops
                                           cl_amd_media_ops2
                                           cl_amd_popcnt
                                           cl_khr_d3d10_sharing
                                           cl_khr_spir
                                           cl_amd_svm
                                           cl_khr_gl_event

I hope you will be able to help me, I am on this issue since days and i don't see what i am doing wrong !

Thank you and sorry about my poor english

maxdz8 · ‎08-20-2014

You don't need the two underscores anymore. Drop them for improved readability.

You don't need private, it's default.

You should NOT define your own struct to replace vec3. This only makes the compiler have more work to do, eventually disabling some optimizations and preventing you from using intrinsic instructions. Feel free to pass vectors by value: most things in OpenCL-C get inlined anyway.

You probably think those two lines to be equivalent:


CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);

CDPoint soustraction(global const CDPoint* pA, global const CDPoint* pB);

They are not!

Unfortunately, when not specified, the default address space for pointers is private.

At LN120 you put there a global pointer. The compiler has a quirk and crashes (most of the time) instead of just giving you a warning.

View solution in original post

maxdz8 · ‎08-20-2014

You don't need the two underscores anymore. Drop them for improved readability.

You don't need private, it's default.

You should NOT define your own struct to replace vec3. This only makes the compiler have more work to do, eventually disabling some optimizations and preventing you from using intrinsic instructions. Feel free to pass vectors by value: most things in OpenCL-C get inlined anyway.

You probably think those two lines to be equivalent:


CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);

CDPoint soustraction(global const CDPoint* pA, global const CDPoint* pB);

They are not!

Unfortunately, when not specified, the default address space for pointers is private.

At LN120 you put there a global pointer. The compiler has a quirk and crashes (most of the time) instead of just giving you a warning.

trlulu · ‎08-20-2014

You told me to use float3 for vector, but I notice that sizeof(cl_float3) = 16 (Instead of 12 expected), there is an useless loss of 4 bytes per vertex... Sometime I work with huge model with lot of triangle, Is there a mean to optimize it ?

maxdz8 · ‎08-21-2014

If you read the documentation, it's clearly noted that vec3 is basically syntactic sugar with no difference from vec4.

BTW, I didn't told you to use vec3. I told you to not use three xyz values to replace what's essentially a vec3.

If you have a lot, a lot, a lot of memory to work on, you might have to pack/unpack those yourself. We're talking about a few million vertices at the very least.

As for the register usage, don't worry about it until your VGPRs start to grow too much.

Never optimize prematurely problems that exist in your head... only exist in your head! Profile, measure, iterate.

dipak · ‎08-20-2014

Hi,

Thanks for reporting this.

I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want

Don't understand how changing a pointer from global to private data-space solves the problem. Please can you share a sample code base (Host+Device) that manifests the problem and also, let us know your system setup like GPU, SDK, driver, OS etc.?

Regards,

trlulu · ‎08-20-2014

The kernel code make crash compiler, I use CodeXL compiler and it crashes (my program crashes also). It's difficult for me to give you the "host" code because my firm isn't agree.

But maxdz8 is right, the problem come from "CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);", to solve it I give to kernel direct type for orgin and direction, i also made a copy into local memory of triangle to have all my points in local memory and let my function prototype with "const CDPoint*..."


__kernel

void IntersectionTriangle(    __global const TTriangle* pTriangleListe,

                            const unsigned int idxDebutTriangle,

                            const unsigned int idxFin,

                            const CDPoint pointOrigine,

                            const CDPoint direction,

                            __global unsigned int *nbInter,

                            __global TGridIntersection* pResults    )

{

    const unsigned int gid = get_global_id(0);

    const unsigned int index = gid + idxDebutTriangle;

   

    if (index >= idxFin) return;

   

    __global const TTriangle *pGlobalTriangle = &pTriangleListe[index];

    float distance = 0.f;

   

    // Copy into local memory

    TTriangle triangle = *pGlobalTriangle;


    // Triangle side and plane normal

    CDPoint edge1 = soustraction(&triangle.triangle_[1], &triangle.triangle_[0]);

    CDPoint edge2 = soustraction(&triangle.triangle_[2], &triangle.triangle_[0]);

    CDPoint pvec = crossProduct(&direction, &edge2);            // produit vectoriel


    // Is ray and triangle parrallel ?

    float det = dotProduct(&edge1, &pvec);

    if (det == 0.f)

    {

        return ;

    }

    float inv_det = 1.f / det;


    // Distance origin t0

    CDPoint tvec = soustraction(&pointOrigine, &triangle.triangle_[0]);


    //Calculate u parameter and test bound

    float u = (dotProduct(&tvec, &pvec)) * inv_det;

    //The intersection lies outside of the triangle

    if (u < -MIN_LONGUEUR_RT

        || u > 1.f + MIN_LONGUEUR_RT)

    {

        return ;

    }


    u = max(u, 0.f);


    //Prepare to test v parameter

    CDPoint qvec = crossProduct(&tvec, &edge1);


    //Calculate V parameter and test bound

    float v = dotProduct(&direction, &qvec) * inv_det;


    //The intersection lies outside of the triangle

    if (v < -MIN_LONGUEUR_RT

        || u + v  > 1.f + MIN_LONGUEUR_RT)

    {

        return ;

    }


    distance = dotProduct(&edge2, &qvec) * inv_det;


    if (distance > -MIN_LONGUEUR_RT)

    {

        distance = max(distance, 0.f);


        // atomic incrementation of nbInter

#ifndef _DEBUG

        unsigned int idxInter = atomic_inc(nbInter);

#else   

        //To debug with CodeXL that doesn't manage atomic operation...

        unsigned int idxInter = *nbInter;

        ++*nbInter;

#endif


        pResults[idxInter].distance_ = distance;


        // Intersection

        CDPoint vDir = homotetie(&direction, distance);

        pResults[idxInter].pos_ = addition(&pointOrigine, &vDir);


        //Intersection way (inside/outside)

        pResults[idxInter].sensNormale_ = dotProduct(&triangle.normal_, &direction) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;

        pResults[idxInter].idTriangle_ = gid;

    }


}

I kept "CDPoint" because my c++ code uses it, but i wasn't aware it isn't optimized at all, the next step is to change them by float3.

The code upper is functionnal and give me the good intersections, but i am not sure it is really thread safe...

Thank for your reply and advices !

Edit :

My GPU is a Radeon 4050 (Cedar)

AMD SDK is 2.9

Catalyst drivers version is 2014.041.2226.38446

OS is Win7 SP1 64bits

Archives Discussions

OpenCL GPU compiler crash