5 Replies Latest reply on Aug 21, 2014 3:18 AM by maxdz8

    OpenCL GPU compiler crash

    trlulu

      Hi,

       

      I am working on a kernel that find intersections between ray and a triangle list, but (there is always a "but" ) i got some trouble using my opencl compiler indeed it crashes when I try to compile it.

      I try to compile it on my CPU compiler and it compile well, but with my GPU compiler it crashes...

       

      //-----------------------------------------------------------------------------
      //---------------------------------DEFINES-------------------------------------
      //-----------------------------------------------------------------------------
      
      #define RAYON_SORTANT -1000
      #define RAYON_ENTRANT 1000
      
      #define MIN_LONGUEUR_RT 1.E-6f
      
      //-----------------------------------------------------------------------------
      //---------------------------------CONTENT-------------------------------------
      //-----------------------------------------------------------------------------
      
      typedef struct s_CDPoint
      {
          float x;
          float y;
          float z;
      } CDPoint;
      
      typedef struct s_TTriangle
      {
          CDPoint            triangle_[3];
          CDPoint            normal_;
      } TTriangle;
      
      typedef struct s_GridIntersection
      {
          CDPoint         pos_;
          float             distance_;
          int                sensNormale_;
          unsigned int    idTriangle_;
      } TGridIntersection;
      
      //-----------------------------------------------------------------------------
      //---------------------------------MUTEX---------------------------------------
      //-----------------------------------------------------------------------------
      
      void GetSemaphor(__global int * semaphor)
      {
         int occupied = atomic_xchg(semaphor, 1);
         while(occupied > 0)
         {
           occupied = atomic_xchg(semaphor, 1);
         }
      }
      
      void ReleaseSemaphor(__global int * semaphor)
      {
         int prevVal = atomic_xchg(semaphor, 0);
      }
      
      //-----------------------------------------------------------------------------
      //---------------------------------GEOMETRIE-----------------------------------
      //-----------------------------------------------------------------------------
      
      float dotProduct(const CDPoint* pA, const CDPoint* pB)
      {
          return (pA->x * pB->x + pA->y * pB->y + pA->z * pB->z);
      }
      
      CDPoint crossProduct(const CDPoint* pA, const CDPoint* pB)
      {
          CDPoint res;
      
          res.x = pA->y * pB->z - pB->y * pA->z;
          res.y = pA->z * pB->x - pB->z * pA->x;
          res.z = pA->x * pB->y - pB->x * pA->y;
      
          return res;
      }
      
      CDPoint soustraction(const CDPoint* pA, const CDPoint* pB)
      {
          CDPoint res;
          res.x = pA->x - pB->x;
          res.y = pA->y - pB->y;
          res.z = pA->z - pB->z;
          return res;
      }
      
      CDPoint addition(const CDPoint* pA, const CDPoint* pB)
      {
          CDPoint res;
          res.x = pA->x + pB->x;
          res.y = pA->y + pB->y;
          res.z = pA->z + pB->z;
          return res;
      }
      
      CDPoint homothetie(const CDPoint* pA, float val)
      {
          CDPoint pnt;
          pnt.x = pA->x * val;
          pnt.y = pA->y * val;
          pnt.z = pA->z * val;
          return pnt;
      }
      
      //-----------------------------------------------------------------------------
      //---------------------------------KERNEL--------------------------------------
      //-----------------------------------------------------------------------------
      
      __kernel void IntersectionTriangle(    __global const TTriangle* pTriangleListe,
                                  const unsigned int idxDebutTriangle,
                                  const unsigned int idxFin,
                                  __constant const CDPoint* pPointOrigine,
                                  __constant const CDPoint* pDir,
                                  __global int *nbInter,
                                  __global TGridIntersection* pResults    )
      {
          __private unsigned int index = get_global_id(0) + idxDebutTriangle;
         
          if (index > idxFin) return;
         
          __global const TTriangle *pTriangle = &pTriangleListe[index];
          __private float distance = 0.f;
         
          // Côté du triangle et normale au plan
          __private CDPoint edge1 = soustraction(&pTriangle->triangle_[1], &pTriangle->triangle_[0]);
          __private CDPoint edge2 = soustraction(&pTriangle->triangle_[2], &pTriangle->triangle_[0]);
          __private CDPoint pvec = crossProduct(pDir, &edge2);            // produit vectoriel
      
          // Le rayon et le triangle sont il parallèle ?
          __private float det = dotProduct(&edge1, &pvec);
          if (det == 0.f)
          {
              return ;
          }
          __private float inv_det = 1.f / det;
      
          // Distance origin t0
          __private CDPoint tvec = soustraction(pPointOrigine, &pTriangle->triangle_[0]);
      
          //Calculate u parameter and test bound
          __private float u = (dotProduct(&tvec, &pvec)) * inv_det;
          //The intersection lies outside of the triangle
          if (u < -MIN_LONGUEUR_RT
              || u > 1.f + MIN_LONGUEUR_RT)
          {
              return ;
          }
      
          u = max(u, 0.f);
      
          //Prepare to test v parameter
          __private CDPoint qvec = crossProduct(&tvec, &edge1);
      
          //Calculate V parameter and test bound
          __private float v = dotProduct(pDir, &qvec) * inv_det;
      
          //The intersection lies outside of the triangle
          if (v < -MIN_LONGUEUR_RT
              || u + v  > 1.f + MIN_LONGUEUR_RT)
          {
              return ;
          }
      
          // Get distance
          distance = dotProduct(&edge2, &qvec) * inv_det;
      
          if (distance > -MIN_LONGUEUR_RT)
          {
              // We are using nbInter as semaphor index
              GetSemaphor(nbInter);
             
              __private int idxInter = *nbInter;
      
              pResults[idxInter].distance_ = max(distance, 0.f);
      
              // Intersection point
              __private CDPoint vDir = homothetie(pDir, distance);
              pResults[idxInter].pos_ = addition(pPointOrigine, &vDir);
      
              // Get ray way
              pResults[idxInter].sensNormale_ = dotProduct(&pTriangle->normal_, pDir) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;
      
              // Triangle id
              pResults[idxInter].idTriangle_ = index - idxDebutTriangle;
             
              // inc nb inter
              *nbInter = *nbInter + 1;
             
              ReleaseSemaphor(nbInter);
          }
         
      }
      
      

       

      I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want !

       

      Here it is my openCL computer configuration :

       

      Platform [0]
         id    = 5339E7D8
         profile = FULL_PROFILE
         version = OpenCL 1.2 AMD-APP (1445.5)
         name  = AMD Accelerated Parallel Processing
         vendor = Advanced Micro Devices, Inc.
         extensions = cl_khr_icd
                      cl_khr_d3d10_sharing
                      cl_khr_d3d11_sharing
                      cl_khr_dx9_media_sharing
                      cl_amd_event_callback
                      cl_amd_offline_devices
                      cl_amd_hsa

       

       

         2 Devices detected
             Device [0]
                 id                       = 010DFA00
                 type                     = CL_DEVICE_TYPE_GPU
                 name                     = Cedar
                 vendor                   = Advanced Micro Devices, Inc.
                 driver version           = 1445.5 (VM)
                 device version           = OpenCL 1.2 AMD-APP (1445.5)
                 profile                  = FULL_PROFILE
                 max compute units        = 2
                 max work items dimensions = 3
                 max work item sizes      = 128 / 128 / 128
                 max work group size      = 128
                 max clock frequency      = 650 MHz
                 address_bits             = 32
                 max mem alloc size       = 512 MB
                 global mem size          = 1024 MB
                 image support            = CL_TRUE
                 max read image args      = 128
                 max write image args     = 8
                 2D image max size        = 16384 x 16384
                 3D image max size        = 2048 x 2048 x 2048
                 max samplers             = 16
                 max parameter size       = 1024
                 mem base addr align      = 2048
                 min data type align size = 128
                 single fp config         = CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
                 global mem cache type    = CL_NONE
                 max constant buffer size = 64 KB
                 max constant args        = 8
                 local mem type           = CL_LOCAL
                 local mem size           = 32 KB
                 error correction support = CL_FALSE
                 profiling timer resolution = 1 ns
                 endian little            = CL_TRUE
                 available                = CL_TRUE
                 compiler available       = CL_TRUE
                 execution capabilities   = CL_EXEC_KERNEL
                 queue properties         = CL_QUEUE_PROFILING_ENABLE
                 extensions               = cl_khr_global_int32_base_atomics
                                                 cl_khr_global_int32_extended_atomics
                                                 cl_khr_local_int32_base_atomics
                                                 cl_khr_local_int32_extended_atomics
                                                 cl_khr_3d_image_writes
                                                 cl_khr_byte_addressable_store
                                                 cl_khr_gl_sharing
                                                 cl_ext_atomic_counters_32
                                                 cl_amd_device_attribute_query
                                                 cl_amd_vec3
                                                 cl_amd_printf
                                                 cl_amd_media_ops
                                                 cl_amd_media_ops2
                                                 cl_amd_popcnt
                                                 cl_khr_d3d10_sharing
                                                 cl_khr_d3d11_sharing
                                                 cl_khr_dx9_media_sharing
                                                 cl_amd_image2d_from_buffer_read_only
                                                 cl_khr_spir
                                                 cl_khr_gl_event

       

             Device [1]
                 id                       = 03501CD0
                 type                     = CL_DEVICE_TYPE_CPU
                 name                     =     Intel(R) Core(TM) i3-2130 CPU @ 3.40GHz
                 vendor                   = GenuineIntel
                 driver version           = 1445.5 (sse2,avx)
                 device version           = OpenCL 1.2 AMD-APP (1445.5)
                 profile                  = FULL_PROFILE
                 max compute units        = 4
                 max work items dimensions = 3
                 max work item sizes      = 1024 / 1024 / 1024
                 max work group size      = 1024
                 max clock frequency      = 3392 MHz
                 address_bits             = 32
                 max mem alloc size       = 1024 MB
                 global mem size          = 2048 MB
                 image support            = CL_TRUE
                 max read image args      = 128
                 max write image args     = 8
                 2D image max size        = 8192 x 8192
                 3D image max size        = 2048 x 2048 x 2048
                 max samplers             = 16
                 max parameter size       = 4096
                 mem base addr align      = 1024
                 min data type align size = 128
                 single fp config         = CL_FP_DENORM CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
                 global mem cache type    = CL_READ_WRITE_CACHE
                 global mem cacheline size = 64
                 global mem cache size    = 32768
                 max constant buffer size = 64 KB
                 max constant args        = 8
                 local mem type           = CL_GLOBAL
                 local mem size           = 32 KB
                 error correction support = CL_FALSE
                 profiling timer resolution = 301 ns
                 endian little            = CL_TRUE
                 available                = CL_TRUE
                 compiler available       = CL_TRUE
                 execution capabilities   = CL_EXEC_KERNEL CL_EXEC_NATIVE_KERNEL
                 queue properties         = CL_QUEUE_PROFILING_ENABLE
                 extensions               = cl_khr_fp64
                                                 cl_amd_fp64
                                                 cl_khr_global_int32_base_atomics
                                                 cl_khr_global_int32_extended_atomics
                                                 cl_khr_local_int32_base_atomics
                                                 cl_khr_local_int32_extended_atomics
                                                 cl_khr_3d_image_writes
                                                 cl_khr_byte_addressable_store
                                                 cl_khr_gl_sharing
                                                 cl_ext_device_fission
                                                 cl_amd_device_attribute_query
                                                 cl_amd_vec3
                                                 cl_amd_printf
                                                 cl_amd_media_ops
                                                 cl_amd_media_ops2
                                                 cl_amd_popcnt
                                                 cl_khr_d3d10_sharing
                                                 cl_khr_spir
                                                 cl_amd_svm
                                                 cl_khr_gl_event

       

       

       

      I hope you will be able to help me, I am on this issue since days and i don't see what i am doing wrong !

      Thank you and sorry about my poor english

        • Re: OpenCL GPU compiler crash
          maxdz8

          You don't need the two underscores anymore. Drop them for improved readability.

          You don't need private, it's default.

          You should NOT define your own struct to replace vec3. This only makes the compiler have more work to do, eventually disabling some optimizations and preventing you from using intrinsic instructions. Feel free to pass vectors by value: most things in OpenCL-C get inlined anyway.

           

          You probably think those two lines to be equivalent:

          CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);
          CDPoint soustraction(global const CDPoint* pA, global const CDPoint* pB);
          

          They are not!

          Unfortunately, when not specified, the default address space for pointers is private.

          At LN120 you put there a global pointer. The compiler has a quirk and crashes (most of the time) instead of just giving you a warning.

            • Re: OpenCL GPU compiler crash
              trlulu

              You told me to use float3 for vector, but I notice that sizeof(cl_float3) = 16 (Instead of 12 expected), there is an useless loss of 4 bytes per vertex... Sometime I work with huge model with lot of triangle, Is there a mean to optimize it ?

                • Re: OpenCL GPU compiler crash
                  maxdz8

                  If you read the documentation, it's clearly noted that vec3 is basically syntactic sugar with no difference from vec4.

                  BTW, I didn't told you to use vec3. I told you to not use three xyz values to replace what's essentially a vec3.

                  If you have a lot, a lot, a lot of memory to work on, you might have to pack/unpack those yourself. We're talking about a few million vertices at the very least.

                   

                  As for the register usage, don't worry about it until your VGPRs start to grow too much.

                   

                  Never optimize prematurely problems that exist in your head... only exist in your head! Profile, measure, iterate.

              • Re: OpenCL GPU compiler crash
                dipak

                Hi,

                Thanks for reporting this.

                 

                I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want

                Don't understand how changing a pointer from global to private data-space solves the problem. Please can you share a sample code base (Host+Device) that manifests the problem and also, let us know your system setup like GPU, SDK, driver, OS etc.?

                 

                Regards,

                  • Re: Re: OpenCL GPU compiler crash
                    trlulu

                    The kernel code make crash compiler, I use CodeXL compiler and it crashes (my program crashes also). It's difficult for me to give you the "host" code because my firm isn't agree.

                    But CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);", to solve it I give to kernel direct type for orgin and direction, i also made a copy into local memory of triangle to have all my points in local memory and let my function prototype with "const CDPoint*..."

                     

                    __kernel
                    void IntersectionTriangle(    __global const TTriangle* pTriangleListe,
                                                const unsigned int idxDebutTriangle,
                                                const unsigned int idxFin,
                                                const CDPoint pointOrigine,
                                                const CDPoint direction,
                                                __global unsigned int *nbInter,
                                                __global TGridIntersection* pResults    )
                    {
                        const unsigned int gid = get_global_id(0);
                        const unsigned int index = gid + idxDebutTriangle;
                       
                        if (index >= idxFin) return;
                       
                        __global const TTriangle *pGlobalTriangle = &pTriangleListe[index];
                        float distance = 0.f;
                       
                        // Copy into local memory
                        TTriangle triangle = *pGlobalTriangle;
                    
                        // Triangle side and plane normal
                        CDPoint edge1 = soustraction(&triangle.triangle_[1], &triangle.triangle_[0]);
                        CDPoint edge2 = soustraction(&triangle.triangle_[2], &triangle.triangle_[0]);
                        CDPoint pvec = crossProduct(&direction, &edge2);            // produit vectoriel
                    
                        // Is ray and triangle parrallel ?
                        float det = dotProduct(&edge1, &pvec);
                        if (det == 0.f)
                        {
                            return ;
                        }
                        float inv_det = 1.f / det;
                    
                        // Distance origin t0
                        CDPoint tvec = soustraction(&pointOrigine, &triangle.triangle_[0]);
                    
                        //Calculate u parameter and test bound
                        float u = (dotProduct(&tvec, &pvec)) * inv_det;
                        //The intersection lies outside of the triangle
                        if (u < -MIN_LONGUEUR_RT
                            || u > 1.f + MIN_LONGUEUR_RT)
                        {
                            return ;
                        }
                    
                        u = max(u, 0.f);
                    
                        //Prepare to test v parameter
                        CDPoint qvec = crossProduct(&tvec, &edge1);
                    
                        //Calculate V parameter and test bound
                        float v = dotProduct(&direction, &qvec) * inv_det;
                    
                        //The intersection lies outside of the triangle
                        if (v < -MIN_LONGUEUR_RT
                            || u + v  > 1.f + MIN_LONGUEUR_RT)
                        {
                            return ;
                        }
                    
                        distance = dotProduct(&edge2, &qvec) * inv_det;
                    
                        if (distance > -MIN_LONGUEUR_RT)
                        {
                            distance = max(distance, 0.f);
                    
                            // atomic incrementation of nbInter
                    #ifndef _DEBUG
                            unsigned int idxInter = atomic_inc(nbInter);
                    #else   
                            //To debug with CodeXL that doesn't manage atomic operation...
                            unsigned int idxInter = *nbInter;
                            ++*nbInter;
                    #endif
                    
                            pResults[idxInter].distance_ = distance;
                    
                            // Intersection
                            CDPoint vDir = homotetie(&direction, distance);
                            pResults[idxInter].pos_ = addition(&pointOrigine, &vDir);
                    
                            //Intersection way (inside/outside)
                            pResults[idxInter].sensNormale_ = dotProduct(&triangle.normal_, &direction) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;
                            pResults[idxInter].idTriangle_ = gid;
                        }
                    
                    }
                    
                    

                     

                    I kept "CDPoint" because my c++ code uses it, but i wasn't aware it isn't optimized at all, the next step is to change them by float3.

                    The code upper is functionnal and give me the good intersections, but i am not sure it is really thread safe...

                     

                    Thank for your reply and advices !

                     

                    Edit :

                    My GPU is a Radeon 4050 (Cedar)

                    AMD SDK is 2.9

                    Catalyst drivers version is 2014.041.2226.38446

                    OS is Win7 SP1 64bits