Hi,
I am working on a kernel that find intersections between ray and a triangle list, but (there is always a "but" ) i got some trouble using my opencl compiler indeed it crashes when I try to compile it.
I try to compile it on my CPU compiler and it compile well, but with my GPU compiler it crashes...
//-----------------------------------------------------------------------------
//---------------------------------DEFINES-------------------------------------
//-----------------------------------------------------------------------------
#define RAYON_SORTANT -1000
#define RAYON_ENTRANT 1000
#define MIN_LONGUEUR_RT 1.E-6f
//-----------------------------------------------------------------------------
//---------------------------------CONTENT-------------------------------------
//-----------------------------------------------------------------------------
typedef struct s_CDPoint
{
float x;
float y;
float z;
} CDPoint;
typedef struct s_TTriangle
{
CDPoint triangle_[3];
CDPoint normal_;
} TTriangle;
typedef struct s_GridIntersection
{
CDPoint pos_;
float distance_;
int sensNormale_;
unsigned int idTriangle_;
} TGridIntersection;
//-----------------------------------------------------------------------------
//---------------------------------MUTEX---------------------------------------
//-----------------------------------------------------------------------------
void GetSemaphor(__global int * semaphor)
{
int occupied = atomic_xchg(semaphor, 1);
while(occupied > 0)
{
occupied = atomic_xchg(semaphor, 1);
}
}
void ReleaseSemaphor(__global int * semaphor)
{
int prevVal = atomic_xchg(semaphor, 0);
}
//-----------------------------------------------------------------------------
//---------------------------------GEOMETRIE-----------------------------------
//-----------------------------------------------------------------------------
float dotProduct(const CDPoint* pA, const CDPoint* pB)
{
return (pA->x * pB->x + pA->y * pB->y + pA->z * pB->z);
}
CDPoint crossProduct(const CDPoint* pA, const CDPoint* pB)
{
CDPoint res;
res.x = pA->y * pB->z - pB->y * pA->z;
res.y = pA->z * pB->x - pB->z * pA->x;
res.z = pA->x * pB->y - pB->x * pA->y;
return res;
}
CDPoint soustraction(const CDPoint* pA, const CDPoint* pB)
{
CDPoint res;
res.x = pA->x - pB->x;
res.y = pA->y - pB->y;
res.z = pA->z - pB->z;
return res;
}
CDPoint addition(const CDPoint* pA, const CDPoint* pB)
{
CDPoint res;
res.x = pA->x + pB->x;
res.y = pA->y + pB->y;
res.z = pA->z + pB->z;
return res;
}
CDPoint homothetie(const CDPoint* pA, float val)
{
CDPoint pnt;
pnt.x = pA->x * val;
pnt.y = pA->y * val;
pnt.z = pA->z * val;
return pnt;
}
//-----------------------------------------------------------------------------
//---------------------------------KERNEL--------------------------------------
//-----------------------------------------------------------------------------
__kernel void IntersectionTriangle( __global const TTriangle* pTriangleListe,
const unsigned int idxDebutTriangle,
const unsigned int idxFin,
__constant const CDPoint* pPointOrigine,
__constant const CDPoint* pDir,
__global int *nbInter,
__global TGridIntersection* pResults )
{
__private unsigned int index = get_global_id(0) + idxDebutTriangle;
if (index > idxFin) return;
__global const TTriangle *pTriangle = &pTriangleListe[index];
__private float distance = 0.f;
// Côté du triangle et normale au plan
__private CDPoint edge1 = soustraction(&pTriangle->triangle_[1], &pTriangle->triangle_[0]);
__private CDPoint edge2 = soustraction(&pTriangle->triangle_[2], &pTriangle->triangle_[0]);
__private CDPoint pvec = crossProduct(pDir, &edge2); // produit vectoriel
// Le rayon et le triangle sont il parallèle ?
__private float det = dotProduct(&edge1, &pvec);
if (det == 0.f)
{
return ;
}
__private float inv_det = 1.f / det;
// Distance origin t0
__private CDPoint tvec = soustraction(pPointOrigine, &pTriangle->triangle_[0]);
//Calculate u parameter and test bound
__private float u = (dotProduct(&tvec, &pvec)) * inv_det;
//The intersection lies outside of the triangle
if (u < -MIN_LONGUEUR_RT
|| u > 1.f + MIN_LONGUEUR_RT)
{
return ;
}
u = max(u, 0.f);
//Prepare to test v parameter
__private CDPoint qvec = crossProduct(&tvec, &edge1);
//Calculate V parameter and test bound
__private float v = dotProduct(pDir, &qvec) * inv_det;
//The intersection lies outside of the triangle
if (v < -MIN_LONGUEUR_RT
|| u + v > 1.f + MIN_LONGUEUR_RT)
{
return ;
}
// Get distance
distance = dotProduct(&edge2, &qvec) * inv_det;
if (distance > -MIN_LONGUEUR_RT)
{
// We are using nbInter as semaphor index
GetSemaphor(nbInter);
__private int idxInter = *nbInter;
pResults[idxInter].distance_ = max(distance, 0.f);
// Intersection point
__private CDPoint vDir = homothetie(pDir, distance);
pResults[idxInter].pos_ = addition(pPointOrigine, &vDir);
// Get ray way
pResults[idxInter].sensNormale_ = dotProduct(&pTriangle->normal_, pDir) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;
// Triangle id
pResults[idxInter].idTriangle_ = index - idxDebutTriangle;
// inc nb inter
*nbInter = *nbInter + 1;
ReleaseSemaphor(nbInter);
}
}
I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want !
Here it is my openCL computer configuration :
Platform [0] id = 5339E7D8 profile = FULL_PROFILE version = OpenCL 1.2 AMD-APP (1445.5) name = AMD Accelerated Parallel Processing vendor = Advanced Micro Devices, Inc. extensions = cl_khr_icd cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_dx9_media_sharing cl_amd_event_callback cl_amd_offline_devices cl_amd_hsa
2 Devices detected Device [0] id = 010DFA00 type = CL_DEVICE_TYPE_GPU name = Cedar vendor = Advanced Micro Devices, Inc. driver version = 1445.5 (VM) device version = OpenCL 1.2 AMD-APP (1445.5) profile = FULL_PROFILE max compute units = 2 max work items dimensions = 3 max work item sizes = 128 / 128 / 128 max work group size = 128 max clock frequency = 650 MHz address_bits = 32 max mem alloc size = 512 MB global mem size = 1024 MB image support = CL_TRUE max read image args = 128 max write image args = 8 2D image max size = 16384 x 16384 3D image max size = 2048 x 2048 x 2048 max samplers = 16 max parameter size = 1024 mem base addr align = 2048 min data type align size = 128 single fp config = CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA global mem cache type = CL_NONE max constant buffer size = 64 KB max constant args = 8 local mem type = CL_LOCAL local mem size = 32 KB error correction support = CL_FALSE profiling timer resolution = 1 ns endian little = CL_TRUE available = CL_TRUE compiler available = CL_TRUE execution capabilities = CL_EXEC_KERNEL queue properties = CL_QUEUE_PROFILING_ENABLE extensions = cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_atomic_counters_32 cl_amd_device_attribute_query cl_amd_vec3 cl_amd_printf cl_amd_media_ops cl_amd_media_ops2 cl_amd_popcnt cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_dx9_media_sharing cl_amd_image2d_from_buffer_read_only cl_khr_spir cl_khr_gl_event
Device [1] id = 03501CD0 type = CL_DEVICE_TYPE_CPU name = Intel(R) Core(TM) i3-2130 CPU @ 3.40GHz vendor = GenuineIntel driver version = 1445.5 (sse2,avx) device version = OpenCL 1.2 AMD-APP (1445.5) profile = FULL_PROFILE max compute units = 4 max work items dimensions = 3 max work item sizes = 1024 / 1024 / 1024 max work group size = 1024 max clock frequency = 3392 MHz address_bits = 32 max mem alloc size = 1024 MB global mem size = 2048 MB image support = CL_TRUE max read image args = 128 max write image args = 8 2D image max size = 8192 x 8192 3D image max size = 2048 x 2048 x 2048 max samplers = 16 max parameter size = 4096 mem base addr align = 1024 min data type align size = 128 single fp config = CL_FP_DENORM CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA global mem cache type = CL_READ_WRITE_CACHE global mem cacheline size = 64 global mem cache size = 32768 max constant buffer size = 64 KB max constant args = 8 local mem type = CL_GLOBAL local mem size = 32 KB error correction support = CL_FALSE profiling timer resolution = 301 ns endian little = CL_TRUE available = CL_TRUE compiler available = CL_TRUE execution capabilities = CL_EXEC_KERNEL CL_EXEC_NATIVE_KERNEL queue properties = CL_QUEUE_PROFILING_ENABLE extensions = cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_device_fission cl_amd_device_attribute_query cl_amd_vec3 cl_amd_printf cl_amd_media_ops cl_amd_media_ops2 cl_amd_popcnt cl_khr_d3d10_sharing cl_khr_spir cl_amd_svm cl_khr_gl_event
I hope you will be able to help me, I am on this issue since days and i don't see what i am doing wrong !
Thank you and sorry about my poor english
Solved! Go to Solution.
You don't need the two underscores anymore. Drop them for improved readability.
You don't need private, it's default.
You should NOT define your own struct to replace vec3. This only makes the compiler have more work to do, eventually disabling some optimizations and preventing you from using intrinsic instructions. Feel free to pass vectors by value: most things in OpenCL-C get inlined anyway.
You probably think those two lines to be equivalent:
CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);
CDPoint soustraction(global const CDPoint* pA, global const CDPoint* pB);
They are not!
Unfortunately, when not specified, the default address space for pointers is private.
At LN120 you put there a global pointer. The compiler has a quirk and crashes (most of the time) instead of just giving you a warning.
You don't need the two underscores anymore. Drop them for improved readability.
You don't need private, it's default.
You should NOT define your own struct to replace vec3. This only makes the compiler have more work to do, eventually disabling some optimizations and preventing you from using intrinsic instructions. Feel free to pass vectors by value: most things in OpenCL-C get inlined anyway.
You probably think those two lines to be equivalent:
CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);
CDPoint soustraction(global const CDPoint* pA, global const CDPoint* pB);
They are not!
Unfortunately, when not specified, the default address space for pointers is private.
At LN120 you put there a global pointer. The compiler has a quirk and crashes (most of the time) instead of just giving you a warning.
You told me to use float3 for vector, but I notice that sizeof(cl_float3) = 16 (Instead of 12 expected), there is an useless loss of 4 bytes per vertex... Sometime I work with huge model with lot of triangle, Is there a mean to optimize it ?
If you read the documentation, it's clearly noted that vec3 is basically syntactic sugar with no difference from vec4.
BTW, I didn't told you to use vec3. I told you to not use three xyz values to replace what's essentially a vec3.
If you have a lot, a lot, a lot of memory to work on, you might have to pack/unpack those yourself. We're talking about a few million vertices at the very least.
As for the register usage, don't worry about it until your VGPRs start to grow too much.
Never optimize prematurely problems that exist in your head... only exist in your head! Profile, measure, iterate.
Hi,
Thanks for reporting this.
I notice that if I change "__global const TTriangle* pTriangleListe" by "const TTriangle* pTriangleListe" it compiles but it is not the code i want
Don't understand how changing a pointer from global to private data-space solves the problem. Please can you share a sample code base (Host+Device) that manifests the problem and also, let us know your system setup like GPU, SDK, driver, OS etc.?
Regards,
The kernel code make crash compiler, I use CodeXL compiler and it crashes (my program crashes also). It's difficult for me to give you the "host" code because my firm isn't agree.
But maxdz8 is right, the problem come from "CDPoint soustraction(const CDPoint* pA, const CDPoint* pB);", to solve it I give to kernel direct type for orgin and direction, i also made a copy into local memory of triangle to have all my points in local memory and let my function prototype with "const CDPoint*..."
__kernel
void IntersectionTriangle( __global const TTriangle* pTriangleListe,
const unsigned int idxDebutTriangle,
const unsigned int idxFin,
const CDPoint pointOrigine,
const CDPoint direction,
__global unsigned int *nbInter,
__global TGridIntersection* pResults )
{
const unsigned int gid = get_global_id(0);
const unsigned int index = gid + idxDebutTriangle;
if (index >= idxFin) return;
__global const TTriangle *pGlobalTriangle = &pTriangleListe[index];
float distance = 0.f;
// Copy into local memory
TTriangle triangle = *pGlobalTriangle;
// Triangle side and plane normal
CDPoint edge1 = soustraction(&triangle.triangle_[1], &triangle.triangle_[0]);
CDPoint edge2 = soustraction(&triangle.triangle_[2], &triangle.triangle_[0]);
CDPoint pvec = crossProduct(&direction, &edge2); // produit vectoriel
// Is ray and triangle parrallel ?
float det = dotProduct(&edge1, &pvec);
if (det == 0.f)
{
return ;
}
float inv_det = 1.f / det;
// Distance origin t0
CDPoint tvec = soustraction(&pointOrigine, &triangle.triangle_[0]);
//Calculate u parameter and test bound
float u = (dotProduct(&tvec, &pvec)) * inv_det;
//The intersection lies outside of the triangle
if (u < -MIN_LONGUEUR_RT
|| u > 1.f + MIN_LONGUEUR_RT)
{
return ;
}
u = max(u, 0.f);
//Prepare to test v parameter
CDPoint qvec = crossProduct(&tvec, &edge1);
//Calculate V parameter and test bound
float v = dotProduct(&direction, &qvec) * inv_det;
//The intersection lies outside of the triangle
if (v < -MIN_LONGUEUR_RT
|| u + v > 1.f + MIN_LONGUEUR_RT)
{
return ;
}
distance = dotProduct(&edge2, &qvec) * inv_det;
if (distance > -MIN_LONGUEUR_RT)
{
distance = max(distance, 0.f);
// atomic incrementation of nbInter
#ifndef _DEBUG
unsigned int idxInter = atomic_inc(nbInter);
#else
//To debug with CodeXL that doesn't manage atomic operation...
unsigned int idxInter = *nbInter;
++*nbInter;
#endif
pResults[idxInter].distance_ = distance;
// Intersection
CDPoint vDir = homotetie(&direction, distance);
pResults[idxInter].pos_ = addition(&pointOrigine, &vDir);
//Intersection way (inside/outside)
pResults[idxInter].sensNormale_ = dotProduct(&triangle.normal_, &direction) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;
pResults[idxInter].idTriangle_ = gid;
}
}
I kept "CDPoint" because my c++ code uses it, but i wasn't aware it isn't optimized at all, the next step is to change them by float3.
The code upper is functionnal and give me the good intersections, but i am not sure it is really thread safe...
Thank for your reply and advices !
Edit :
My GPU is a Radeon 4050 (Cedar)
AMD SDK is 2.9
Catalyst drivers version is 2014.041.2226.38446
OS is Win7 SP1 64bits