3 Replies Latest reply on Sep 27, 2010 2:17 PM by LeeHowes

    physics simulation kernel optimization

    chevydevil

       Hello. I'm programming a mass-spring system with OCL. It uses tetrahedrons, with four mass-points connected by springs. Every tetrahedron calculates its spring force and a volume conservation force for every masspoint connected to it. Since tetrahedrons share masspoints, forces need to be accumlated. I'm using a force texture for that where every tetrahedron writes its forces at a precalculated postion. Then every masspoint sums up its column in the texture where its forces from all connected tetrahedron are stored. So far so good. The algorithm works but the performance is not exactly what i expected. Has anyone suggestions how to make the following code faster? And I like to get rid of the texture since the the size is limited to 8192 on the radeon 5870 and so is the maximum of the masspoints.

      typedef struct { float4 rl1, rl2; float m_v0, stiffnes_v, stiffnes_e, c; int4 mps, mpC; } tetra ; typedef struct { float4 mpPos, m_x0, m_f_ext, m_f_con, m_f_pen, velocity; int4 store; } masspoint; typedef struct { float x,y,z; float nx, ny, nz; } vertex; __kernel void force_calc(__global masspoint* _massPoints, __global tetra* _tetra, __write_only image2d_t image) { int id = get_global_id(0); __global tetra *t = &_tetra[id]; //get masspoint indices int mpid1 = t->mps.x; int mpid2 = t->mps.y; int mpid3 = t->mps.z; int mpid4 = t->mps.w; //get masspoints __global masspoint *a = &_massPoints[mpid1]; __global masspoint *b = &_massPoints[mpid2]; __global masspoint *c = &_massPoints[mpid3]; __global masspoint *d = &_massPoints[mpid4]; //calculate edges float4 ac, ad, bc, bd, cd; ac = c->mpPos - a->mpPos; ad = d->mpPos - a->mpPos; bc = c->mpPos - b->mpPos; bd = d->mpPos - b->mpPos; cd = d->mpPos - c->mpPos; //calculate edge forces float4 fe01, fe02, fe03, fe12, fe13, fe23; fe01 = (t->stiffnes_e * ((length(b->mpPos - a->mpPos) - t->rl1.x) / length(b->mpPos - a->mpPos)) * (b->mpPos - a->mpPos)); fe02 = (t->stiffnes_e * ((length(ac) - t->rl1.y) / length(ac)) * ac); fe03 = (t->stiffnes_e * ((length(ad) - t->rl1.z) / length(ad)) * ad); fe12 = (t->stiffnes_e * ((length(bc) - t->rl1.w) / length(bc)) * bc); fe13 = (t->stiffnes_e * ((length(bd) - t->rl2.x) / length(bd)) * bd); fe23 = (t->stiffnes_e * ((length(cd) - t->rl2.y) / length(cd)) * cd); fe01.w = 0.0f; fe02.w = 0.0f; fe03.w = 0.0f; fe12.w = 0.0f; fe13.w = 0.0f; fe23.w = 0.0f; //calculate actual volume float v = (dot((cross((b->mpPos - a->mpPos), (c->mpPos - a->mpPos))), (d->mpPos - a->mpPos)))/6.0f; //calculate normals float4 n1, n2, n3, n4; n1 = cross(c->mpPos - a->mpPos, b->mpPos - a->mpPos); n2 = cross(c->mpPos - b->mpPos, d->mpPos - b->mpPos); n3 = cross(d->mpPos - a->mpPos, c->mpPos - a->mpPos); n4 = cross(b->mpPos - a->mpPos, d->mpPos - a->mpPos); //calculate center and check if normal is outfacing float4 c1, c2, c3, c4; c1 = (a->mpPos + b->mpPos + c->mpPos)/3; c2 = (b->mpPos + c->mpPos + d->mpPos)/3; c3 = (a->mpPos + c->mpPos + d->mpPos)/3; c4 = (a->mpPos + b->mpPos + d->mpPos)/3; //rotate normals if they are "infacing" n1 = (isless(dot( (d->mpPos - c2), n1), 0.0f)) ? n1 : n1 * (-1); n2 = (isless(dot( (a->mpPos - c3), n2), 0.0f)) ? n2 : n2 * (-1); n3 = (isless(dot( (b->mpPos - c4), n3), 0.0f)) ? n3 : n3 * (-1); n4 = (isless(dot( (c->mpPos - c1), n4), 0.0f)) ? n4 : n4 * (-1); //normalize n1 /= fast_length(n1); n2 /= fast_length(n2); n3 /= fast_length(n3); n4 /= fast_length(n4); //calculate volume preservation forces float4 fv1, fv2, fv3, fv4; fv1 = t->stiffnes_v * (v - t->m_v0) * (n2); fv2 = t->stiffnes_v * (v - t->m_v0) * (n3); fv3 = t->stiffnes_v * (v - t->m_v0) * (n4); fv4 = t->stiffnes_v * (v - t->m_v0) * (n1); //write everything to the texture write_imagef(image, (int2)(mpid1, t->mpC.x-1), (fv1 + fe01 + fe02 + fe03)); write_imagef(image, (int2)(mpid2, t->mpC.y-1), (fv2 + fe12 + fe13 - fe01)); write_imagef(image, (int2)(mpid3, t->mpC.z-1), (fv3 + fe23 - fe02 - fe12)); write_imagef(image, (int2)(mpid4, t->mpC.w-1), (fv4 - fe03 - fe13 - fe23)); } __kernel void verlet_collision(__global masspoint* _massPoints, __read_only image2d_t image, __global vertex* _out) { int id = get_global_id(0); __global masspoint *mp = &_massPoints[id]; const sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; float4 force = 0.0f; //sum up forces for masspoint at id for(int i=0; i < mp->store.y; ++i){ force += read_imagef(image, samplerA, (float2)(id,i)); } mp->m_f_pen = force; mp->m_f_pen.w = 0.0f; //verlet integration float4 mp0 = mp->mpPos; int damp = 5; float t = 0.005f; float4 forceTotal = (mp->m_f_ext + mp->m_f_pen - (damp * (mp->mpPos - mp->m_x0) / t)); mp->mpPos = (forceTotal / mp->store.x) * (t * t) + 2 * mp->mpPos - mp->m_x0; mp->m_x0 = mp0; if (mp->mpPos.y < -10.0f) mp->mpPos.y = -9.9999f; //vertex postions _out[id].x = mp->mpPos.x; _out[id].y = mp->mpPos.y; _out[id].z = mp->mpPos.z; }

        • physics simulation kernel optimization
          chevydevil

          Hm. No one? The Bottleneck seems to be the accumulation, which result in depend texture fetches, i think. Has someone an idea how to scatter my forces instead of using one big image with every single force in it?

            • physics simulation kernel optimization
              karbous

              My guess is: Try to use __local memory instead of __global for variables which you read more times. e.g b->mpPos, c->mpPos... Think about how the treads are accessing global memory and try to make reads more coalesce (put the input data into straight arrays without structs, just to optimize the reads...)

              Profile your code with OpenCL profiler and see more what happens and where are the bottlenecks.

               

              • physics simulation kernel optimization
                LeeHowes

                The bottleneck "seems to be" the accumulation? Are you saying that second kernel takes the majority of execution time?

                You could scatter your forces, but you'd have to use atomic compare and exchange with loops, or possibly do your accumulation in fixed precision (that might work in a lot of circumstances) so you can use integer atomics.

                I don't think I'd use an image there at all, you probably won't get any caching benefit. You've already assumed that a dimension of the image has a maximum that reaches the maximum number of tets for a given mass in one dimension. Lay an array out in the same way - you only read it once, and you can read it in a perfectly structured fashion as you do with the image.

                If your performance is far short of reaching peak memory bandwidth you are probably clause bound in the hardware because the loop can't be unrolled. Try unrolling by some factor that should give you reasonable efficiency over the mesh and just always load a sequence of addresses. Then either do a small test on the force counter value and do conditional adds or make sure the dataset is full of 0s so you can just add it in anyway and not worry about it. The gain from unrolling may well be greater than the loss from junk loads in those cases where the entire wavefront is reading useless data.

                 

                Also check that the compiler is pushing these values through correctly:

                {
                   mp->m_f_pen = force;   
                ...
                   float4 forceTotal = (mp->m_f_ext + mp->m_f_pen - (damp * (mp->mpPos - mp->m_x0) / t));

                }

                It's possible that thanks to the pointer indirection the compiler won't catch that m_f_pen is reused so it need not do another (really inefficient strided) memory load rather than just use the value that's in a register. Using force in that case rather than m_f_pen might help if that's the case. The same applies to mpPos. I hope the compiler would catch that but you never know with llvm.