Hello. I'm programming a mass-spring system with OCL. It uses tetrahedrons, with four mass-points connected by springs. Every tetrahedron calculates its spring force and a volume conservation force for every masspoint connected to it. Since tetrahedrons share masspoints, forces need to be accumlated. I'm using a force texture for that where every tetrahedron writes its forces at a precalculated postion. Then every masspoint sums up its column in the texture where its forces from all connected tetrahedron are stored. So far so good. The algorithm works but the performance is not exactly what i expected. Has anyone suggestions how to make the following code faster? And I like to get rid of the texture since the the size is limited to 8192 on the radeon 5870 and so is the maximum of the masspoints.

typedef struct { float4 rl1, rl2; float m_v0, stiffnes_v, stiffnes_e, c; int4 mps, mpC; } tetra ; typedef struct { float4 mpPos, m_x0, m_f_ext, m_f_con, m_f_pen, velocity; int4 store; } masspoint; typedef struct { float x,y,z; float nx, ny, nz; } vertex; __kernel void force_calc(__global masspoint* _massPoints, __global tetra* _tetra, __write_only image2d_t image) { int id = get_global_id(0); __global tetra *t = &_tetra[id]; //get masspoint indices int mpid1 = t->mps.x; int mpid2 = t->mps.y; int mpid3 = t->mps.z; int mpid4 = t->mps.w; //get masspoints __global masspoint *a = &_massPoints[mpid1]; __global masspoint *b = &_massPoints[mpid2]; __global masspoint *c = &_massPoints[mpid3]; __global masspoint *d = &_massPoints[mpid4]; //calculate edges float4 ac, ad, bc, bd, cd; ac = c->mpPos - a->mpPos; ad = d->mpPos - a->mpPos; bc = c->mpPos - b->mpPos; bd = d->mpPos - b->mpPos; cd = d->mpPos - c->mpPos; //calculate edge forces float4 fe01, fe02, fe03, fe12, fe13, fe23; fe01 = (t->stiffnes_e * ((length(b->mpPos - a->mpPos) - t->rl1.x) / length(b->mpPos - a->mpPos)) * (b->mpPos - a->mpPos)); fe02 = (t->stiffnes_e * ((length(ac) - t->rl1.y) / length(ac)) * ac); fe03 = (t->stiffnes_e * ((length(ad) - t->rl1.z) / length(ad)) * ad); fe12 = (t->stiffnes_e * ((length(bc) - t->rl1.w) / length(bc)) * bc); fe13 = (t->stiffnes_e * ((length(bd) - t->rl2.x) / length(bd)) * bd); fe23 = (t->stiffnes_e * ((length(cd) - t->rl2.y) / length(cd)) * cd); fe01.w = 0.0f; fe02.w = 0.0f; fe03.w = 0.0f; fe12.w = 0.0f; fe13.w = 0.0f; fe23.w = 0.0f; //calculate actual volume float v = (dot((cross((b->mpPos - a->mpPos), (c->mpPos - a->mpPos))), (d->mpPos - a->mpPos)))/6.0f; //calculate normals float4 n1, n2, n3, n4; n1 = cross(c->mpPos - a->mpPos, b->mpPos - a->mpPos); n2 = cross(c->mpPos - b->mpPos, d->mpPos - b->mpPos); n3 = cross(d->mpPos - a->mpPos, c->mpPos - a->mpPos); n4 = cross(b->mpPos - a->mpPos, d->mpPos - a->mpPos); //calculate center and check if normal is outfacing float4 c1, c2, c3, c4; c1 = (a->mpPos + b->mpPos + c->mpPos)/3; c2 = (b->mpPos + c->mpPos + d->mpPos)/3; c3 = (a->mpPos + c->mpPos + d->mpPos)/3; c4 = (a->mpPos + b->mpPos + d->mpPos)/3; //rotate normals if they are "infacing" n1 = (isless(dot( (d->mpPos - c2), n1), 0.0f)) ? n1 : n1 * (-1); n2 = (isless(dot( (a->mpPos - c3), n2), 0.0f)) ? n2 : n2 * (-1); n3 = (isless(dot( (b->mpPos - c4), n3), 0.0f)) ? n3 : n3 * (-1); n4 = (isless(dot( (c->mpPos - c1), n4), 0.0f)) ? n4 : n4 * (-1); //normalize n1 /= fast_length(n1); n2 /= fast_length(n2); n3 /= fast_length(n3); n4 /= fast_length(n4); //calculate volume preservation forces float4 fv1, fv2, fv3, fv4; fv1 = t->stiffnes_v * (v - t->m_v0) * (n2); fv2 = t->stiffnes_v * (v - t->m_v0) * (n3); fv3 = t->stiffnes_v * (v - t->m_v0) * (n4); fv4 = t->stiffnes_v * (v - t->m_v0) * (n1); //write everything to the texture write_imagef(image, (int2)(mpid1, t->mpC.x-1), (fv1 + fe01 + fe02 + fe03)); write_imagef(image, (int2)(mpid2, t->mpC.y-1), (fv2 + fe12 + fe13 - fe01)); write_imagef(image, (int2)(mpid3, t->mpC.z-1), (fv3 + fe23 - fe02 - fe12)); write_imagef(image, (int2)(mpid4, t->mpC.w-1), (fv4 - fe03 - fe13 - fe23)); } __kernel void verlet_collision(__global masspoint* _massPoints, __read_only image2d_t image, __global vertex* _out) { int id = get_global_id(0); __global masspoint *mp = &_massPoints[id]; const sampler_t samplerA = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; float4 force = 0.0f; //sum up forces for masspoint at id for(int i=0; i < mp->store.y; ++i){ force += read_imagef(image, samplerA, (float2)(id,i)); } mp->m_f_pen = force; mp->m_f_pen.w = 0.0f; //verlet integration float4 mp0 = mp->mpPos; int damp = 5; float t = 0.005f; float4 forceTotal = (mp->m_f_ext + mp->m_f_pen - (damp * (mp->mpPos - mp->m_x0) / t)); mp->mpPos = (forceTotal / mp->store.x) * (t * t) + 2 * mp->mpPos - mp->m_x0; mp->m_x0 = mp0; if (mp->mpPos.y < -10.0f) mp->mpPos.y = -9.9999f; //vertex postions _out[id].x = mp->mpPos.x; _out[id].y = mp->mpPos.y; _out[id].z = mp->mpPos.z; }

Hm. No one? The Bottleneck seems to be the accumulation, which result in depend texture fetches, i think. Has someone an idea how to scatter my forces instead of using one big image with every single force in it?