
plap
Journeyman III

gradient vector field...

anyone done it?

I am trying to calculate the gradient vector field (GVF) in 3D, but I am having little luck making my code run well on the GPU (4870). Currently it runs about 6 times faster on the CPU (Core i7 920) than on the GPU. I was hoping to get that turned around. 😛 There is probably a good reason for that: GVF is very computationally intensive, with O(n^2) complexity (n = number of voxels)...
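For reference, each output voxel p accumulates (this is just the kernel below written out as a formula):

    G(p) = sum over all voxels q != p of I(q) * (q - p) / |q - p|^2

so every output voxel has to visit every input voxel, hence the O(n^2).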

sizeX, sizeY and sizeZ can be up to 512 each(!), but my test volume is 'only' 67^3 (for GPU runs, about 25^3 is the max before a BSOD/VPU recovery occurs).

My question is: seeing as GVF needs to access _all_ input voxels for each output voxel, is this really a job that the GPU is good at? I am thinking memory latency may be the killer here.

Or is this just a question of understanding the memory models better?

/* extract from the initializer: buffers created like this */
inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                             sizeof(cl_uchar) * sizeX*sizeY*sizeZ, input3d, &err);
checkErr(err, "clCreateBuffer:in");

outH  = malloc(sizeX*sizeY*sizeZ*sizeof(cl_float4));
outCL = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                       sizeX*sizeY*sizeZ*sizeof(cl_float4), outH, &err);
checkErr(err, "clCreateBuffer:out");

/* kernel setup */
global_work_size[0] = sizeX;
global_work_size[1] = sizeY;
global_work_size[2] = sizeZ;
local_work_size[0] = 1;
local_work_size[1] = 1;
local_work_size[2] = 1;
err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
                             local_work_size, 0, NULL, &event);
checkErr(err, "clEnqueueNDRangeKernel");
clWaitForEvents(1, &event);

/* kernel */
__kernel void gvf(__constant uchar* inputImage, __global float4* outputImage)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);
    uint width  = get_global_size(0);
    uint height = get_global_size(1);
    uint depth  = get_global_size(2);

    float  distX, distX2, distY, distY2, distZ, distZ2, dist;
    uint   idx;
    float4 sumG = 0.0f;
    float  voxel;
    float4 dir;

    /* every output voxel sums a weighted direction from every input voxel */
    for (uint i = 0; i < width; ++i) {
        distX  = (float)((int)i - (int)x);
        distX2 = distX * distX;
        for (uint j = 0; j < height; ++j) {
            distY  = (float)((int)j - (int)y);
            distY2 = distY * distY;
            for (uint k = 0; k < depth; ++k) {
                distZ  = (float)((int)k - (int)z);
                distZ2 = distZ * distZ;
                idx    = i + j*width + k*width*height;
                voxel  = (float)inputImage[idx];
                dist   = distX2 + distY2 + distZ2;
                dir    = (float4)(distX, distY, distZ, 0.0f);
                /* dir.x*dir.x + dir.y*dir.y + dir.z*dir.z equals dist here,
                   so this is voxel*dir/dist; the centre voxel is skipped */
                sumG += (dist > 0.0f)
                      ? voxel*dir/sqrt((dir.x*dir.x + dir.y*dir.y + dir.z*dir.z)*dist)
                      : (float4)(0.0f, 0.0f, 0.0f, 0.0f);
            }
        }
    }
    outputImage[x + y*width + z*width*height] = sumG;
}

plap
Journeyman III

Well, silly me.. I didn't try to increase the local_work_size... doing that worked wonders
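Concretely, the host-side change was just this (a sketch with the same variables as in my first post; device is the cl_device_id the context was built on, which wasn't in my extract, and the clGetKernelWorkGroupInfo call is only a sanity check):

/* optional: ask how many work-items per group this kernel supports */
size_t max_wg_size;
err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                               sizeof(max_wg_size), &max_wg_size, NULL);
checkErr(err, "clGetKernelWorkGroupInfo");

/* 4x4x4 = 64 work-items per group instead of 1x1x1; each global size
   must stay divisible by the matching local size */
local_work_size[0] = 4;
local_work_size[1] = 4;
local_work_size[2] = 4;
err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
                             local_work_size, 0, NULL, &event);
checkErr(err, "clEnqueueNDRangeKernel");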


Originally posted by: plap Well, silly me.. I didn't try to increase the local_work_size... doing that worked wonders

Read the following PDF for optimization tips:

http://developer.amd.com/gpu/ATIStreamSDK/assets/ATI_Stream_SDK_Performance_Notes.pdf

plap
Journeyman III

I tried to use local memory, but as the 4870 doesn't have any, I didn't get any speedup.

local_work_size is 4 by 4 by 4 and global_work_size is 48 by 48 by 48 during my tests (that is 12^3 = 1728 work-groups of 64 work-items each), but global_work_size will be 200 by 200 by 200 once I am certain it can't be made faster.

But if any knowledgeable person would take a quick look at the kernel and let me know what I might have missed, that would be great.

__kernel void gvf(__constant float* inputImage, __global float4* outputImage,
                  const uint depth2 /* currently unused */)
{
    __local float localBuffer[4*4*4];
    float4 sumG = 0.0f;
    float4 dir;
    int  x = get_global_id(0);
    int  y = get_global_id(1);
    int  z = get_global_id(2);
    uint width  = get_global_size(0);
    uint height = get_global_size(1);
    uint depth  = get_global_size(2);
    uint lx = get_local_id(0);
    uint ly = get_local_id(1);
    uint lz = get_local_id(2);
    float dist;
    uint  idx = 0;

    /* walk the volume in 4x4x4 tiles */
    for (int k = 0; k < depth; k += 4) {
        for (int j = 0; j < height; j += 4) {
            for (int i = 0; i < width; i += 4) {
                /* each of the 64 work-items fetches one voxel of the tile */
                localBuffer[lx + ly*4 + lz*16] =
                    inputImage[i+lx + (j+ly)*width + (k+lz)*width*height];
                barrier(CLK_LOCAL_MEM_FENCE);
                /* the subvolume i:i+3, j:j+3, k:k+3 of inputImage is now
                   cached in localBuffer */
                for (int K = 0; K < 4; K++) {
                    for (int J = 0; J < 4; J++) {
                        for (int I = 0; I < 4; I++) {
                            idx  = I + J*4 + K*16;
                            dir  = (float4)(i+I - x, j+J - y, k+K - z, 0);
                            dist = dot(dir, dir);
                            sumG += (dist > 0.0f) ? localBuffer[idx]*dir/dist
                                                  : (float4)(0.0f);
                        }
                    }
                }
                /* second barrier so no work-item overwrites the tile while
                   others are still reading it */
                barrier(CLK_LOCAL_MEM_FENCE);
            }
        }
    }
    outputImage[x + y*width + z*width*height] = sumG;
}


Are you computing the gradient from the whole volume, and not just from a local 3x3x3 neighbourhood?

plap
Journeyman III

I intend to calculate the vector field for every voxel. For each voxel, the algorithm has to access all other voxels to gather their weighted contribution to the destination voxel's final vector.

So yes, this is an algorithm that requires a lot of calculation and memory access.
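To put a number on that: at the full 200^3 volume that is 8 million output voxels, each visiting all 8 million input voxels, i.e. roughly 6.4*10^13 inner-loop iterations in total.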


There are a LOT of for loops in there; you should try unrolling them, or changing your algorithm to one with fewer control-flow instructions.
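For example, the innermost I loop of your local-memory kernel always runs exactly 4 times, so it could be unrolled by hand. Something like this (untested sketch, reusing the names from your kernel):

/* I loop unrolled: idx covers I = 0..3 at fixed J, K */
idx  = J*4 + K*16;
dir  = (float4)(i - x, j+J - y, k+K - z, 0.0f);     /* I = 0 */
dist = dot(dir, dir);
sumG += (dist > 0.0f) ? localBuffer[idx]*dir/dist   : (float4)(0.0f);

dir.x += 1.0f; dist = dot(dir, dir);                /* I = 1 */
sumG += (dist > 0.0f) ? localBuffer[idx+1]*dir/dist : (float4)(0.0f);

dir.x += 1.0f; dist = dot(dir, dir);                /* I = 2 */
sumG += (dist > 0.0f) ? localBuffer[idx+2]*dir/dist : (float4)(0.0f);

dir.x += 1.0f; dist = dot(dir, dir);                /* I = 3 */
sumG += (dist > 0.0f) ? localBuffer[idx+3]*dir/dist : (float4)(0.0f);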


Ok. Thanks. I just thought that the compiler did simple loop unrolling itself. But if that is all I can do, I guess I have reached more or less peak performance.

plap
Journeyman III

Anyone got a good idea how to avoid the check for divide by zero?

b+=(dist>0.0f)?a/dist:0;

The check is quite expensive, and for each work-item dist is zero only once (at the voxel's own position).

Once a divide-by-zero result is added to b, b is ruined. 😕

plap
Journeyman III

Originally posted by: plap Anyone got a good idea how to avoid the check for divide by zero?

b+=(dist>0.0f)?a/dist:0;

The check is quite expensive, and for each work-item dist is zero only once (at the voxel's own position).

Once a divide-by-zero result is added to b, b is ruined. 😕

I changed it to a/(dist+0.00001f). As a is multiplied by 0 when dist is 0, this works just fine, and the imprecision in the calculations is insignificant. Execution time, however, is cut in half.
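In kernel terms (same names as in my kernel above):

/* before: branchy guard against the single voxel where dist == 0 */
sumG += (dist > 0.0f) ? localBuffer[idx]*dir/dist : (float4)(0.0f);

/* after: branch-free; dir is the zero vector exactly when dist == 0,
   so the numerator is 0 and the epsilon only keeps the division finite */
sumG += localBuffer[idx]*dir/(dist + 0.00001f);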
