plap

gradient vector field...

Discussion created by plap on Feb 9, 2010
Latest reply on Feb 14, 2010 by plap
anyone done it?

I am trying to calculate the gradient vector field (GVF) in 3D, but I am having little luck making my code run well on the GPU (4870). Currently it runs about 6 times faster on the CPU (core i7 920) than on the gpu. I was hoping to get it turned around. :-P  There is probably a good reason for that. GVF is very computational intensive with O(n^2) complexity (n=number of voxels)...

sizeX, Y and Z can be up to 512 each(!) but my test volume is 'only' 67^3 (for GPU run, about 25^3 is max before BSOD/VPU recovery occurs).

My question is: Seeing as GVF needs to access _all_ input voxels for each output voxel, is this really a jobs that the GPU is good for? I am thinking about memory latency that may be the killer here.

Or is this just a question of understanding the memory models better?

/* extract from the initializer: */ /* buffers created like this */ inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uchar ) * sizeX*sizeY*sizeZ, input3d, &err); checkErr(err, "clCreateBuffer:in"); outH = malloc(sizeX*sizeY*sizeZ*sizeof(cl_float4)); outCL = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeX*sizeY*sizeZ*sizeof(cl_float4), outH, &err); checkErr(err, "clCreateBuffer:out"); /* kernel setup */ global_work_size[0] = sizeX; global_work_size[1] = sizeY; global_work_size[2] = sizeZ; local_work_size[0] = 1; local_work_size[1] = 1; local_work_size[2] = 1; err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &event); checkErr(err, "clEnqueueNDRangeKernel"); clWaitForEvents(1, &event); /* * kernel */ __kernel void gvf(__constant uchar* inputImage, __global float4* outputImage) { int x = get_global_id(0); int y = get_global_id(1); int z = get_global_id(2); uint width = get_global_size(0); uint height = get_global_size(1); uint depth = get_global_size(2); float sumGX = 0.0; float sumGY = 0; /**/ float sumGZ = 0; /**/ float distX, distX2; float distY, distY2; float distZ, distZ2; float dist; uint idx; float4 sumG = 0.0f; float voxel,len; float4 dir; for(uint i = 0; i < width; ++i) { distX = (float)((int)i-(int)x); distX2 = distX * distX; for(uint j = 0; j < height; ++j) { distY = (float)((int)j-(int)y); distY2 = distY * distY; for(uint k = 0; k < depth; ++k) { distZ = (float)((int)k-(int)z); distZ2 = distZ * distZ; idx = i + j*width + k*width*height; voxel=(float)inputImage[idx]; dist=(distX2+distY2+distZ2); dir = (float4)(distX, distY, distZ, 0.0f); sumG+=(dist>0.0f)?voxel*dir/sqrt((dir.x*dir.x+dir.y*dir.y+dir.z*dir.z)*dist):(float4)(0.0f, 0.0f, 0.0f, 0.0f); } } } outputImage[x + y*width + z*width*height]=sumG; }

Outcomes