Archives Discussions

vlj · ‎06-04-2014

Hi,

I'm trying to port a gaussian blur fragment shader to a gaussian blur compute shader.

According to GPU Perf Studio, the original fragment shader is bandwidth-bound on an HD 7750, so such a port makes sense.

However, the new compute shader is 4 times slower than the fragment shader.

I use a local size of 8x8 with a shared memory of 48x32 vec4 (do I reduce occupancy by allocating too much LDS ?). The dispatch grid is 210x32.
According to GPU Perf Studio, I'm ALU-bound, and reducing the loop count in the kernel reduces the execution time.

This is surprising, as the computations are the same as in the original fragment shader.

Here are the fragment shaders and the compute shaders.

I tried to output 4 pixels in the compute shader but it didn't change anything.

Is there something I'm doing wrong ?

________________________

// Horizontal Gaussian blur: accumulates the centre texel plus 8 taps on
// each side along X, then writes the weighted sum to FragColor.
uniform sampler2D tex;
// Scale applied to gl_FragCoord to obtain texture coordinates; also used as
// the per-tap step along X (presumably 1 / render-target size -- confirm
// against the caller that sets it).
uniform vec2 pixel;
uniform float sigma = 5.;

out vec4 FragColor;

void main()
{
    vec2 center = gl_FragCoord.xy * pixel;

    // Incremental Gaussian evaluation: `weight` is the coefficient of the
    // current tap, `ratio` is the factor that advances it to the next tap,
    // and `ratioStep` advances `ratio` itself. This avoids calling exp()
    // once per tap.
    float weight = 1.0 / (sqrt(2.0 * 3.14) * sigma);
    float ratio = exp(-0.5 / (sigma * sigma));
    float ratioStep = ratio * ratio;

    // Centre tap.
    vec4 accum = texture(tex, vec2(center.x, center.y)) * weight;
    weight *= ratio;
    ratio *= ratioStep;

    // Symmetric taps at distance 1..8 texels to the left and to the right.
    for (int tap = 1; tap < 9; tap++) {
        float dx = tap * pixel.x;
        accum += texture(tex, vec2(center.x - dx, center.y)) * weight;
        accum += texture(tex, vec2(center.x + dx, center.y)) * weight;
        weight *= ratio;
        ratio *= ratioStep;
    }

    FragColor = accum;
}

________________________

// Debug Name:
/*------------------- Shader 141 -------------------*/
#version 430
//C:\Users\vljn_000\Documents\GitHub\stk-code\bld\bin\Release/../../../data/shaders/gaussian.comp
#define VSLayer

// Horizontal Gaussian blur as a compute shader.
// Each 8x8 work group stages a 24x32 texel tile of `source` in shared memory
// (8 centre columns plus an 8-texel apron on each side; every invocation
// handles 4 rows), then each invocation blurs its 4 texels from that tile
// and stores them to `dest`.
uniform layout(size1x16) restrict readonly image2D source;
uniform layout(size1x16) volatile restrict writeonly image2D dest;
uniform vec2 pixel;
uniform float sigma = 5.;

layout (local_size_x = 8, local_size_y = 8) in;

// Indexed as [column][row]. Columns 0..7 are the left apron, 8..15 the
// texels owned by this work group, 16..23 the right apron.
shared vec4 local_src[8 + 2 * 8][32];

// Number of taps on each side of the centre texel. Must not exceed the apron
// width (8), otherwise the reads below would leave the shared tile. Matches
// the `i < 9` loop of the fragment-shader version of this blur.
const int BLUR_RADIUS = 8;

void main()
{
    int x = int(gl_LocalInvocationID.x), y = int(gl_LocalInvocationID.y);

    // Stage the tile: for each of this invocation's 4 rows, fetch the texel
    // 8 to the left, the texel itself, and the texel 8 to the right.
    for (int i = 0; i < 4; i++)
    {
        ivec2 uv = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y * 4 + i);
        int row = y + i * 8;
        local_src[x][row] = imageLoad(source, uv - ivec2(8, 0));
        local_src[x + 8][row] = imageLoad(source, uv);
        local_src[x + 16][row] = imageLoad(source, uv + ivec2(8, 0));
    }

    // Every invocation reads columns written by its neighbours, so the whole
    // work group must have finished staging before any blur starts.
    barrier();

    // The starting coefficients depend only on `sigma`; compute them once
    // instead of once per row (sqrt/exp are comparatively expensive).
    float g0_start = 1.0 / (sqrt(2.0 * 3.14) * sigma);
    float g1_start = exp(-0.5 / (sigma * sigma));
    float g2 = g1_start * g1_start;

    for (int i = 0; i < 4; i++)
    {
        int row = y + i * 8;

        // Incremental Gaussian evaluation: g0 is the weight of the current
        // tap, g1 advances g0 to the next tap, g2 advances g1.
        float g0 = g0_start;
        float g1 = g1_start;

        vec4 sum = local_src[x + 8][row] * g0;
        g0 *= g1;
        g1 *= g2;
        for (int j = 1; j <= BLUR_RADIUS; j++) {
            sum += local_src[8 + x - j][row] * g0;
            sum += local_src[8 + x + j][row] * g0;
            g0 *= g1;
            g1 *= g2;
        }
        ivec2 uv = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y * 4 + i);
        imageStore(dest, uv, sum);
    }
}

________________________

Vincent

Archives Discussions

[OpenGL 4.3] Compute Shader 4 times slower than equivalent fragment shader