I guess that if you use row-major ordering the algorithm is correct, but if you're using the float4 data type you have to index the buffers in units of float4 elements (not individual floats). Maybe this will work?
/*
 * Element-wise sum of two float4 buffers: c[k] = a[k] + b[k].
 *
 * Written for a 2D NDRange in which each work-item handles exactly one
 * float4 element. Dimension 0 is treated as the column (fastest-varying)
 * index and dimension 1 as the row index, so the linear element index is
 *
 *     row * get_global_size(0) + col
 *
 * Striding the row by the dimension-0 size gives every work-item a unique
 * index even when the NDRange is not square.
 *
 * c: output buffer, one float4 per work-item.
 * a: first input buffer (read-only).
 * b: second input buffer (read-only).
 *
 * NOTE(review): assumes the kernel is enqueued with a zero global work
 * offset and that all three buffers hold at least
 * get_global_size(0) * get_global_size(1) float4 elements -- confirm
 * against the host code.
 */
__kernel void add(__global float4 *c,
                  __global const float4 *a,
                  __global const float4 *b)
{
    const size_t col = get_global_id(0);
    const size_t row = get_global_id(1);
    const size_t width = get_global_size(0);
    const size_t idx = row * width + col;

    /* float4 addition is component-wise: x, y, z and w are summed at once. */
    c[idx] = a[idx] + b[idx];
}