I will just cross post from stackoverflow:

I am trying to parallel a classic map-reduce problem (which can parallel well with MPI) with OpenCL, namely, the AMD implementation. But the result bothers me.

Let me brief about the problem first. There are two type of data that flow into the system: the feature set (30 parameters for each) and the sample set (9000+ dimensions for each). It is a classic map-reduce problem in the sense that I need to calculate the score of every feature on every sample (Map). And then, sum up the overall score for every feature (Reduce). There are around 10k features and 30k samples.

I tried different ways to solve the problem. First, I tried to decompose the problem by features. The problem is that the score calculation consists of random memory access (pick some of the 9000+ dimensions and do plus/subtraction calculations). Since I cannot coalesce memory access, it costs. Then, I tried to decompose the problem by samples. The problem is that to sum up overall score, all threads are competing for few score variables. It keeps overwriting the score which turns out to be incorrect. (I cannot carry out individual score first and sum up later because it requires 10k * 30k * 4 bytes).

The first method I tried gives me the same performance on i7 860 CPU with 8 threads. However, I don't think the problem is unsolvable: it is remarkably similar to ray tracing problem (for which you carry out calculation that millions of rays against millions of triangles). Any ideas?

In addition, I am posting some of the code I have:

// decompose by feature (works, but slow): __kernel void __ccv_cl_pos_error_rate(__global unsigned int* err_rate, __constant int* feature, __constant int* data, int num, __constant unsigned int* w, int s, int isiz0, int isiz01, int step0, int step1) { int igrid = get_global_id(0); __constant int* of = feature + igrid * 30; unsigned int e = 0; int k, i; int step[] = { step0, step1 }; for (k = 0; k < num; k++) { __constant int* kd = data + k * isiz01; int pmin = kd[of[0] * isiz0 + of[1] + of[2] * step[of[0]]]; int nmax = kd[of[3] * isiz0 + of[4] + of[5] * step[of[3]]]; for (i = 0; i < 5; i++) { if (of[i * 6] >= 0) pmin = min(pmin, kd[of[i * 6] * isiz0 + of[i * 6 + 1] + of[i * 6 + 2] * step[of[i * 6]]]); if (of[i * 6 + 3] >= 0) nmax = max(nmax, kd[of[i * 6 + 3] * isiz0 + of[i * 6 + 4] + of[i * 6 + 5] * step[of[i * 6 + 3]]]); } if (pmin <= nmax) e += w[s + k]; } err_rate[igrid] += e; } // decompose by sample, not work: __kernel void __ccv_cl_pos_error_rate(__global unsigned int* err_rate, __constant int* feature, __constant int* data, int num, __constant unsigned int* w, int s, int isiz0, int isiz01, int step0, int step1, __local int* shared) { int igrid = get_global_id(0); int lsize = get_local_size(0); int lid = get_local_id(0); unsigned int e = 0; int k, i; int ws = w[s + igrid]; int step[] = { step0, step1 }; for (k = 0; k < isiz01; k += lsize) if (k + lid < isiz01) shared[k + lid] = data[igrid * isiz01 + k + lid]; barrier(....); for (k = 0; k < num; k++) { __constant int* of = feature + k * 30; int pmin = shared[of[0] * isiz0 + of[1] + of[2] * step[of[0]]]; int nmax = shared[of[3] * isiz0 + of[4] + of[5] * step[of[3]]]; for (i = 0; i < 5; i++) { if (of[i * 6] >= 0) pmin = min(pmin, shared[of[i * 6] * isiz0 + of[i * 6 + 1] + of[i * 6 + 2] * step[of[i * 6]]]); if (of[i * 6 + 3] >= 0) nmax = max(nmax, shared[of[i * 6 + 3] * isiz0 + of[i * 6 + 4] + of[i * 6 + 5] * step[of[i * 6 + 3]]]); } if (pmin <= nmax) err_rate[k] += ws; // here is wrong. } barrier(....); }