In the kernel attached below, the profiler reports 27 fetches and 34 writes. The profiler info says these are global fetches/writes. I would have thought that variables declared in my kernel were non-global, in which case I count about 17 fetches and 2 writes. Obviously there is something I don't understand about where writes are occurring. (I also realize I can make this more efficient, but that's another issue.)
The GPU is a Firestream with an RV770 chipset.
inline uint4 newSeed(uint4 seed) { uint4 a4 = (uint4) (1664525U, 1664525U, 1664525U, 1664525U); uint4 c4 = (uint4) (1013904223U, 1013904223U, 1013904223U, 1013904223U); return (a4 * seed + c4); } int4 compareToNeighbor(int4 new_spin, int4 neighbor) { return (int4)((int4)((int4) new_spin != (int4) neighbor) & (int4)(1,1,1,1)); } inline int index(int row, int col, int COLS) { return row * COLS + col; } __kernel void pots_loop (__global int4 * spin, __global int4 * spin_out, __global uint4 * seeds, __const uint ROWS, __const uint COLS, __const uint NUM_SPIN_STATES) { /* find the global location in output */ private int row = get_global_id(0); private int col = get_global_id(1); private int my_index = index(row, col, COLS); //initialize energy private int4 old_energy = (int4)(0u, 0u, 0u, 0u); private int4 new_energy = (int4)(0u, 0u, 0u, 0u); private int4 neighbor; private int4 new_spin; private int4 spin_local; private int4 old_is_better; //initialize the indices for neighbor positions private int col_rt = col + 1; private int col_left = col - 1; private int row_up = row - 1; private int row_down = row + 1; if (col_rt >= COLS) col_rt = 0; if (col_left < 0) col_left = COLS - 1; if (row_up < 0) row_up = ROWS -1; if (row_down >= ROWS) row_down = 0; spin_local = spin[my_index]; // compute the proposed new energy seeds[my_index] = newSeed(seeds[my_index]); new_spin = (int4) (seeds[my_index] % (uint4)(NUM_SPIN_STATES, NUM_SPIN_STATES, NUM_SPIN_STATES, NUM_SPIN_STATES)); //find neighbor to right neighbor = spin[my_index].yzwx; neighbor.w = spin[index(row, col_rt, COLS)].x; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to left neighbor = spin[my_index].wxyz; neighbor.x = spin[index(row, col_left, COLS)].w; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor above neighbor = spin[index(row_up, col, COLS)]; new_energy += 
compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor below neighbor = spin[index(row_down, col, COLS)]; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to above and right neighbor = spin[index(row_up, col, COLS)].yzwx; neighbor.w = spin[index(row_up, col_rt, COLS)].x; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to above and left neighbor = spin[index(row_up, col, COLS)].wxyz; neighbor.x = spin[index(row_up, col_left, COLS)].w; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to down and right neighbor = spin[index(row_down, col, COLS)].yzwx; neighbor.w = spin[index(row_down, col_rt, COLS)].x; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to down and left neighbor = spin[index(row_down, col, COLS)].wxyz; neighbor.x = spin[index(row_down, col_left, COLS)].w; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); old_is_better = -(int4)( old_energy <= new_energy); spin_out[my_index] = old_is_better * spin_local + ((int4) 1 - old_is_better) * new_spin; }
Can you think of any way to force the swizzle into a register?
Or perhaps copying first to local memory would at least make the write local?
Unfortunately I'm out of town, away from my hardware, so I can't test and report results till next week.
I replaced several of the swizzles with a tmp as you suggested, but am still getting 34 writes, same as before. Any other ideas?
That's reassuring, for two reasons:
1. It's what I get by counting up accesses/writes looking at the code, so I guess I do understand what's a global read/write.
2. It means the SDK is getting better.
So, when can I expect to get my hands on the new and improved version? With the huge number of reads/writes, I'm getting only 1% ALU utilization, which obviously is quite bad.