drstrip

why so many fetches, writes

Discussion created by drstrip on Apr 10, 2010
Latest reply on Apr 12, 2010 by drstrip

In the kernel attached below, the profiler reports 27 fetches and 34 writes. The profiler info says these are global fetches/writes. I would have thought that variables declared in my kernel were non-global, in which case I count about 17 fetches and 2 writes. Obviously there is something I don't understand about where writes are occurring. (I also realize I can make this more efficient, but that's another issue.)

 

The GPU is a Firestream with an RV770 chipset.

 

 

inline uint4 newSeed(uint4 seed) { uint4 a4 = (uint4) (1664525U, 1664525U, 1664525U, 1664525U); uint4 c4 = (uint4) (1013904223U, 1013904223U, 1013904223U, 1013904223U); return (a4 * seed + c4); } int4 compareToNeighbor(int4 new_spin, int4 neighbor) { return (int4)((int4)((int4) new_spin != (int4) neighbor) & (int4)(1,1,1,1)); } inline int index(int row, int col, int COLS) { return row * COLS + col; } __kernel void pots_loop (__global int4 * spin, __global int4 * spin_out, __global uint4 * seeds, __const uint ROWS, __const uint COLS, __const uint NUM_SPIN_STATES) { /* find the global location in output */ private int row = get_global_id(0); private int col = get_global_id(1); private int my_index = index(row, col, COLS); //initialize energy private int4 old_energy = (int4)(0u, 0u, 0u, 0u); private int4 new_energy = (int4)(0u, 0u, 0u, 0u); private int4 neighbor; private int4 new_spin; private int4 spin_local; private int4 old_is_better; //initialize the indices for neighbor positions private int col_rt = col + 1; private int col_left = col - 1; private int row_up = row - 1; private int row_down = row + 1; if (col_rt >= COLS) col_rt = 0; if (col_left < 0) col_left = COLS - 1; if (row_up < 0) row_up = ROWS -1; if (row_down >= ROWS) row_down = 0; spin_local = spin[my_index]; // compute the proposed new energy seeds[my_index] = newSeed(seeds[my_index]); new_spin = (int4) (seeds[my_index] % (uint4)(NUM_SPIN_STATES, NUM_SPIN_STATES, NUM_SPIN_STATES, NUM_SPIN_STATES)); //find neighbor to right neighbor = spin[my_index].yzwx; neighbor.w = spin[index(row, col_rt, COLS)].x; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to left neighbor = spin[my_index].wxyz; neighbor.x = spin[index(row, col_left, COLS)].w; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor above neighbor = spin[index(row_up, col, COLS)]; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor below neighbor = spin[index(row_down, col, COLS)]; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to above and right neighbor = spin[index(row_up, col, COLS)].yzwx; neighbor.w = spin[index(row_up, col_rt, COLS)].x; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to above and left neighbor = spin[index(row_up, col, COLS)].wxyz; neighbor.x = spin[index(row_up, col_left, COLS)].w; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to down and right neighbor = spin[index(row_down, col, COLS)].yzwx; neighbor.w = spin[index(row_down, col_rt, COLS)].x; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); //find neighbor to down and left neighbor = spin[index(row_down, col, COLS)].wxyz; neighbor.x = spin[index(row_down, col_left, COLS)].w; new_energy += compareToNeighbor(new_spin, neighbor); old_energy += compareToNeighbor(spin_local, neighbor); old_is_better = -(int4)( old_energy <= new_energy); spin_out[my_index] = old_is_better * spin_local + ((int4) 1 - old_is_better) * new_spin; }

Outcomes