I am launching a kernel in tiles. I am using a 3D stream, and since I have to cover the range 1000–10,000 in each dimension, I am launching the kernel with the following stream dimensions:

8192x90x90, 8192x(90)x(91-180), ... and so on.
So basically I am launching too many kernels.

My question is: should I keep doing this, or should I eliminate one dimension and loop over it inside the kernel instead, so that I have to launch fewer kernels?

Which approach would be the more efficient one?

`int main(int argc, char ** argv) { int i,j,k,range; int startRange =1000; int endRange = 1100; int *solution; time_t start, end; unsigned int dim[] = {10,10,10}; start = time(NULL); //these loops determine the dimension of the stream and launch the kernel. for(i=0;i<(endRange - startRange);) { if((endRange - startRange-i)<8192) dim[0] = endRange - startRange-i; else dim[0] = 8192; for(j=0;j<(endRange - startRange);) { if((endRange - startRange-j)<90) dim[1] = endRange - startRange-j; else dim[1] = 90; for(k=0;k<(endRange - startRange);) { if((endRange - startRange-k)<90) dim[2] = endRange - startRange-k; else dim[2] = 90; Stream<int> aStream(3,dim); threadABC(startRange+i,startRange+j,startRange+k,aStream); //Every pass writes the result of the previous //this check is to see me its the first pass //i am not checking for aStream.isSync() since in either case i want to do this step //by default it will be done in parallel if(i!=0||k!=0){ writeResultsToFile(solution,dim); free(solution); } solution = (int *)malloc(dim[0]*dim[1]*dim[2]*sizeof(int)); streamWrite(aStream,solution); k+=90; } j+=90; } i+=8192; } writeResultsToFile(solution,dim); end = time(NULL); printf("according to difftime()%.2f sec's\n", difftime(end, start)); //display the result //streamWrite(aStream,solution); getch(); return 0; } //kernel Code #include<stdio.h> /* Equation to solve is A^X + B^Y = C^Z */ kernel int findGcd(int u,int v) { int gcd = 1; int r ; int num1=u; int num2 =v; while (1) { if (num2 == 0) { gcd = num1; break; } else { r = num1 % num2; num1 = num2; num2 = r; } } return gcd; } //Function to keep the values in the range of float kernel float modulusPower(float number,int exponent) { //biggest prime number less than 2^12 //N is taken as less than 2^12 even though float can store 2^24 as max integer //this is because if N is greater it gives wrong result for the mod (don't know why) float N = 4093.0f; float base = number; int counter = exponent; float result =1.0f; while (counter > 
0) { if (counter & 1) { result = fmod((result * base),N); } counter = counter >>1; base = fmod((base * base) ,N); } return result; } kernel void threadABC(int startRangeA,int startRangeB,int startRangeC,out int a<>) { int X,Y,Z; int A,B,C; int gcdAB,gcdAC,gcdBC; float N = 4093.0f; //using the index of the output stream as the values for A,B,C A = instance().x+startRangeA; B = instance().y+startRangeB; C = instance().z+startRangeC; // intialising a to 0 so that when we filter the reuslts we can know that 0 means that //location does not have a reuslt. a=0; gcdAB = findGcd(A,B); gcdAC = findGcd(A,C); gcdBC = findGcd(B,C); if(gcdAB==1 && gcdAC==1 && gcdBC==1){ for( X = 3; X < 10; X++) { for( Y = 3; Y < 10; Y++) { for( Z = 3; Z < 10; Z++) { float sum = modulusPower((float)A,X)+modulusPower((float)B, Y); float cpowerZ = modulusPower((float)C,Z); sum = fmod(sum,N); if(cpowerZ == sum){ // here the possible solution should be stored and returned to host code //have to figure out the way to return the values of A,B,C,X,Y,Z to host a= Z; } } } } } }`