Hi ,

Sorry for spamming the forum with lots of questions.

But I hope you can understand my limited knowledge and limited time.

I am launching a kernel in tiles. I am using a 3D stream, and since I have to go from 1,000 to 10,000 in each dimension, I am launching the kernel with stream dimensions

8192x90x90, 8192x(90)x(91-180), ... and so on.

So basically I am launching too many kernels.

My question is: should I do this, or should I eliminate one dimension and have the loop in the kernel instead, so that I have to launch fewer kernels?

Which would be the more efficient one?

I am attaching my kernel and host code. I understand no one has the time to go through my code, but just in case someone is interested in helping me out:

int main(int argc, char ** argv) { int i,j,k,range; int startRange =1000; int endRange = 1100; int *solution; time_t start, end; unsigned int dim[] = {10,10,10}; start = time(NULL); //these loops determine the dimension of the stream and launch the kernel. for(i=0;i<(endRange - startRange);) { if((endRange - startRange-i)<8192) dim[0] = endRange - startRange-i; else dim[0] = 8192; for(j=0;j<(endRange - startRange);) { if((endRange - startRange-j)<90) dim[1] = endRange - startRange-j; else dim[1] = 90; for(k=0;k<(endRange - startRange);) { if((endRange - startRange-k)<90) dim[2] = endRange - startRange-k; else dim[2] = 90; Stream<int> aStream(3,dim); threadABC(startRange+i,startRange+j,startRange+k,aStream); //Every pass writes the result of the previous //this check is to see me its the first pass //i am not checking for aStream.isSync() since in either case i want to do this step //by default it will be done in parallel if(i!=0||k!=0){ writeResultsToFile(solution,dim); free(solution); } solution = (int *)malloc(dim[0]*dim[1]*dim[2]*sizeof(int)); streamWrite(aStream,solution); k+=90; } j+=90; } i+=8192; } writeResultsToFile(solution,dim); end = time(NULL); printf("according to difftime()%.2f sec's\n", difftime(end, start)); //display the result //streamWrite(aStream,solution); getch(); return 0; } //kernel Code #include<stdio.h> /* Equation to solve is A^X + B^Y = C^Z */ kernel int findGcd(int u,int v) { int gcd = 1; int r ; int num1=u; int num2 =v; while (1) { if (num2 == 0) { gcd = num1; break; } else { r = num1 % num2; num1 = num2; num2 = r; } } return gcd; } //Function to keep the values in the range of float kernel float modulusPower(float number,int exponent) { //biggest prime number less than 2^12 //N is taken as less than 2^12 even though float can store 2^24 as max integer //this is because if N is greater it gives wrong result for the mod (don't know why) float N = 4093.0f; float base = number; int counter = exponent; float result =1.0f; while (counter > 
0) { if (counter & 1) { result = fmod((result * base),N); } counter = counter >>1; base = fmod((base * base) ,N); } return result; } kernel void threadABC(int startRangeA,int startRangeB,int startRangeC,out int a<>) { int X,Y,Z; int A,B,C; int gcdAB,gcdAC,gcdBC; float N = 4093.0f; //using the index of the output stream as the values for A,B,C A = instance().x+startRangeA; B = instance().y+startRangeB; C = instance().z+startRangeC; // intialising a to 0 so that when we filter the reuslts we can know that 0 means that //location does not have a reuslt. a=0; gcdAB = findGcd(A,B); gcdAC = findGcd(A,C); gcdBC = findGcd(B,C); if(gcdAB==1 && gcdAC==1 && gcdBC==1){ for( X = 3; X < 10; X++) { for( Y = 3; Y < 10; Y++) { for( Z = 3; Z < 10; Z++) { float sum = modulusPower((float)A,X)+modulusPower((float)B, Y); float cpowerZ = modulusPower((float)C,Z); sum = fmod(sum,N); if(cpowerZ == sum){ // here the possible solution should be stored and returned to host code //have to figure out the way to return the values of A,B,C,X,Y,Z to host a= Z; } } } } } }

It depends on the hardware.

The best way is to test both cases and see the results (but from what I have read, you don't have the hardware).

I guess that the differences will be small, but I think that it is better to launch the kernel several times.

I notice that my card is slow when it executes the first kernel but twice as fast on the second and third kernels, which are executed immediately after the first one finishes.