The equation is Beals Conjecture

A^X + B^Y = C^Z

testing the equation over a large range values like 1,000 to 10,000 for A,B,C and 3-10 for X,Y,Z.

Launching kernel with 3D stream and using the index of the stream as values for A,B,C

The Code is well commented .

I dont have the actual hardware so cant test it on a GPU if some one would test the code on their setup that would be great.

Or just go through the code and if there are any suggestion for optimizing it.

I also have a serial C code for reference if needed

Attaching the Brook+ code (kernel + host code)

//////KERNEL CODE/////////////////// #include<stdio.h> /* Equation to solve is A^X + B^Y = C^Z */ kernel int findGcd(int u,int v) { int gcd = 1; int r ; int num1=u; int num2 =v; while (1) { if (num2 == 0) { gcd = num1; break; } else { r = num1 % num2; num1 = num2; num2 = r; } } return gcd; } //Function to keep the values in the range of float kernel float modulusPower(float number,int exponent) { //biggest prime number less than 2^12 //N is taken as less than 2^12 even though float can store 2^24 as max integer //this is because if N is greater it gives wrong result for the mod. float N = 4093.0f; float base = number; int counter = exponent; float result =1.0f; while (counter > 0) { if (counter & 1) { result = fmod((result * base),N); } counter = counter >>1; base = fmod((base * base) ,N); } return result; } kernel void findCounterExample(int startRangeA,int startRangeB,int startRangeC,int X ,int Y,int Z,out int a<>) { int A,B,C; int gcdAB,gcdAC,gcdBC; float N = 4093.0f; //using the index of the output stream as the values for A,B,C A = instance().x+startRangeA; B = instance().y+startRangeB; C = instance().z+startRangeC; // intialising a to 0 so that when we filter the reuslts we can know that 0 means that //location does not have a reuslt. a=0; gcdAB = findGcd(A,B); gcdAC = findGcd(A,C); gcdBC = findGcd(B,C); //if A,B,C are co primes test for the equation if(gcdAB==1 && gcdAC==1 && gcdBC==1) { float sum = modulusPower((float)A,X)+modulusPower((float)B, Y); float cpowerZ = modulusPower((float)C,Z); sum = fmod(sum,N); if(cpowerZ == sum) { // here the possible solution should be stored and returned to host code //have to figure out the way to return the values of A,B,C,X,Y,Z to host // to tackle this problem made the stream equal to 1 so now // on the host code would filter the stream to check if it has value one // the index would be the vavlues of A,B,C a= 1; } } } /// host code ////////////////////////////////////// #include "brookgenfiles\beals.h" #include "conio.h" #include "brook\stream.h" #include <time.h> using namespace brook; //function that filters the results from the stream and write it to the file. void writeResultsToFile(int *counterExample,unsigned int dimension[],int startRange[],int X,int Y,int Z) { int i,j,k; for(i=0;i<dimension[0];i++) for(j=0;j<dimension[1];j++) for(k=0;k<dimension[2];k++) { if(counterExample[i*dimension[1]*dimension[2]+j*dimension[2]+k]!=0) { //write to a file // A^X B^Y C^Z //"(i+startRange[0]^X)+(j+startRange[1]^Y)= (k+startRange[2]^Z)" //this string to be wrten to a file } } } int main(int argc, char ** argv) { int iA,jB,kC,X,Y,Z,range[3]; int startRange =1000; int endRange = 1020; int *counterExample; int exponentRange =10; time_t start, end; unsigned int dimension[3] ={0}; unsigned int dim[] = {10,10,10}; start = time(NULL); //Since the maximum size of stream can be 8192 x 8192. //I am launching kernel with stream size 8192 x 90 x 90 so that max number of threads can be launched. for(iA=0;iA<(endRange - startRange);) { if((endRange - startRange-iA)<8192) dim[0] = endRange - startRange-iA; else dim[0] = 8192; for(jB=0;jB<(endRange - startRange);) { if((endRange - startRange-jB)<90) dim[1] = endRange - startRange-jB; else dim[1] = 90; for(kC=0;kC<(endRange - startRange);) { if((endRange - startRange-kC)<90) dim[2] = endRange - startRange-kC; else dim[2] = 90; // the next loops are for X,Y,Z //the reason for not having the loop in the kernel is then // each thread could possible generate more than one result. // There is no other way to return the result then through streams for( X = 3; X < exponentRange; X++) { for( Y = 3; Y < exponentRange; Y++) { for( Z = 3; Z < exponentRange; Z++) { Stream<int> aStream(3,dim); findCounterExample(startRange+iA,startRange+jB,startRange+kC,X,Y,Z,aStream); //Every pass writes the result of the previous //this check is to see if its the first pass //Not checking for aStream.isSync() since in either case this step needs to be done //By default it will be done in parallel as the control return to host code after //kernel call. if(iA!=0||kC!=0){ writeResultsToFile(counterExample,dimension,range,X,Y,Z); free(counterExample); } counterExample = (int *)malloc(dim[0]*dim[1]*dim[2]*sizeof(int)); streamWrite(aStream,counterExample); // since the result of previous kernel call are filtered need to store the //dimension of the stream with which the previous kernel was called. dimension[0] = dim[0]; dimension[1] = dim[1]; dimension[2] = dim[2]; //Stores the start range for A,B,C for each kernel call. range[0] = startRange+iA; range[1] = startRange+jB; range[2] = startRange+kC; } } } kC+= 90; } jB+=90; } iA+=8192; } // Filtering the results from the stream for the last kernel call . writeResultsToFile(counterExample,dimension,range,X,Y,Z); end = time(NULL); printf("according to difftime()%.2f sec's\n", difftime(end, start)); getch(); return 0; }

Running the code on CPU backhand (thats the only option i have)

for values of A,B,C - from 1000 to 1010

X,Y,Z from 3- 10

it takes 22 second whereas the serial code executes in less than a second