13 Replies Latest reply on Jun 20, 2010 4:00 PM by niravshah00

    200 lines code  please suggest for improvment please

    niravshah00

      The equation is Beals Conjecture
      A^X + B^Y = C^Z

      testing the equation over a large range values like 1,000 to 10,000 for A,B,C and  3-10 for X,Y,Z.

      Launching kernel with 3D stream and using the index of the stream as values for A,B,C

      The Code is well commented .

      I dont have the actual hardware so cant test it on a GPU if some one would test the code on their setup that would be great.

      Or just go through the code and if there are any suggestion for optimizing it.

      I also have a serial C code for reference if needed

      Attaching the Brook+ code (kernel + host code)

      //////KERNEL CODE/////////////////// #include<stdio.h> /* Equation to solve is A^X + B^Y = C^Z */ kernel int findGcd(int u,int v) { int gcd = 1; int r ; int num1=u; int num2 =v; while (1) { if (num2 == 0) { gcd = num1; break; } else { r = num1 % num2; num1 = num2; num2 = r; } } return gcd; } //Function to keep the values in the range of float kernel float modulusPower(float number,int exponent) { //biggest prime number less than 2^12 //N is taken as less than 2^12 even though float can store 2^24 as max integer //this is because if N is greater it gives wrong result for the mod. float N = 4093.0f; float base = number; int counter = exponent; float result =1.0f; while (counter > 0) { if (counter & 1) { result = fmod((result * base),N); } counter = counter >>1; base = fmod((base * base) ,N); } return result; } kernel void findCounterExample(int startRangeA,int startRangeB,int startRangeC,int X ,int Y,int Z,out int a<>) { int A,B,C; int gcdAB,gcdAC,gcdBC; float N = 4093.0f; //using the index of the output stream as the values for A,B,C A = instance().x+startRangeA; B = instance().y+startRangeB; C = instance().z+startRangeC; // intialising a to 0 so that when we filter the reuslts we can know that 0 means that //location does not have a reuslt. a=0; gcdAB = findGcd(A,B); gcdAC = findGcd(A,C); gcdBC = findGcd(B,C); //if A,B,C are co primes test for the equation if(gcdAB==1 && gcdAC==1 && gcdBC==1) { float sum = modulusPower((float)A,X)+modulusPower((float)B, Y); float cpowerZ = modulusPower((float)C,Z); sum = fmod(sum,N); if(cpowerZ == sum) { // here the possible solution should be stored and returned to host code //have to figure out the way to return the values of A,B,C,X,Y,Z to host // to tackle this problem made the stream equal to 1 so now // on the host code would filter the stream to check if it has value one // the index would be the vavlues of A,B,C a= 1; } } } /// host code ////////////////////////////////////// #include "brookgenfiles\beals.h" #include "conio.h" #include "brook\stream.h" #include <time.h> using namespace brook; //function that filters the results from the stream and write it to the file. void writeResultsToFile(int *counterExample,unsigned int dimension[],int startRange[],int X,int Y,int Z) { int i,j,k; for(i=0;i<dimension[0];i++) for(j=0;j<dimension[1];j++) for(k=0;k<dimension[2];k++) { if(counterExample[i*dimension[1]*dimension[2]+j*dimension[2]+k]!=0) { //write to a file // A^X B^Y C^Z //"(i+startRange[0]^X)+(j+startRange[1]^Y)= (k+startRange[2]^Z)" //this string to be wrten to a file } } } int main(int argc, char ** argv) { int iA,jB,kC,X,Y,Z,range[3]; int startRange =1000; int endRange = 1020; int *counterExample; int exponentRange =10; time_t start, end; unsigned int dimension[3] ={0}; unsigned int dim[] = {10,10,10}; start = time(NULL); //Since the maximum size of stream can be 8192 x 8192. //I am launching kernel with stream size 8192 x 90 x 90 so that max number of threads can be launched. for(iA=0;iA<(endRange - startRange);) { if((endRange - startRange-iA)<8192) dim[0] = endRange - startRange-iA; else dim[0] = 8192; for(jB=0;jB<(endRange - startRange);) { if((endRange - startRange-jB)<90) dim[1] = endRange - startRange-jB; else dim[1] = 90; for(kC=0;kC<(endRange - startRange);) { if((endRange - startRange-kC)<90) dim[2] = endRange - startRange-kC; else dim[2] = 90; // the next loops are for X,Y,Z //the reason for not having the loop in the kernel is then // each thread could possible generate more than one result. // There is no other way to return the result then through streams for( X = 3; X < exponentRange; X++) { for( Y = 3; Y < exponentRange; Y++) { for( Z = 3; Z < exponentRange; Z++) { Stream<int> aStream(3,dim); findCounterExample(startRange+iA,startRange+jB,startRange+kC,X,Y,Z,aStream); //Every pass writes the result of the previous //this check is to see if its the first pass //Not checking for aStream.isSync() since in either case this step needs to be done //By default it will be done in parallel as the control return to host code after //kernel call. if(iA!=0||kC!=0){ writeResultsToFile(counterExample,dimension,range,X,Y,Z); free(counterExample); } counterExample = (int *)malloc(dim[0]*dim[1]*dim[2]*sizeof(int)); streamWrite(aStream,counterExample); // since the result of previous kernel call are filtered need to store the //dimension of the stream with which the previous kernel was called. dimension[0] = dim[0]; dimension[1] = dim[1]; dimension[2] = dim[2]; //Stores the start range for A,B,C for each kernel call. range[0] = startRange+iA; range[1] = startRange+jB; range[2] = startRange+kC; } } } kC+= 90; } jB+=90; } iA+=8192; } // Filtering the results from the stream for the last kernel call . writeResultsToFile(counterExample,dimension,range,X,Y,Z); end = time(NULL); printf("according to difftime()%.2f sec's\n", difftime(end, start)); getch(); return 0; }

        • 200 lines code  please suggest for improvment please
          niravshah00

          Running the code on CPU backhand (thats the only option i have)

          for values of A,B,C  - from 1000 to 1010

          X,Y,Z from  3- 10

          it takes 22 second whereas the serial code executes in less than a second

            • 200 lines code  please suggest for improvment please
              ryta1203

              I'm not sure it makes a difference for the CPU backend, but for the GPU you could start by vectorizing your code.

              Control flow elimination will help increase your packing percentage.

                • 200 lines code  please suggest for improvment please
                  niravshah00

                  What do u mean by vectorizing my code?

                  Well for the control flow elimination i can't help it , most of them are very much required.

                    • 200 lines code  please suggest for improvment please
                      geekmaster

                      You must use float4 variables (i think). Better study the amd stream user guide and code examples from other people.

                      To "eleminate" flow control you must use stuff like b +=  (a < x)  (not sure) because these commands are executed concurrently.

                      Anyway kernel optimization is a hard work.

                        • 200 lines code  please suggest for improvment please
                          niravshah00

                           

                          Originally posted by: geekmaster You must use float4 variables (i think). Better study the amd stream user guide and code examples from other people.

                           

                          To "eleminate" flow control you must use stuff like b +=  (a < x)  (not sure) because these commands are executed concurrently.

                           

                          Anyway kernel optimization is a hard work.

                           

                           

                          Thanks geekmaster

                          I have read streamprocessing user guide several times.
                          But I cant figure out how can I use float4 in my algorithm since they way i am using stream is a lot different. I am using streams for their index which will give the values of A,B,C . Well since there will be a thread created for each of the element in 3-Dimensional Stream i will get all combination that i get in 3 nested loops (brute force)

                          All I want to know is will this code on 9170 execute faster than the serial code I have in C with 6 nested loops

                          If you can suggest me how to go about using float4 in my algorith I would appreciate it

                        • 200 lines code  please suggest for improvment please
                          ryta1203

                           

                          Originally posted by: niravshah00 What do u mean by vectorizing my code?

                          Well for the control flow elimination i can't help it , most of them are very much required.

                          Vectorize: use float4 instead of float. So, if your float is one element and the kernel operates on that one element, use float4 and operate on four elements per kernel call instead, this will help with several issues: memory read/writes and ALU packing, most importantly.

                          No, you can still use control flow but there are several techniques you can use to get around using "if/else" blocks. This is what I mean. For example, if you are using Brook+, try using the conditional operator for EACH statement OR you can try using software based predication.

                            • 200 lines code  please suggest for improvment please
                              niravshah00

                               

                              Originally posted by: ryta1203
                              Originally posted by: niravshah00 What do u mean by vectorizing my code?

                               

                              Well for the control flow elimination i can't help it , most of them are very much required.

                               

                               

                              Vectorize: use float4 instead of float. So, if your float is one element and the kernel operates on that one element, use float4 and operate on four elements per kernel call instead, this will help with several issues: memory read/writes and ALU packing, most importantly.

                               

                              No, you can still use control flow but there are several techniques you can use to get around using "if/else" blocks. This is what I mean. For example, if you are using Brook+, try using the conditional operator for EACH statement OR you can try using software based predication.

                               

                               

                              Thanks ryta123

                              Well I did not think in this direction.I am not sure if I know how am I going to use float4 in my implementation.

                              Secondly what I wanted to verify is that the way (3d stream of 8192x90x90) I am launching the kernel would not give me results (execution time) that are worst than the serial code.
                              In the host code I am launching the kernel multiple times since I have to work in blocks as the range can be very high. Each time i m reading the stream and filtering the whole stream .

                              Since I dont have the hardware I cannot test the performance  on GPU but have to use cpu backend that does not give me any clue. Infact it scares me since it take more time than the normal C code with nested loop.

                                • 200 lines code  please suggest for improvment please
                                  geekmaster

                                  niravshah00 it would be a good idea to let someone have your source tested on hardware, since there is no point at developing an application that is slower than your older one.

                                  As for using float4 variables i also have a hard time to understand what to do. But what i know is that when float4 is used the stream dimensions are reduced because each float4 variable contains 4 floats.

                                  (correct me if i am wrong) 

                                  i think that float4 is defined and used in this way:

                                  struct float4{

                                   float x;

                                  float y;

                                  float z;

                                  float w;

                                  };

                                  so with 1 command (let a and b and c be float4) we have

                                  a = b + c

                                  which is equivalent to

                                  a.x = b.x + c.x;

                                  a.y = b.y + c.y;

                                  a.z = b.z + c.z;

                                  a.w = b.w + c.w;

                                  Before using float4 you have to change the stream dimensions using the cpu or just another kernel (converting for example a stream from k<64, 64> to n<64, 16> )

                                  (again correct me if i am wrong) 

                                  I think that only when the kernel contains many alu operations per memory fetch this gives better performance.

                                    • 200 lines code  please suggest for improvment please
                                      niravshah00

                                       

                                      Originally posted by: geekmaster niravshah00 it would be a good idea to let someone have your source tested on hardware, since there is no point at developing an application that is slower than your older one.

                                       

                                      As for using float4 variables i also have a hard time to understand what to do. But what i know is that when float4 is used the stream dimensions are reduced because each float4 variable contains 4 floats.

                                       

                                      (correct me if i am wrong) 

                                       

                                      i think that float4 is defined and used in this way:

                                       

                                      struct float4{

                                       

                                       float x;

                                       

                                      float y;

                                       

                                      float z;

                                       

                                      float w;

                                       

                                      };

                                       

                                      so with 1 command (let a and b and c be float4) we have

                                       

                                      a = b + c

                                       

                                      which is equivalent to

                                       

                                      a.x = b.x + c.x;

                                       

                                      a.y = b.y + c.y;

                                       

                                      a.z = b.z + c.z;

                                       

                                      a.w = b.w + c.w;

                                       

                                      Before using float4 you have to change the stream dimensions using the cpu or just another kernel (converting for example a stream from k<64, 64> to n<64, 16> )

                                       

                                      (again correct me if i am wrong) 

                                       

                                      I think that only when the kernel contains many alu operations per memory fetch this gives better performance.

                                       

                                      I know how float4 works
                                      But the way u are suggesting me is not possible in my case.

                                      The stream in my case is just to launch thread for each value of in it.

                                      so basically the instance is used to get the values of variables A,B,C

                                      Its like nested loops of three variables  just that each operated in parallel.

                                      see the idea is to use index of seach stream location of the values like

                                      0,0,0

                                      0,0,1

                                      0,0,2

                                      0,1,0

                                      0,1,1

                                      0,1,2

                                      and so on

                                      If i use float4 on any of this i will miss out on few combinations
                                      I hope u get my point.

                                      And about the hardware I wished i had the hardware to test my code.

                                      I asked ppl here if they could execute my code to see the. performance

                                      but i guess it asking for too much. And I totally understand it

                                        • 200 lines code  please suggest for improvment please
                                          geekmaster

                                          You can upload somewhere your project and i am sure that someone will compile it and test it.

                                          As for me i only have a 3870 and dx9 (xp) so i dont know if i can run your code.

                                          Also if you need more performance you could use cal to utilize two devices (like crossfire) at the same time and multiple cpu threads. (if you use windows its very easy just include windows.h and use)

                                          thread_input is a struct containing parameters for the thread.

                                          _beginthread(thread_function, 0, (void*)&thread_input);

                                            • 200 lines code  please suggest for improvment please
                                              niravshah00

                                               

                                              Originally posted by: geekmaster You can upload somewhere your project and i am sure that someone will compile it and test it.

                                               

                                              As for me i only have a 3870 and dx9 (xp) so i dont know if i can run your code.

                                               

                                              Also if you need more performance you could use cal to utilize two devices (like crossfire) at the same time and multiple cpu threads. (if you use windows its very easy just include windows.h and use)

                                               

                                              thread_input is a struct containing parameters for the thread.

                                               

                                              Are you trying to tell me to use thread on the host code for better performance?

                                              Secondly I have asked people if they could execute my project.

                                              That reminds me that to run the project on GPU i just have to make the change in the environment variable BRTRUNTIME  =cal   and not other change right?

                                            • 200 lines code  please suggest for improvment please
                                              ryta1203

                                              So why can't you calculate four values of A, B, C instead of just one at a time?

                                                • 200 lines code  please suggest for improvment please
                                                  niravshah00

                                                   

                                                  Originally posted by: ryta1203 So why can't you calculate four values of A, B, C instead of just one at a time?

                                                   

                                                  How do i make sure i have all the combination.

                                                  If you see when i am calling the kernel with there is nothing in the stream.

                                                  I am not getting how would i get all combinations of A,B,C if i use float4.

                                                  findcounterexample(out float4<>a)

                                                  {

                                                  now how to get the values of A,B,C using instance().x ,.y and .z

                                                  }

                                                  ????