11 Replies Latest reply on May 17, 2013 1:30 AM by himanshu.gautam

    OpenCL Compiler Crashes

    jsonntag

      I have a kernel which works fine on HD 6xxx and earlier GPUs with Catalyst 12.10 or earlier drivers under Windows 7 x64.  The app is built with AMD APP SDK 2.8.  When attempting to run the app on HD 7xxx GPUs, I get the following error when building the program:

       

      Unhandled Exception Detected...

       

      - Unhandled Exception Record -

      Reason: Access Violation (0xc0000005) at address 0x0000000076F1E4B4 write attempt to address 0x00000024

       

      According to the OpenCL specs, I would expect that it should never crash but should instead fail to compile and allow me to get the build log which would contain the reason for the errors.  Instead, it crashes regardless of any try/catch that is in place.  This same kernel works 100% on nVidia GPUs, OS X, and pre HD7xx GPUs. It seems that commenting out the mul_hi function will allow it to compile.  But, since the output of mul_hi is required to get the correct output, that really isn't an option.  I would tell the thousands of BOINC users to use the older application but since the new GPUs won't run the code compiled with Brook+ properly, that isn't an option either.

       

      So.... anyone have any idea why this won't compile on a HD 7790 with 13.1 thru 13.4 drivers on Windows 7 x64?

       

      __kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)

      {

        const uint lookahead = 20;

        const uint4 sc = (uint4)(1048575,12,1048577,0);

        const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);\n

        const uint totalOffset = offset + t_offset;\n

        uint4 carry,lut,stepsOut,steps_in = steps[t_offset];

        uint2 mul_r;

        uint valh,valt,carryh,index,i;

        uint cont=1u;

        uint4 val = start; 

        uint overflow; 

        i = overflow = valh = valt = sc.w;   

        val.x += totalOffset;

        carry.x = (val.x < totalOffset);

        val.y += carry.x;

        carry.y = (val.y < carry.x);

        val.z += carry.y;

        while(cont)

        {

          lut = mosc[val.x & sc.x];

          mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);

          val.x = mul_r.x + lut.y;

          carry.x = mul_r.y + (val.x < mul_r.x);

          mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);

          val.y = mul_r.x + carry.x;

          carry.y = mul_r.y + (val.y < mul_r.x);

          mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);

          val.z = mul_r.x + carry.y;

          carry.z = mul_r.y + (val.z < mul_r.x);

          mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);

          val.w = mul_r.x + carry.z;

          carry.w = mul_r.y + (val.w < mul_r.x);

          mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);

          valh = mul_r.x + carry.w;

          carryh = mul_r.y + (valh < mul_r.x);

          mul_r = mul64(valt >> lookahead, lut.x);

          valt = mul_r.x + carryh;

          overflow = (overflow | mul_r.y) | (valt < mul_r.x);

          i += lut.z;   

          cont = ((((val.x > (uint)sc.z) | val.y) | (val.z | val.w)) | (valh | valt)) && (i<0x1000000u);

        }

        index=val.x-2u;

        i += mosc[index & sc.x].w;

        if(overflow) i = 0x1000000u;  

        carry.x = steps_in.z + i;

        stepsOut.z = carry.x;

        stepsOut.w = steps_in.w + (carry.x < i);,

        if (i > steps_in.x)

        {   

          stepsOut.x = i;

          stepsOut.y = totalOffset;

        }

        else

        {

          stepsOut.x = steps_in.x;

          stepsOut.y = steps_in.y;

        }

        steps[t_offset] = stepsOut;

      }

        • Re: OpenCL Compiler Crashes
          himanshu.gautam

          I can compile the above kernel using 13.4 driver on Win7-64 with HD 7770 GPU. Build log reports an error about mul64 function, apart from some simple syntax errors.BuildLog.PNG

            • Re: OpenCL Compiler Crashes
              jsonntag

              LOL, I forgot to include the mul64 function in my post.  When I comment it out, it compiles OK for me too. 

               

              inline uint2 mul64(__const uint a, __const uint b){

                return (uint2)(a*b,mul_hi(a,

              }

               

              The entire kernel code (with an alternate to mul_hi which doesn't work for me either) is as follows:

               

              __kernel void kernelZeroBuffer(__global uint4 *steps){

                  steps[get_global_id(0)] = (uint4)(0,0,0,0);

              }

              inline uint2 mul64(__const uint a, __const uint b){

                ulong x = (ulong)a * (ulong)b; 

                return (uint2)((uint)x,(uint)(x>>32));

              }

              __kernel void kernelSteps(__const __global uint4 *mosc, __const uint offset, __const uint4 start, __global uint4 *steps)

              {

              const uint lookahead = 20;

              const uint4 sc = (uint4)(1048575,12,1048577,0);

              const uint t_offset = get_global_id(sc.w) + get_global_size(sc.w) * get_global_id(1);

              const uint totalOffset = offset + t_offset;

              uint4 carry,lut,stepsOut,steps_in = steps[t_offset];

              uint2 mul_r;

              uint valh,valt,carryh,index,i;

              uint cont=1u;

              uint4 val = start; 

              uint overflow; 

              i = overflow = valh = valt = sc.w;   

              val.x += totalOffset;

              carry.x = (val.x < totalOffset);

              val.y += carry.x;

              carry.y = (val.y < carry.x);

              val.z += carry.y;

              while(cont)

              {

              lut = mosc[val.x & sc.x];

              mul_r = mul64((val.x >> lookahead) + (val.y << sc.y), lut.x);

              val.x = mul_r.x + lut.y;

              carry.x = mul_r.y + (val.x < mul_r.x);

              mul_r = mul64((val.y >> lookahead) + (val.z << sc.y), lut.x);

              val.y = mul_r.x + carry.x;

              carry.y = mul_r.y + (val.y < mul_r.x);

              mul_r = mul64((val.z >> lookahead) + (val.w << sc.y), lut.x);

              val.z = mul_r.x + carry.y;

              carry.z = mul_r.y + (val.z < mul_r.x);

              mul_r = mul64((val.w >> lookahead) + (valh << sc.y), lut.x);

              val.w = mul_r.x + carry.z;

              carry.w = mul_r.y + (val.w < mul_r.x);

              mul_r = mul64((valh >> lookahead) + (valt << sc.y), lut.x);

              valh = mul_r.x + carry.w;

              carryh = mul_r.y + (valh < mul_r.x);

              mul_r = mul64(valt >> lookahead, lut.x);

              valt = mul_r.x + carryh;

              overflow = (overflow | mul_r.y) | (valt < mul_r.x);

              i += lut.z;   

              cont = ((((val.x > (uint)sc.z) | val.y) | (val.z | val.w)) | (valh | valt)) && (i<0x1000000u);

              }

              index=val.x-2u;

              i += mosc[index & sc.x].w;

              if(overflow) i = 0x1000000u;  

              carry.x = steps_in.z + i;

              stepsOut.z = carry.x;

              stepsOut.w = steps_in.w + (carry.x < i);,

              if (i > steps_in.x)

              {   

                stepsOut.x = i;

                stepsOut.y = totalOffset;

              }

              else

              {

                stepsOut.x = steps_in.x;

                stepsOut.y = steps_in.y;

              }

              steps[t_offset] = stepsOut;

              }

            • Re: OpenCL Compiler Crashes
              jsonntag

              If I use "-cl-opt-disable" it no longer crashes in the call to clBuildProgram. It appears the issue is due to the optimization it is trying to do with the newer GPUs.  I hate to disable optimization since the Brook+ version of the same application is 50% faster than the OpenCL version and this may make the OpenCL version even slower.

                • Re: OpenCL Compiler Crashes
                  himanshu.gautam

                  Hi jstong,

                  It is a critical issue if OCL compiler is wrongly optimizing the code. But I am still not able to reproduce the issue. There seems to be some copy paste isssue with mul64 code above.

                  I had tried the remaining code and it still compiled for me. No crashes. Can you attach the kernel as a zip file.

                    • Re: OpenCL Compiler Crashes
                      jsonntag

                      I'll do one better.  Here's the entire VS2010 solution.  Using -cl-opt-disable seems to make it work whereas using no options for the clBuildProgram call causes an LLVM error below. 

                       

                      LLVM ERROR: Cannot select: 0x48b5a80: i32 = setcc 0x48a9ac0, 0x48b3f60, 0x48b457

                      0 [ID=59]

                        0x48a9ac0: i64 = AMDILISD::ADD 0x48b3f60, 0x48b4060 [ID=57]

                          0x48b3f60: i64 = or 0x48b3e60, 0x48b3960 [ORD=26] [ID=49]

                            0x48b3e60: i64 = shl 0x48b4a70, 0x48b3d60 [ORD=25] [ID=46]

                              0x48b4a70: i64 = any_extend 0x48b3860 [ID=42]

                                0x48b3860: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48abde0 [ORD=23] [ID=

                      35]

                                  0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=

                      26]

                                    0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]

                                  0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]

                              0x48b3d60: i32 = Constant<32> [ORD=25] [ID=12]

                            0x48b3960: i64 = zero_extend 0x48a99c0 [ORD=22] [ID=43]

                              0x48a99c0: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48a94c0 [ORD=21] [ID=36

                      ]

                                0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26

                      ]

                                  0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]

                                0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]

                          0x48b4060: i64 = zero_extend 0x48ac1e0 [ORD=27] [ID=55]

                            0x48ac1e0: i32 = AMDILISD::ADD 0x48ab5e0, 0x48b4870 [ORD=18] [ID=52]

                              0x48ab5e0: i32 = AMDILISD::ADD 0x48aa0c0, 0x48a97c0 [ORD=16] [ID=47]

                                0x48aa0c0: i32 = mul 0x48b5d80, 0x48b5e80 [ORD=15] [ID=44]

                                  0x48b5d80: i32 = AMDILISD::VEXTRACT 0x48a9dc0, 0x48a94c0 [ORD=13] [I

                      D=39]

                                    0x48a9dc0: v4i32,ch = llvm.AMDIL.get.global.size 0x483dd10, 0x48a9

                      bc0 [ORD=12] [ID=29]

                                      0x48a9bc0: i32 = TargetConstant<2563> [ORD=12] [ID=6]

                                    0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]

                                  0x48b5e80: i32 = AMDILISD::VEXTRACT 0x48abce0, 0x48abde0 [ORD=14] [I

                      D=37]

                                    0x48abce0: v4i32,ch = llvm.AMDIL.get.global.id 0x483dd10, 0x48ab6e

                      0 [ORD=10] [ID=28]

                                      0x48ab6e0: i32 = TargetConstant<2561> [ORD=10] [ID=4]

                                    0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]

                                0x48a97c0: i32 = AMDILISD::VEXTRACT 0x48abce0, 0x48a94c0 [ORD=11] [ID=

                      38]

                                  0x48abce0: v4i32,ch = llvm.AMDIL.get.global.id 0x483dd10, 0x48ab6e0

                      [ORD=10] [ID=28]

                                0x48ab6e0: i32 = TargetConstant<2561> [ORD=10] [ID=4]
                              0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]
                          0x48b4870: i32 = AMDILISD::VEXTRACT 0x48ab8e0, 0x48a95c0 [ORD=17] [ID=32

                      ]

                            0x48ab8e0: v4i32,ch = CopyFromReg 0x483dd10, 0x48ab9e0 [ORD=17] [ID=25

                      ]

                              0x48ab9e0: v4i32 = Register %vreg26 [ORD=17] [ID=1]
                            0x48a95c0: i32 = TargetConstant<3> [ORD=32] [ID=23]

                        0x48b3f60: i64 = or 0x48b3e60, 0x48b3960 [ORD=26] [ID=49]


                      0x48b3e60: i64 = shl 0x48b4a70, 0x48b3d60 [ORD=25] [ID=46]
                        0x48b4a70: i64 = any_extend 0x48b3860 [ID=42]
                          0x48b3860: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48abde0 [ORD=23] [ID=35

                      ]

                            0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26

                      ]

                              0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]
                            0x48abde0: i32 = TargetConstant<2> [ORD=14] [ID=22]
                        0x48b3d60: i32 = Constant<32> [ORD=25] [ID=12]

                      0x48b3960: i64 = zero_extend 0x48a99c0 [ORD=22] [ID=43]
                        0x48a99c0: i32 = AMDILISD::VEXTRACT 0x48aa2c0, 0x48a94c0 [ORD=21] [ID=36]
                          0x48aa2c0: v4i32,ch = CopyFromReg 0x483dd10, 0x48abbe0 [ORD=21] [ID=26]
                            0x48abbe0: v4i32 = Register %vreg27 [ORD=21] [ID=2]
                          0x48a94c0: i32 = TargetConstant<1> [ORD=13] [ID=21]
                          • Re: OpenCL Compiler Crashes
                            himanshu.gautam

                            I took your kernel and ran using APP Kernel Analyzer: I get following errors:

                            ========= Build: started ==========
                            OpenCL Compile Error: clBuildProgram failed (CL_BUILD_PROGRAM_FAILURE).
                            --------
                            OpenCL Compile Error: Compiling for device: Tahiti
                            Line 32: error: identifier
                                      "vall" is undefined
                                lut = mosc[vall & scx];
                                           ^

                            Line 33: error: function
                                      "mul64" declared implicitly
                                mul_r = mul64((val.x >> lookahead) + (val.y << scy), lut.x);
                                        ^

                            Line 62: error: a value of
                                      type "uint4" cannot be assigned to an entity of type "uint"
                               i += mosc[(val.x-2)&(scx)];
                                    ^

                            Line 63: error: identifier
                                      "maxsteps" is undefined
                               if (overflow) i = maxsteps;
                                                 ^

                            Line 12: warning: variable
                                      "index" was set but never used
                               uint index, cont, valh, valt, carryh, overflow = 0u;
                                    ^

                            4 errors detected.

                            Frontend phase failed compilation.
                            --------

                            ========== Build: 0 of 1 succeeded ==========

                      • Re: OpenCL Compiler Crashes
                        stevec

                        Just chiming in here to provide more data.  I've been struggling with this exact same issue.  Windows 7, Catalyst 13.4  and 13.5_beta with APP SDK 2.8.  The kernel worked on a HD 6xxx card but crashes during compilation on my new HD 7970.  I also found that -cl-opt-disable prevents it from crashing.  Here is the code:

                         

                         

                        #define CHOLESKY_TOL 1e-30f

                        #define NPAR 5

                        #define SX 16

                        #define SY 16

                        #define NY 256 // 16*16

                         

                        float evaluate(float *p,

                                       __constant float *cutout,

                                       __global float *fvec)

                        {

                          // pull out optimization variables

                          float xoffset = p[0];

                          float yoffset = p[1];

                          float stdev = p[2];

                          float amp = p[3];

                          float background = p[4];

                          float scale = sqrt(2.0f)*stdev;

                         

                          // precompute the x and y erf factors

                          float xerf[SX];

                          float yerf[SY];

                          for (int x=0; x<SX; x++) {

                            float fx = x - SX*0.5f + xoffset;

                            float e = 0.5f * (erf((fx+0.5f)/scale) - erf((fx-0.5f)/scale));

                            xerf[x] = e;

                          }

                         

                          for (int y=0; y<SY; y++) {

                            float fy = y - SY*0.5f + yoffset;

                            float e = 0.5f * (erf((fy+0.5f)/scale) - erf((fy-0.5f)/scale));

                            yerf[y] = e;

                          }

                         

                          float sqerror=0;

                          int c=0;

                          for (int y=0; y<SY; y++) {

                            for (int x=0; x<SX; x++) {

                         

                              float model = amp * xerf[x]*yerf[y] + background;

                              float sample = cutout[c];

                              float err = (model - sample) / sqrt(max(1.0f, sample));

                              fvec[c] = err;

                              sqerror += err*err;

                              c++;

                            }

                          }

                         

                          return sqerror;

                        }

                         

                        float model_photon_count(float *p)

                        {

                          float xoffset = p[0];

                          float yoffset = p[1];

                          float stdev = p[2];

                          float amp = p[3];

                          float scale = sqrt(2.0f)*stdev;

                         

                          float xerf[SX];

                          float yerf[SY];

                          for (int x=0; x<SX; x++) {

                            float fx = x - SX*0.5f + xoffset;

                            float e = 0.5f * (erf((fx+0.5f)/scale) - erf((fx-0.5f)/scale));

                            xerf[x] = e;

                          }

                         

                          for (int y=0; y<SY; y++) {

                            float fy = y - SY*0.5f + yoffset;

                            float e = 0.5 * (erf((fy+0.5f)/scale) - erf((fy-0.5f)/scale));

                            yerf[y] = e;

                          }

                         

                          float count = 0;

                          for (int y=0; y<SY; y++) {

                            for (int x=0; x<SX; x++) {

                              count += amp * xerf[x]*yerf[y];

                            }

                          }

                          return count;

                        }

                         

                        /* solve the equation Ax=b for a symmetric positive-definite matrix A,

                           using the Cholesky decomposition A=LL^T.  The matrix L is passed in "l".

                           Elements above the diagonal are ignored.

                        */

                        void solve_axb_cholesky(float *l,

                                                float *x,

                                                float *b)

                        {

                          int i,j;

                          float sum;

                         

                          /* solve L*y = b for y (where x[] is used to store y) */

                         

                          for (i=0; i<NPAR; i++) {

                            sum = 0;

                            for (j=0; j<i; j++)

                              sum += l[i*NPAR+j] * x[j];

                            x[i] = (b[i] - sum)/l[i*NPAR+i];     

                          }

                         

                          /* solve L^T*x = y for x (where x[] is used to store both y and x) */

                         

                          for (i=NPAR-1; i>=0; i--) {

                            sum = 0;

                            for (j=i+1; j<NPAR; j++)

                              sum += l[j*NPAR+i] * x[j];

                            x[i] = (x[i] - sum)/l[i*NPAR+i];     

                          }

                        }

                         

                         

                        /* This function takes a symmetric, positive-definite matrix "a" and returns

                           its (lower-triangular) Cholesky factor in "l".  Elements above the

                           diagonal are neither used nor modified.  The same array may be passed

                           as both l and a, in which case the decomposition is performed in place.

                        */

                        int cholesky_decomp(float *l,

                                            float *a)

                        {

                          int i,j,k;

                          float sum;

                         

                          for (i=0; i<NPAR; i++) {

                            for (j=0; j<i; j++) {

                              sum = 0;

                              for (k=0; k<j; k++)

                                sum += l[i*NPAR+k] * l[j*NPAR+k];

                              l[i*NPAR+j] = (a[i*NPAR+j] - sum)/l[j*NPAR+j];

                            }

                         

                            sum = 0;

                            for (k=0; k<i; k++)

                              sum += l[i*NPAR+k] * l[i*NPAR+k];

                            sum = a[i*NPAR+i] - sum;

                            if (sum<CHOLESKY_TOL) return 1; /* not positive-definite */

                            l[i*NPAR+i] = sqrt(sum);

                          }

                          return 0;

                        }

                         

                        __kernel void fitParticleGaussian2D(__constant float *cutout_block,

                                                            __global float *ferr_block,

                                                            __global float *ferr2_block,

                                                            __global float *jac_block,

                                                            __global float *params_block,

                                                            const int num_particles,

                                                            const int max_iters)

                        {

                          if (get_global_id(0) >= num_particles)

                            return;

                         

                          __constant float *cutout = &cutout_block[get_global_id(0)*NY];

                          __global float *ferr = &ferr_block[get_global_id(0)*NY];

                          __global float *ferr2 = &ferr2_block[get_global_id(0)*NY];

                          __global float *jac = &jac_block[get_global_id(0)*NY*NPAR];

                          __global float *params = &params_block[get_global_id(0)*8];

                         

                          int x, i, j, it, nit, ill;

                          float lambda, up, down, grad_eps, mult, err, newerr, derr, xnorm, xdelta, ftol, xtol;

                          float h[NPAR*NPAR];

                          float ch[NPAR*NPAR];

                          float d[NPAR];

                          float delta[NPAR];

                          float newpar[NPAR];

                          nit = max_iters;

                          lambda = 0.0001f;

                          up = 10.0f;

                          down = 0.1f;

                          grad_eps = 0.0001f;

                          ftol = 1.0e-3f;

                          xtol = 1.0e-3f;

                          derr = newerr = xnorm = xdelta = mult = 0.0f;

                          ill = 0;

                         

                          /* initial guesses on parameters */

                          float par[NPAR];

                          par[0] = 0.0f; // x

                          par[1] = 0.0f; // y

                          par[2] = 3.0f; // stdev

                          par[3] = 400.0f; // amp

                          par[4] = 0.0f; // background

                         

                          /* calculate the initial error */

                          evaluate(par, cutout, ferr);

                          err = 0;

                          for (x=0; x<NY; x++)

                            err += ferr[x]*ferr[x];

                         

                          /* main iteration */

                          for (it=0; it<nit; it++) {

                         

                            /* use finite differences */

                            for (i=0; i<NPAR; i++) {

                              float temp = par[i];

                              float step = sqrt(grad_eps) * fabs(temp);

                              if (step == 0.0f)

                                step = sqrt(grad_eps);

                              par[i] += step;

                              evaluate(par, cutout, ferr2);

                             

                              for (x=0; x<NY; x++)

                                jac[x*NPAR+i] = (ferr2[x] - ferr[x]) / (par[i] - temp);

                             

                              par[i] = temp;

                            }

                         

                            /* calculate the approximation to the Hessian and the "derivative" d */

                            for (i=0; i<NPAR; i++) {

                              d[i] = 0.0f;

                              for (j=0; j<=i; j++)

                                h[i*NPAR+j] = 0;

                            }

                            for (x=0; x<NY; x++) {

                              for (i=0; i<NPAR; i++) {

                                d[i] += -ferr[x]*jac[x*NPAR+i];

                                for (j=0; j<=i; j++)

                                  h[i*NPAR+j] += jac[x*NPAR+i]*jac[x*NPAR+j];

                              }

                            }

                         

                            /*  make a step "delta."  If the step is rejected, increase

                                lambda and try again */

                            mult = 1.0f + lambda;

                            ill = 1; /* ill-conditioned? */

                            xnorm = 0.0f;

                            xdelta = 0.0f;

                         

                            while (ill && (it<nit)) {

                              for (i=0; i<NPAR; i++)

                                h[i*NPAR+i] = h[i*NPAR+i]*mult;

                         

                              ill = cholesky_decomp(ch, h);

                         

                              if (!ill) {

                                solve_axb_cholesky(ch, delta, d);

                         

                                for (i=0; i<NPAR; i++) {

                                  newpar[i] = par[i] + delta[i];

                                  xdelta += delta[i]*delta[i];

                                  xnorm += newpar[i]*newpar[i];

                                }

                                xdelta = sqrt(xdelta);

                                xnorm = sqrt(xnorm);

                         

                                evaluate(newpar, cutout, ferr);

                         

                                newerr = 0.0f;

                                for (x=0; x<NY; x++)

                                  newerr += ferr[x]*ferr[x];

                         

                                derr = 1.0f - newerr/err;

                                ill = (derr < 0.0f);

                         

                              }

                         

                              if (ill) {

                                mult = (1.0f + lambda*up)/(1.0f + lambda);

                                lambda *= up;

                                it++;

                              }

                            }

                         

                            for (i=0; i<NPAR; i++)

                              par[i] = newpar[i];

                            err = newerr;

                            lambda *= down; 

                         

                            if (!ill) {

                              // terminate on small change in error

                              if (derr<ftol) {

                                 break;

                              }

                             

                              // terminate on small step size

                              if ((xdelta/xnorm)<xtol) {

                                break;

                              }

                            }

                          }

                         

                          for (i=0; i<NPAR; i++) {

                            params[i] = par[i];

                          }

                          params[5] = model_photon_count(par);

                          params[6] = evaluate(par, cutout, ferr);

                          params[7] = it;

                        }