The code below shows the function in which I see this behavior; the disassembly is interleaved with the C++ source. The offending address is 0x004015C6 (the `mov edx,eax` immediately after the `div` in the `while` loop). When I run either the "Assess Performance" or the "Time-based Profile" session, CodeAnalyst reports this line as taking over 15000 clock cycles. Why is this, and what can I do about it?

static unsigned FindFactor(unsigned& n, unsigned limit) {
00401580 push esi
00401581 push edi
    unsigned* firstPrimes = sFirstPrimes;
    unsigned trialDivisor = 1;
    unsigned quotient = 0;
    unsigned testProduct = 0;
    unsigned localN = n;
00401582 mov edi,dword ptr [ebx]
00401584 mov esi,offset Species::sFirstPrimes (433988h)
00401589 lea esp,[esp]
    for (int i = 0; i < kNumFirstPrimes; i++) {
        trialDivisor = firstPrimes[i];
00401590 mov ecx,dword ptr [esi]
        quotient = localN / trialDivisor;
00401592 xor edx,edx
00401594 mov eax,edi
00401596 div eax,ecx
        testProduct = quotient * trialDivisor;
00401598 mov edx,eax
0040159A imul edx,ecx
        if (testProduct == localN)
0040159D cmp edx,edi
0040159F je Species::FindFactor+6Ch (4015ECh)
004015A1 add esi,4
004015A4 cmp esi,offset Species::sPrimePattern (4339B0h)
004015AA jl Species::FindFactor+10h (401590h)
    unsigned testProduct = 0;
    unsigned localN = n;
    for (int i = 0; i < kNumFirstPrimes; i++) {
        trialDivisor = firstPrimes[i];
        quotient = localN / trialDivisor;
        testProduct = quotient * trialDivisor;
        if (testProduct == localN) {
            n = quotient;
            return trialDivisor;
        }
    }
    unsigned* primePattern = sPrimePattern;
    int patternIndex = 0;
    trialDivisor += primePattern[patternIndex++];
004015AC add ecx,dword ptr [Species::sPrimePattern (4339B0h)]
004015B2 mov esi,1
    while (trialDivisor <= limit)
004015B7 cmp ecx,0F4240h
004015BD ja Species::FindFactor+64h (4015E4h)
004015BF nop
    {
        quotient = localN / trialDivisor;
004015C0 xor edx,edx
004015C2 mov eax,edi
004015C4 div eax,ecx
        testProduct = quotient * trialDivisor;
004015C6 mov edx,eax
004015C8 imul edx,ecx
        if (testProduct == localN)
004015CB cmp edx,edi
004015CD je Species::FindFactor+6Ch (4015ECh)
        {
            n = quotient;
            return trialDivisor;
        }
        // These trial divisors aren't all prime, but the composites are divisible
        // by primes that came before, so there's no danger of this function returning
        // a composite divisor. The only problem is a few wasted divisions.
        trialDivisor += sPrimePattern[patternIndex];
004015CF add ecx,dword ptr Species::sPrimePattern (4339B0h)[esi*4]
        patternIndex = (patternIndex + 1) & 7;
004015D6 add esi,1
004015D9 and esi,7
004015DC cmp ecx,0F4240h
004015E2 jbe Species::FindFactor+40h (4015C0h)
004015E4 pop edi
    }
    return 1;
004015E5 mov eax,1
004015EA pop esi
}

I ran the "Investigate Branching" profile and got the following results for address 0x004015C6:

These numbers are far, far larger than the numbers I'm getting for any other part of the program. I understand the idea of branches, mispredictions, and retired instructions, but I don't understand why this particular line might be giving these results. CodeAnalyst makes it look like this is a terrible performance bottleneck in my program and I don't know what to do about it.