I have written this test application: it iterates from 0 to 9999, and for each integer in that range it evaluates a useless but computation-intensive function; the program outputs the sum of the function values. To distribute the work across several threads I use InterlockedIncrement — if the iteration number after the increment is < 10000, the thread processes that iteration; otherwise the thread terminates.
I am wondering why it does not scale as well as I would like. With 5 threads it runs in 8 s versus 36 s with a single thread, which is only a ~4.5× speedup. In my experiments with OpenMP (on my personal Phenom II X4 computer, on slightly different problems) I was getting much better scalability.
The source code is shown below.
The application is built with Visual Studio 2008. I am running Windows7 OS on a Phenom II X6 desktop. I have turned off CoolNQuiet in the BIOS settings, just in case... Don't know what other parameters might be relevant.
Could you please help me explain this sub-optimal scalability? Many thanks.
// System and third-party headers belong in angle brackets; the quoted form
// searches the current directory first and is conventionally reserved for
// project-local headers. (The forum's HTML escaping stripped the originals.)
#include <boost/thread.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <vector>
#include <windows.h>
#include <iostream>
#include <cmath>
#include <omp.h>
// Kept from the original snippet; file-scope using-directives are discouraged
// in headers / larger projects.
using namespace std;
using namespace boost;
// Per-thread result slot: how many iterations the thread claimed and the
// partial sum of f() over those iterations. Each worker writes its slot
// exactly once, after its loop finishes.
struct sThreadData
{
    sThreadData()
    {
        iterCount = 0;
        value = 0.0;
    }
    unsigned iterCount; // iterations processed by this thread
    double value;       // partial sum of f() values
};
// Global iteration counter shared by all worker threads; advanced atomically
// via InterlockedIncrement. NOTE(review): volatile matches the older
// Interlocked* API signatures — it does NOT by itself make plain reads/writes
// thread-safe; only the Interlocked calls are atomic.
volatile LONG g_globalCounter;
// Total number of iterations to distribute across the worker threads.
const LONG g_maxIter = 10000;
// Deliberately heavy busy-work: 100,000 transcendental operations per call.
// Exists only to give each iteration a measurable CPU cost.
double f( double x )
{
    double acc = 0.12345777 * x;
    unsigned remaining = 100000;
    while( remaining-- != 0 )
        acc = std::sqrt( std::log( 1.0 + acc ) * acc );
    return acc;
}
void ThreadProc( shared_ptr data )
{
double threadValue = 0.0;
unsigned threadCount = 0;
while( true )
{
LONG iterIndex = InterlockedIncrement( &g_globalCounter );
if( iterIndex >= g_maxIter )
break;
++threadCount;
threadValue += f(iterIndex);
}
data->value = threadValue;
data->iterCount = threadCount;
}
// Spawns threadCount workers that share the 0..9999 iteration space via an
// atomic counter, then joins them, sums the per-thread partial results, and
// prints the wall time, the total, and each thread's iteration count.
// NOTE: every subscript below ([i]) was stripped by the forum's HTML escaping;
// without them the code does not compile.
int main()
{
    const unsigned threadCount = 5;

    // One result slot per worker, allocated up front so workers never touch
    // the vector itself.
    vector< shared_ptr< sThreadData > > threadData;
    for( unsigned i = 0; i < threadCount; ++i )
        threadData.push_back( make_shared< sThreadData >() );

    g_globalCounter = 0;
    DWORD t1 = GetTickCount(); // ~10-16 ms resolution; adequate for multi-second runs

    double sum = 0.0;
    vector< shared_ptr< thread > > threads;
    for( unsigned i = 0; i < threadCount; ++i )
        threads.push_back( make_shared< thread >( &ThreadProc, threadData[i] ) );

    // Join in creation order and fold each thread's partial sum into the total.
    for( unsigned i = 0; i < threadData.size(); ++i )
    {
        threads[i]->join();
        sum += threadData[i]->value;
    }

    // the OMP code below behaves similarly to my "manual" parallel code.
    // omp_set_num_threads( threadCount );
    //#pragma omp parallel for reduction(+:sum) schedule(static)
    // for( int i = 0; i < 10000; ++i )
    // sum += f(i);

    DWORD t2 = GetTickCount();
    cout << "T=" << static_cast< double >(t2 - t1) / 1000.0 << "s\n";
    cout << "Sum= " << sum << "\n";
    for( unsigned i = 0; i < threadData.size(); ++i )
        cout << threadData[i]->iterCount << "\n";
    return 0;
}
Message was edited by: Alexander Chertov — restored angle brackets that the forum's HTML escaping had stripped from the code listing.