I am not using prefetch and movnt(q).
Why is my best result well below 2 GB/s (about 700 MB/s)?
Why is my worst result as low as 18 MB/s?
What should I know about the hardware architecture
to understand this? Links, docs?
(note: For sequential movntq i get 1748.86 MB/s.)
(Possible explanations: the cache is getting in the way, the memory address lines are multiplexed, ...?)
------------------------------------------------------------------------
Cpu: AMD Thorton 2 GHz (socket A)
Cache: L1 64KB D/I L2 256 KB full speed
Memory: DDR1 133/266 MHz FSB capacity 1.2 GB speed (should be) 2100 MB/s
Chipset: VIA KT400
------------------------------------------------------------------------
Test algorithm:
The address starts at the base address.
The address is incremented by a (constant) jump value,
and when it reaches the end of the memory chunk
it is reset to the base address incremented by 1.
The process then repeats
until the base address increment equals the jump value
or equals the memory chunk size.
(
every n_th address is read then every (n_th)+1
then every (n_th)+2 and so on... until every (n_th)+(jump-1)
thus reading the whole memory chunk
)
Memory chunk size is 4MB.
Memory is tested for speed with different jump values.
Note: the memory test code with the memory transfers commented out
runs at about the same speed for different jump values.
-------------------------------------------------------------------------
Test code:
#include<windows.h>
#include<stdlib.h>
#include<stdio.h>
/*
 * test1 - strided read of a 4 MiB buffer, written in x86 inline assembler
 * (MSVC `_asm` syntax, 32-bit only).
 *
 * m    : start of a buffer of at least 4194304 bytes.
 * jump : stride in 4-byte elements.  Values <= 0 read nothing.
 *
 * Pass p (p = 0, 1, ...) reads the dwords at byte offsets
 * 4*p, 4*p + 4*jump, 4*p + 8*jump, ... below the end of the buffer.
 * Passes stop once the pass offset reaches the stride or the buffer size.
 *
 * Register use inside the block:
 *   ecx = buffer base          esi = one past the end of the buffer
 *   edx = stride in bytes      ebx = byte offset of the current pass
 *   edi = address being read   eax = scratch destination of each load
 *
 * Everything lives in ONE _asm block: the compiler does not promise that
 * register contents survive from one _asm block to the next.
 */
void test1(long* m,long jump){
_asm{
    mov edi,m               // buffer base (the source operand was missing)
    mov esi,edi
    add esi,4194304         // esi = end of the 4 MiB chunk
    mov ecx,edi
    mov ebx,0
    mov edx,jump
    cmp edx,0
    jle done                // a zero stride would never leave read_loop
    shl edx,2               // elements -> bytes
next_pass:
    mov edi,ecx
    add edi,ebx             // edi = base + offset of this pass
read_loop:
    mov eax,[edi]           // the load being timed; the value is discarded
    add edi,edx
    cmp edi,esi
    jb read_loop
    add ebx,4               // next pass starts one element further in
    cmp ebx,edx
    jae done                // every residue modulo the stride was visited
    cmp ebx,4194304
    jae done                // pass offset ran past the end of the chunk
    jmp next_pass
done:
}
}
/*
 * test2 - strided read of a 4 MiB buffer, plain C version.
 *
 * m    : start of a buffer of at least 4194304 bytes; it is only read.
 * jump : stride in elements of type long.  Values <= 0 read nothing.
 *
 * Pass p (p = 0 .. jump-1) reads m[p], m[p+jump], m[p+2*jump], ... so that
 * all passes together touch every element of the chunk exactly once.
 */
void test2(long* m,long jump){
    enum { CHUNK_BYTES = 4194304 };
    /* Derive the element count from the element size instead of assuming
       4-byte longs, so the loop never runs past a 4 MiB buffer. */
    const long count = (long)(CHUNK_BYTES / sizeof *m);
    /* Reading through a volatile pointer forces every load to be issued;
       otherwise an optimiser may drop the loads because the value is unused. */
    const volatile long *src = m;
    long pass;
    long adr;
    long a = 0;

    /* A stride beyond the chunk behaves like a stride equal to the chunk
       (one element per pass); clamping keeps adr += jump from overflowing. */
    if (jump > count) jump = count;

    for (pass = 0; pass < jump && pass < count; pass++) {
        for (adr = pass; adr < count; adr += jump) {
            a = src[adr];
        }
    }
    (void)a;
}
void main(){
double cur,best=0,worst=3000;
int j;
long i;
int tick;
FILE *f;
long *m=(long *)malloc(4194304);
f=fopen("test2.txt","w");
for(i=1;i<=524288;i<<=1){
tick=GetTickCount();
for(j=0;j<1000;j++) test2(m,i); //--------------------------
tick=GetTickCount()-tick;
cur=(double)( 4194304/((double)tick/1000000)/(1024*1024) );
if(cur>best)best=cur;
if(cur<worst)worst=cur;
printf("\n speed=%5.2f MB/s jump= %d *4 bytes",cur,i);
fprintf(f,"\n speed=%5.2f MB/s jump= %d *4 bytes",cur,i);
}
printf("\n\n best=%5.2f MB/s worst=%5.2f MB/s \n",best,worst);
fprintf(f,"\n\n best=%5.2f MB/s worst=%5.2f MB/s \n",best,worst);
fclose(f);
}
--------------------------------------------------------------------------
Test results:
test1 assembler:
speed=644.85 MB/s jump= 1 *4 bytes
speed=453.10 MB/s jump= 2 *4 bytes
speed=307.69 MB/s jump= 4 *4 bytes
speed=190.90 MB/s jump= 8 *4 bytes
speed=101.79 MB/s jump= 16 *4 bytes
speed=109.40 MB/s jump= 32 *4 bytes
speed=105.35 MB/s jump= 64 *4 bytes
speed=96.93 MB/s jump= 128 *4 bytes
speed=90.36 MB/s jump= 256 *4 bytes
speed=80.65 MB/s jump= 512 *4 bytes
speed=71.93 MB/s jump= 1024 *4 bytes
speed=67.32 MB/s jump= 2048 *4 bytes
speed=37.19 MB/s jump= 4096 *4 bytes
speed=18.85 MB/s jump= 8192 *4 bytes
speed=18.79 MB/s jump= 16384 *4 bytes
speed=18.80 MB/s jump= 32768 *4 bytes
speed=201.89 MB/s jump= 65536 *4 bytes
speed=475.85 MB/s jump= 131072 *4 bytes
speed=715.05 MB/s jump= 262144 *4 bytes
speed=677.28 MB/s jump= 524288 *4 bytes
best=715.05 MB/s worst=18.79 MB/s
test2 c++ :
speed=436.87 MB/s jump= 1 *4 bytes
speed=261.76 MB/s jump= 2 *4 bytes
speed=139.81 MB/s jump= 4 *4 bytes
speed=101.11 MB/s jump= 8 *4 bytes
speed=71.99 MB/s jump= 16 *4 bytes
speed=98.35 MB/s jump= 32 *4 bytes
speed=95.42 MB/s jump= 64 *4 bytes
speed=89.89 MB/s jump= 128 *4 bytes
speed=71.87 MB/s jump= 256 *4 bytes
speed=68.05 MB/s jump= 512 *4 bytes
speed=64.61 MB/s jump= 1024 *4 bytes
speed=67.67 MB/s jump= 2048 *4 bytes
speed=37.14 MB/s jump= 4096 *4 bytes
speed=18.79 MB/s jump= 8192 *4 bytes
speed=18.75 MB/s jump= 16384 *4 bytes
speed=18.77 MB/s jump= 32768 *4 bytes
speed=195.71 MB/s jump= 65536 *4 bytes
speed=198.45 MB/s jump= 131072 *4 bytes
speed=316.83 MB/s jump= 262144 *4 bytes
speed=460.46 MB/s jump= 524288 *4 bytes
best=460.46 MB/s worst=18.75 MB/s
(What does random access memory stand for again?)