All threads execute nu_buffer(), which calls random() three times per loop iteration (i.e. once per buffer element, three calls each).
Also observe the numbers you posted: on ARM you get "sys 0m25.370s". This program does no I/O at all and makes no syscalls itself. What could cause this? Note: locks on Linux typically boil down to futexes, whose slow (contended) path goes through the kernel (ouchies).
Also I wouldn't be at all surprised if "while (i--) buffer[i]= (random()-random())/(random() + 1.0);" is significantly more expensive than "while (i--) buffer[i]= (((double*) arg)[i] + buffer[i]) / 2.0;". Depending on the PRNG used by the libc, quite a few of these are very slow, much slower than 2 FLOPs (add, div).
---
Ok. Let's do this.
Your version (bench1.c).
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
/* Size of each work buffer in bytes (it is passed straight to malloc()).
 * NOTE(review): the expansion is not parenthesized; the uses in this program
 * (malloc(BUF_SIZE) and BUF_SIZE/8) still evaluate as intended, but e.g.
 * `x % BUF_SIZE` would not. */
#define BUF_SIZE 16*1024*1024
/* Handles of the four worker threads created and joined by do_threads_stuff(). */
pthread_t t1, t2, t3, t4;
/*
 * Allocate a BUF_SIZE-byte buffer and fill it with pseudo-random doubles.
 *
 * Every element is computed from three calls to random(). Returns the
 * malloc()ed buffer; the malloc() result is not checked before it is written.
 */
double* nu_buffer () {
double* buffer= malloc(BUF_SIZE);
/* Element count, treating each double as 8 bytes. */
int i= BUF_SIZE/8;
/* Fills from the last element down to index 0. C does not specify the order
 * in which the three random() calls within this expression are evaluated. */
while (i--) buffer[i]= (random()-random())/(random() + 1.0);
return buffer;
}
/*
 * Thread entry point.
 *
 * arg is read as an array of BUF_SIZE/8 doubles. The thread builds its own
 * random buffer with nu_buffer() and replaces each element with the mean of
 * that element and the corresponding element of arg. arg is only read.
 *
 * Returns the thread's own buffer (as void*), which the joiner receives via
 * pthread_join().
 */
void* thread_proc (void* arg) {
int i= BUF_SIZE/8;
double* buffer= nu_buffer();
/* Walks from the last element down to index 0. */
while (i--) buffer[i]= (((double*) arg)[i] + buffer[i]) / 2.0;
return buffer;
}
/*
 * Start four threads running thread_proc on the same input buffer, then join
 * each thread and free the buffer it returned.
 *
 * NOTE(review): `r` below is an uninitialized void**. pthread_join() stores
 * the thread's return value through that pointer and free(*r) then reads
 * through it again, so every join/free pair accesses an indeterminate
 * address (undefined behavior).
 */
void do_threads_stuff (double* buffer) {
pthread_create(&t1, NULL, thread_proc, buffer);
pthread_create(&t2, NULL, thread_proc, buffer);
pthread_create(&t3, NULL, thread_proc, buffer);
pthread_create(&t4, NULL, thread_proc, buffer);
/* Never assigned before it is used below. */
void** r;
pthread_join(t1, r), free(*r);
pthread_join(t2, r), free(*r);
pthread_join(t3, r), free(*r);
pthread_join(t4, r), free(*r);
}
/*
 * Build one random input buffer and hand it to the four worker threads.
 * The buffer is not freed: the free() call below is commented out.
 */
int main (void) {
double* buffer= nu_buffer();
do_threads_stuff(buffer);
//free(buffer); return 0;
}
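Note the memory corruption in do_threads_stuff: r is an uninitialized void**, so pthread_join() writes the thread's return value through a garbage pointer and free(*r) reads it back from there. Let's fix that first, shall we?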
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
/* Size of each work buffer in bytes (it is passed straight to malloc()).
 * The expansion is parenthesized so the macro behaves as a single value
 * inside any larger expression, e.g. `x % BUF_SIZE`. */
#define BUF_SIZE (16*1024*1024)
/* Handles of the four worker threads created and joined by do_threads_stuff(). */
pthread_t t1, t2, t3, t4;
/*
 * Allocate a BUF_SIZE-byte buffer and fill every element with a
 * pseudo-random double built from three random() draws.
 *
 * Returns the malloc()ed buffer; the caller owns it and must free() it.
 * The process is aborted if the allocation fails, so the result is never
 * NULL.
 */
double* nu_buffer (void) {
    double *buffer = malloc(BUF_SIZE);
    if (buffer == NULL) {
        perror("malloc");
        abort();
    }

    /* Fill from the last element down to index 0. */
    for (size_t i = BUF_SIZE / sizeof *buffer; i-- > 0; ) {
        /* Draw one value per statement: inside a single expression C leaves
         * the order of the three random() calls unspecified. */
        long first = random();
        long second = random();
        long third = random();
        buffer[i] = (first - second) / (third + 1.0);
    }
    return buffer;
}
/*
 * Thread entry point.
 *
 * arg is read as an array of BUF_SIZE/8 doubles and is never written. The
 * thread creates its own buffer with nu_buffer() and overwrites each element
 * with the mean of that element and the matching element of arg.
 *
 * Returns the thread's own buffer, which the joiner receives through
 * pthread_join().
 */
void* thread_proc (void* arg) {
    int remaining = BUF_SIZE/8;
    double *mine = nu_buffer();
    double *shared = arg;

    /* Walk from the last element down to index 0. */
    for (; remaining-- > 0; ) {
        double mean = (shared[remaining] + mine[remaining]) / 2.0;
        mine[remaining] = mean;
    }
    return mine;
}
/*
 * Run thread_proc on four worker threads that all read the same input
 * `buffer`, then wait for each worker and free the buffer it returned.
 *
 * The thread handles are stored in the file-scope t1..t4. `buffer` itself is
 * not freed here. A worker that could not be started is reported on stderr
 * and skipped when joining; a failed join is reported and nothing is freed
 * for that worker.
 */
void do_threads_stuff (double* buffer) {
    pthread_t *workers[] = { &t1, &t2, &t3, &t4 };
    enum { NUM_WORKERS = sizeof workers / sizeof workers[0] };
    int started[NUM_WORKERS];

    for (int k = 0; k < NUM_WORKERS; k++) {
        int rc = pthread_create(workers[k], NULL, thread_proc, buffer);
        started[k] = (rc == 0);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed (error %d)\n", rc);
        }
    }

    for (int k = 0; k < NUM_WORKERS; k++) {
        /* Joining a handle that was never created is undefined. */
        if (!started[k]) {
            continue;
        }

        void *result = NULL;
        int rc = pthread_join(*workers[k], &result);
        if (rc != 0) {
            fprintf(stderr, "pthread_join failed (error %d)\n", rc);
            continue;
        }
        free(result);
    }
}
/*
 * Build one random input buffer, let the four worker threads process it,
 * then release it.
 */
int main (void) {
    double *buffer = nu_buffer();
    do_threads_stuff(buffer);
    /* do_threads_stuff() has joined its workers, so nothing reads the input
     * buffer any more. */
    free(buffer);
    return 0;
}
Ok. Let's run that.
3.06user 1.63system 0:01.39elapsed 337%CPU (0avgtext+0avgdata 83264maxresident)k
0inputs+0outputs (0major+643minor)pagefaults 0swaps
OK, pretty much the same bad result you got. Note the high system time.
Now let's fix that random() mess.
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* Size of each work buffer in bytes (it is passed straight to malloc()).
 * The expansion is parenthesized so the macro behaves as a single value
 * inside any larger expression, e.g. `x % BUF_SIZE`. */
#define BUF_SIZE (16*1024*1024)
/* Handles of the four worker threads created and joined by do_threads_stuff(). */
pthread_t t1, t2, t3, t4;
/*
 * Allocate a BUF_SIZE-byte buffer and fill every element with a
 * pseudo-random double built from three random_r() draws.
 *
 * The generator state (rd / sb) is local to this call and is seeded with a
 * single random() call, so the fill loop only touches that local state.
 *
 * Returns the malloc()ed buffer; the caller owns it and must free() it.
 * The process is aborted if the allocation or the generator setup fails, so
 * the result is never NULL.
 */
double* nu_buffer (void) {
    double *buffer = malloc(BUF_SIZE);
    if (buffer == NULL) {
        perror("malloc");
        abort();
    }

    struct random_data rd;
    char sb[64];
    /* rd is zeroed before it is handed to initstate_r(). */
    memset(&rd, 0, sizeof rd);
    if (initstate_r(random(), sb, sizeof sb, &rd) != 0) {
        perror("initstate_r");
        abort();
    }

    /* Fill from the last element down to index 0. */
    for (size_t i = BUF_SIZE / sizeof *buffer; i-- > 0; ) {
        int r1, r2, r3;
        random_r(&rd, &r1);
        random_r(&rd, &r2);
        random_r(&rd, &r3);
        buffer[i] = (r1 - r2) / (r3 + 1.0);
    }
    return buffer;
}
/*
 * Thread entry point.
 *
 * arg is read as an array of BUF_SIZE/8 doubles and is never written. The
 * thread creates its own buffer with nu_buffer() and overwrites each element
 * with the mean of that element and the matching element of arg.
 *
 * Returns the thread's own buffer, which the joiner receives through
 * pthread_join().
 */
void* thread_proc (void* arg) {
    int remaining = BUF_SIZE/8;
    double *mine = nu_buffer();
    double *shared = arg;

    /* Walk from the last element down to index 0. */
    for (; remaining-- > 0; ) {
        double mean = (shared[remaining] + mine[remaining]) / 2.0;
        mine[remaining] = mean;
    }
    return mine;
}
/*
 * Run thread_proc on four worker threads that all read the same input
 * `buffer`, then wait for each worker and free the buffer it returned.
 *
 * The thread handles are stored in the file-scope t1..t4. `buffer` itself is
 * not freed here. A worker that could not be started is reported on stderr
 * and skipped when joining; a failed join is reported and nothing is freed
 * for that worker.
 */
void do_threads_stuff (double* buffer) {
    pthread_t *workers[] = { &t1, &t2, &t3, &t4 };
    enum { NUM_WORKERS = sizeof workers / sizeof workers[0] };
    int started[NUM_WORKERS];

    for (int k = 0; k < NUM_WORKERS; k++) {
        int rc = pthread_create(workers[k], NULL, thread_proc, buffer);
        started[k] = (rc == 0);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed (error %d)\n", rc);
        }
    }

    for (int k = 0; k < NUM_WORKERS; k++) {
        /* Joining a handle that was never created is undefined. */
        if (!started[k]) {
            continue;
        }

        void *result = NULL;
        int rc = pthread_join(*workers[k], &result);
        if (rc != 0) {
            fprintf(stderr, "pthread_join failed (error %d)\n", rc);
            continue;
        }
        free(result);
    }
}
/*
 * Build one random input buffer, let the four worker threads process it,
 * then release it.
 */
int main (void) {
    double *shared_input = nu_buffer();

    do_threads_stuff(shared_input);

    free(shared_input);
    return 0;
}
Result:
0.16user 0.00system 0:00.06elapsed 258%CPU (0avgtext+0avgdata 83176maxresident)k
0inputs+0outputs (0major+639minor)pagefaults 0swaps
On this machine the difference is even a bit closer to a factor of 20 (1.39 s vs. 0.06 s elapsed).