1
我想计算一个大矩阵的总和,而当我使用多个线程或仅使用一个线程时,我目前看不到性能改进。我认为这个问题与假分享有关,但我也在我的结构中添加了填充。请看一看!防止使用填充虚假分享
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#define WIDTH 20000
pthread_mutex_t mylock = PTHREAD_MUTEX_INITIALIZER;
struct split { // sizeof(split) = 24
int start;
int end;
int* matrix;
int i;
char padding[64 - 24]; //Padding the private sum variables forces them into separate cache lines and removes false sharing. Assume cache line is 64 bytes
};
int ran(){
return rand() % 21;
}
int* createBigMatrix(){
int* a = malloc(sizeof(int)* WIDTH * WIDTH);
for (int i = 0; i < WIDTH * WIDTH; i ++){
a[i] = ran(); // fill up the matrix with random numbers
}
return a;
}
static int finalSum;
void* partialSum(void* arg){
struct split* a = arg;
int totalSum = 0; // create local variable
int i;
for (i = a->start; i <= a->end; i ++){
totalSum += a->matrix[i];
}
pthread_mutex_lock(&mylock);
finalSum += totalSum; // critical section
pthread_mutex_unlock(&mylock);
free(a);
return 0;
}
int main(){ //-294925289
int useMultiThreads = 1; // there is no difference between using one thread or 4 therads
finalSum = 0;
pthread_t thread_ids[4];
// i want a square matrix of npages width
int* c = createBigMatrix();
printf("%lu\n", sizeof(struct split));
if (useMultiThreads){
// split the tasks evenly amoung 4 threads
// since there are 20,000x20,000, there must be 400,000,000 cells
int start[] = {0, 100000000, 200000000, 300000000};
int end[] = {99999999, 199999999, 299999999, 399999999};
// calculate sum
for (int i = 0; i < 4; i ++){
struct split* a = malloc(sizeof(struct split));
a->start = start[i];
a->end = end[i];
a->matrix = c;
pthread_create(thread_ids + i, NULL, partialSum, a);
}
for (int i = 0; i < 4; i ++){ // join em up
pthread_join(thread_ids[i], NULL);
}
}
else { // use single thread
for (int i = 0; i <= 399999999; i ++){
finalSum += c[i];
}
}
printf("total sum is %d\n", finalSum);
/*
real 0m4.871s
user 0m4.844s
sys 0m0.392s
*/
free(c);
return 0;
}
由于线程使用的矩阵索引不重叠,因此似乎没有太多的虚假共享空间,无论如何,填充参数结构都无济于事。你如何衡量所花费的时间?在我看来,这个过程的整体表现将会在求和开始之前通过创建和加载巨大的阵列而被主导? –
要小心你的索引,因为'int'肯定不是大矩阵的正确类型。还要考虑''for''循环中'a->'的使用。编译器无法知道'* a'是否可能在引擎盖下改变,所以他必须在每次迭代时重新加载。你可以将'a'改为'restrict'限定,但更简单的方法是将值(边界和矩阵)加载到局部变量中并在循环中使用它们。 –