CUDA/C矩阵乘法

有人能告诉我我在做什么错吗？我试图创建一个程序，使用cuda将矩阵返回给电源。看起来好像cudaMemcpy（ln103）不返回结果数组。我通过返回矩阵中的第一个元素来检查它，但我总是得到0.也许我的内核有问题吗？将不胜感激任何帮助：CUDA/C矩阵乘法

编辑：我应该澄清，内核是迭代（从矩阵乘以相应的单位矩阵，然后乘以每个结果之后），直到k次矩阵的权力。

即A是矩阵甲^ 0 = I（单位矩阵）甲^ K = a ^（K-1）* A

输入：

<n> 
<power> 
<element> 
.....

代码：

#include <assert.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <sys/resource.h> 

#define BLOCK 8 
#define SIZE (BLOCK * 64) 
#define TILE_SIZE (8) 

int n; 


float * 
create_matrix_h(unsigned int w, unsigned int h) { 
    float *m; 
    m = (float *) malloc(w * h * sizeof(float)); 
    if (m == NULL) { 
    fprintf(stderr, "Failed to malloc.\n"); 
    exit(1); 
    } 
    return m; 
} 

__global__ void 
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) { 
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y; 
    const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x; 
    unsigned int t, i; 
    float result = 0, a, b; 

    for (t = 0; t < width/TILE_SIZE; ++t) { 
    for (i = 0; i != TILE_SIZE; ++i) { 
     a = m1[row*width + t*TILE_SIZE + i]; 
     b = m2[(t*TILE_SIZE + i)*width + col]; 
     result += a * b; 
    } 
    __syncthreads(); 
    } 
    m3[row*width + col] = result; 
} 

float * 
create_matrix_d(int w, int h) { 
    float *m; 
    if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) { 
    fprintf(stderr, "Failed to cudaMalloc.\n"); 
    return NULL; 
    //exit(1); 
    } 
    return m; 
} 

void 
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) { 
    int i, j = 0; 
    for (i = 0; i != w * h; ++i) { 
    m[i] = values[j]; 
    j = (j + 1) % nvalues; 
    } 
} 

int 
main(void) { 
    int k; 
    if (scanf("%d", &n) !=1 || n<1){ 
     return 0; 
    } 
    if (scanf(" %d", &k) !=1 || k<0){ 
     return 0; 
    } 
    float *hm[3], *dm[3]; 
    dim3 bdim(TILE_SIZE, TILE_SIZE); 
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE); 
    int i; 
    for(i=0; i<3; ++i) { 
     hm[i] = create_matrix_h(SIZE, SIZE); 
     dm[i] = create_matrix_d(SIZE, SIZE); 
    } 
    float tem[n*n]; 
    for(i=0; i<n*n; ++i) { 
     if (scanf(" %f", &tem[i]) !=1){ 
      return 0; 
     } 
    } 
    float temid[n*n]; 
    int j = 0; 
    for (i = 0; i != n*n; ++i) { 
     if (i==0 || i == j + (n+1)) { 
      temid[i] = 1; 
      j = i; 
     } 
     else { 
      temid[i] = 0; 
     } 
    } 
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float)); 
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float)); 
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice); 
    int w; 
    for (w=0; w<k; ++w) { 
     cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice); 
     kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE); 
      cudaThreadSynchronize(); 
     cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
     hm[1] = hm[2]; 
    } 
    printf(" %.3f ", hm[2][0]); 
    return 0; 

}

感谢您的回复pavan。现在，当我运行它时，我在内核调用中获得了下面的输入的无限循环。也是新的代码。我感谢你的帮助

新代码：

#include <assert.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <sys/resource.h> 

#define BLOCK 8 
#define SIZE (BLOCK * 64) 
#define TILE_SIZE (8) 

int n; 


float * 
create_matrix_h(unsigned int w, unsigned int h) { 
    float *m; 
    m = (float *) malloc(w * h * sizeof(float)); 
    if (m == NULL) { 
    fprintf(stderr, "Failed to malloc.\n"); 
    exit(1); 
    } 
    return m; 
} 

void 
print_matrix(const float *m, const int w, const int h) { 
    int x, y; 
    for (y = 0; y != h; ++y) { 
    for (x = 0; x != w; ++x) 
     printf("%.03f ", m[y*w + x]); 
    printf("\n"); 
    } 
} 


void 
cpu_mult(const float *m1, const float *m2, float *m3, unsigned int width) { 
    unsigned int i, j, k; 
    float result; 

    for (i = 0; i != width; ++i) { 
    for (j = 0; j != width; ++j) { 
     result = 0; 
     for (k = 0; k != width; ++k) 
     result += m1[i*width + k] * m2[k*width + j]; 
     m3[i*width + j] = result; 
    } 
    } 
} 


__global__ void 
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) { 
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y; 
    const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x; 
    unsigned int t, i; 
    float result = 0, a, b; 

    for (t = 0; t < width/TILE_SIZE; ++t) { 
    for (i = 0; i != TILE_SIZE; ++i) { 
     a = m1[row*width + t*TILE_SIZE + i]; 
     b = m2[(t*TILE_SIZE + i)*width + col]; 
     result += a * b; 
    } 
    __syncthreads(); 
    } 
    m3[row*width + col] = result; 
} 

float * 
create_matrix_d(int w, int h) { 
    float *m; 
    if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) { 
    fprintf(stderr, "Failed to cudaMalloc.\n"); 
    return NULL; 
    //exit(1); 
    } 
    return m; 
} 

void 
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) { 
    int i, j = 0; 
    for (i = 0; i != w * h; ++i) { 
    m[i] = values[j]; 
    j = (j + 1) % nvalues; 
    } 
} 

int 
main(void) { 
    int k; 
    if (scanf("%d", &n) !=1 || n<1){ 
     return 0; 
    } 
    if (scanf(" %d", &k) !=1 || k<0){ 
     return 0; 
    } 
    float *hm[3], *dm[3]; 
    dim3 bdim(TILE_SIZE, TILE_SIZE); 
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE); 
    int i; 
    for(i=0; i<3; ++i) { 
     hm[i] = create_matrix_h(SIZE, SIZE); 
     dm[i] = create_matrix_d(SIZE, SIZE); 
    } 
    float tem[n*n]; 
    for(i=0; i<n*n; ++i) { 
     if (scanf(" %f", &tem[i]) !=1){ 
      return 0; 
     } 
    } 
    float temid[n*n]; 
    int j = 0; 
    for (i = 0; i != n*n; ++i) { 
     if (i==0 || j == n) { // not j + (n+1) 
      temid[i] = 1; 
      j=0; 
     } 
     else { 
      temid[i] = 0; 
      j++; 
     } 
    } 
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float)); 
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float)); 
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice); 
    dm[1] = dm[0]; // For the first iteration Result = A * A; 
    int w; 
    if (k==0) { 
     hm[2] = hm[1]; 
    } 
    else if (k==1) { 
     hm[2] = hm[0]; 
    } 
    else { 
     for (w=1; w<k; ++w) { 
      kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE); 
      cudaThreadSynchronize(); 
      // No need to copy back to host 
      // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
      // Copy between device pointers 
      cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice); 
     } 
     cudaMemcpy(hm[2], dm[1], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
    } 



    print_matrix(hm[2], n, n); 

    return 0; 

}

来源

2011-06-03 Milk

如果这是家庭作业或课程作业，您应该如此标记它。关于作业问题是否在计算器上有效，这里有很多不同的观点。 – talonmies 2011-06-04 07:28:22

谢谢我会记住对未来 – Milk 2011-06-29 09:26:13

你正在创建单位矩阵错。

for (i = 0; i != n*n; ++i) { 
     if (i==0 || i == j + (n)) { // not j + (n+1) 
      temid[i] = 1; 
      j = i; 
     } 
     else { 
      temid[i] = 0; 
     } 
    }

实际上，您不需要乘以单位矩阵，因为您知道结果始终是输入。

更改此

fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float)); 
fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float)); 
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice); 
int w; 
for (w=0; w<k; ++w) { 
    cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice); 
    kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE); 
    cudaThreadSynchronize(); 
    cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
    hm[1] = hm[2]; 
}

到

fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float)); 
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice); 
dm[1] = dm[0]; // For the first iteration Result = A * A; 
int w; 
for (w=0; w<k; ++w) { 
    kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE); 
    cudaThreadSynchronize(); 
    // No need to copy back to host 
    // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
    // Copy between device pointers 
    cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice); 
}

来源

2011-06-03 10:33:48

嗨帕文，请参阅上面 – Milk 2011-06-03 12:54:31

嗨，你能评论你的意思是无限循环？如在打印每个循环开始时的W值？ – 2011-06-03 13:28:01

嗨。嗯，也许它不是一个循环，当我在gdb运行它只是说 “[新主题0x7ffff778b700（LWP 9650）]” “[新主题0x7ffff778a700（LWP 9651）]” 等 – Milk 2011-06-03 13:37:18

我相信我们是在同一个球场在这里。（comp2129）。为了回应你的无限循环，你的块大小/瓷砖尺寸太小了。将块设置为16并重试。虽然D：我正在收到seg故障。

来源

2011-06-03 14:58:34

哈哈！是的，我认为不少人正在感受我们的痛苦，在提交页面上有一片红海。哦，考试现在担心回合。 – Milk 2011-06-04 03:59:36

CUDA/C矩阵乘法

回答

相关问题