#include <iostream> 
#include <cstdlib>   // rand, srand 
#include <ctime>     // time 
#include <assert.h> 
#include <sys/time.h> 

#define BLOCK_SIZE 32 // CUDA block size 

__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) { 
    if (row<matSize && col<matSize) {return matrix[row*matSize + col];} 
    return 0; 
} 

__device__ inline int getValFromVector(int* vector, int row, int matSize) { 
    if (row<matSize) {return vector[row];} 
    return 0; 
} 

__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) { 
    __shared__ int aRowShared[BLOCK_SIZE]; 
    __shared__ int bShared[BLOCK_SIZE]; 
    __shared__ int myRow; 
    __shared__ double rowSum; 

    int myIndexInBlock = threadIdx.x; 
    myRow = blockIdx.x; 
    rowSum = 0; 

    for (int m = 0; m < (matSize/BLOCK_SIZE + 1);m++) { 
     aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize); 
     bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize); 

     __syncthreads(); // Sync threads to make sure all fields of aRowShared and bShared have been written by all threads in the block 

     if (myIndexInBlock==0) { 
      for (int k=0;k<BLOCK_SIZE;k++) { 
       rowSum += aRowShared[k] * bShared[k]; 
      } 
     } 

     __syncthreads(); // Sync again so the next iteration does not overwrite aRowShared/bShared while thread 0 is still summing them 
    } 

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;} 
} 

static inline void cudaCheckReturn(cudaError_t result) { 
    if (result != cudaSuccess) { 
     std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl; 
     assert(result == cudaSuccess); 
    } 
} 

static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) { 
    matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row 

    cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost)); 
} 


static void matVecMult(int** A,int* b, int* c, int matSize) { 
    // Sequential implementation: 
    for (int i=0;i<matSize;i++) { 
     c[i]=0; 
     for (int j=0;j<matSize;j++) { 
      c[i]+=(A[i][j] * b[j]); 
     } 
    } 
} 

int main() { 
    int matSize = 1000; 

    int** A,* b,* c; 
    int* aOnGPU,* bOnGPU,* cOnGPU; 

    A = new int*[matSize]; 
    for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();} 
    b = new int[matSize](); 
    c = new int[matSize](); 

    int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int); 

    cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch? 
    cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU)); 
    cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU)); 

    srand(time(NULL)); 

    for (int i=0;i<matSize;i++) { 
     b[i] = rand()%100; 
     for (int j=0;j<matSize;j++) { 
      A[i][j] = rand()%100; 
     } 
    } 

    for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));} 
    cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice)); 

    int iters=1; 
    timeval start,end; 

    // Sequential run: 
    gettimeofday(&start,NULL); 
    for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);} 
    gettimeofday(&end,NULL); 
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl; 

    // CUDA run: 
    gettimeofday(&start,NULL); 
    for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);} 
    gettimeofday(&end,NULL); 
    std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl; 

    cudaCheckReturn(cudaFree(aOnGPU)); 
    cudaCheckReturn(cudaFree(bOnGPU)); 
    cudaCheckReturn(cudaFree(cOnGPU)); 


    for (int i = 0; i < matSize; ++i) { 
     delete[] A[i]; 
    } 
    delete[] A; 
    delete[] b; 
    delete[] c; 
} 

Can't get matrix*vector multiplication to run faster in CUDA than on the CPU. Running the program above gives (sequential, then CUDA, in microseconds):

267171 
580253 

I followed the guide at http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory on how to do a matrix multiplication. I use shared memory for both the matrix (A) and the vector (B), but no matter which matrix size (100*100 up to 20000*20000) or block size (32 to 1024) I choose, the sequential implementation always beats the CUDA implementation, being roughly twice as fast.

Since this is a matrix*vector multiplication, the shared arrays and blocks are handled a bit differently: I use one block per row of the matrix instead of a two-dimensional block over a tile of the matrix.

Is my implementation wrong, or is CUDA simply not faster than the CPU?

Answer

First: your CUDA implementation performs bounds checks that the CPU version does not. Branching, especially divergent branching within a warp, is expensive on the GPU.
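
For illustration, a minimal sketch of how the kernel's tile loop could look without the per-element bounds checks, assuming the matrix and vector have been zero-padded on the host so that their size is a multiple of BLOCK_SIZE (the padding step itself is not shown; the names mirror the code in the question):

__global__ void matVecMultCUDAKernelPadded(int* aOnGPU, int* bOnGPU, int* cOnGPU, int paddedSize) {
    __shared__ int aRowShared[BLOCK_SIZE];
    __shared__ int bShared[BLOCK_SIZE];

    int myIndexInBlock = threadIdx.x;
    int myRow = blockIdx.x;
    int rowSum = 0;

    // paddedSize is assumed to be a multiple of BLOCK_SIZE, so every load is
    // in bounds and no divergent branch is needed.
    for (int m = 0; m < paddedSize/BLOCK_SIZE; m++) {
        aRowShared[myIndexInBlock] = aOnGPU[myRow*paddedSize + m*BLOCK_SIZE + myIndexInBlock];
        bShared[myIndexInBlock] = bOnGPU[m*BLOCK_SIZE + myIndexInBlock];
        __syncthreads();

        if (myIndexInBlock==0) {
            for (int k=0;k<BLOCK_SIZE;k++) {rowSum += aRowShared[k] * bShared[k];}
        }
        __syncthreads(); // don't let the next tile overwrite the shared arrays while thread 0 is still summing
    }

    if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}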

Second: you include the cudaMemcpy in the CUDA timing. It is very rare to perform only a single multiplication before transferring the result back to the CPU. Usually (for example in the conjugate gradient method) you perform hundreds of multiplications on the GPU before copying back.
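
A minimal sketch of how the kernel time could be measured separately from the transfer, using CUDA events instead of gettimeofday (the event variable names are placeholders, not from the question's code):

cudaEvent_t startEvent, stopEvent;
cudaCheckReturn(cudaEventCreate(&startEvent));
cudaCheckReturn(cudaEventCreate(&stopEvent));

cudaCheckReturn(cudaEventRecord(startEvent));
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize);
cudaCheckReturn(cudaEventRecord(stopEvent));
cudaCheckReturn(cudaEventSynchronize(stopEvent));

float kernelMs = 0.0f;
cudaCheckReturn(cudaEventElapsedTime(&kernelMs,startEvent,stopEvent)); // kernel only, cudaMemcpy excluded
std::cout << "Kernel time (ms): " << kernelMs << std::endl;

cudaCheckReturn(cudaEventDestroy(startEvent));
cudaCheckReturn(cudaEventDestroy(stopEvent));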

Third: don't implement this yourself (except for educational purposes); use a vendor library such as cuBLAS, which ships with every CUDA release and is very hard to beat.
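
For comparison, a minimal sketch of the same product with cuBLAS. Two assumptions: cuBLAS gemv operates on floating-point data, so the int matrix and vector from the question would first have to be stored as float on the device, and cuBLAS uses column-major storage, so the row-major matrix is passed with CUBLAS_OP_T; the dA/dB/dC names are placeholders:

#include <cublas_v2.h>

// dA: matSize*matSize floats (row-major) on the GPU, dB/dC: matSize floats on the GPU.
static void matVecMultCublas(cublasHandle_t handle, const float* dA, const float* dB, float* dC, int matSize) {
    const float alpha = 1.0f, beta = 0.0f;
    // The row-major dA is its own transpose from cuBLAS's column-major point of view,
    // so CUBLAS_OP_T computes dC = A * dB.
    cublasSgemv(handle, CUBLAS_OP_T, matSize, matSize, &alpha, dA, matSize, dB, 1, &beta, dC, 1);
}

The handle is created once with cublasCreate(&handle) and released with cublasDestroy(handle), outside of any timing loop.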