
I'm having trouble with this CUDA code I wrote. It is supposed to be a CUDA implementation of Dijkstra's algorithm. The code is as follows:

__global__ void cuda_dijkstra_kernel_1(float* Va, int* Ea, int* Sa, float* Ca, float* Ua, char* Ma, unsigned int* lock){ 

    int tid = blockIdx.x; 
    if(Ma[tid]=='1'){ 
        Ma[tid] = '0'; 
        int ind_Ea    = Sa[tid * 2]; 
        int num_edges = Sa[(tid * 2) + 1]; 
        int v; 
        float wt = 0; 
        unsigned int leaveloop; 
        leaveloop = 0u; 
        while(leaveloop==0u){ 
            if(atomicExch(lock, 1u) == 0u){ 
                for(v = 0; v < num_edges; v++){ 
                    wt = (Va[tid * 3]       - Va[Ea[ind_Ea + v] * 3])       * (Va[tid * 3]       - Va[Ea[ind_Ea + v] * 3]) + 
                         (Va[(tid * 3) + 1] - Va[(Ea[ind_Ea + v] * 3) + 1]) * (Va[(tid * 3) + 1] - Va[(Ea[ind_Ea + v] * 3) + 1]) + 
                         (Va[(tid * 3) + 2] - Va[(Ea[ind_Ea + v] * 3) + 2]) * (Va[(tid * 3) + 2] - Va[(Ea[ind_Ea + v] * 3) + 2]); 
                    wt = sqrt(wt); 

                    if(Ca[Ea[ind_Ea + v]] > (Ca[tid] + wt)){ 
                        Ca[Ea[ind_Ea + v]] = Ca[tid] + wt; 
                        Ma[Ea[ind_Ea + v]] = '1'; 
                    } 
                    __threadfence(); 
                    leaveloop = 1u; 
                    atomicExch(lock, 0u); 
                } 
            } 
        } 
    } 
}

The problem is in the relaxation phase of Dijkstra's algorithm. I have implemented that phase as a critical section. If there is a vertex (say, a) that is a neighbor of several vertices (i.e., connected to them by edges), then the threads of all those vertices will try to write to vertex a's location in the cost array Ca. My goal is to have the smaller value written at that location. To do this, I am trying to serialize the process and to apply __threadfence(), so that the value written by one thread is visible to the others and, in the end, the smaller value remains at vertex a's location. The problem is that this logic does not work: vertex a's location does not end up with the minimum of the values that all the threads try to write there, and I don't understand why. Any help would be appreciated.
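In other words, what I am trying to emulate with the lock is an atomic minimum on a float location, i.e., something that behaves like the following sketch (atomicMinFloat is a name I am making up here; CUDA's built-in atomicMin only covers integer types):

// --- Hypothetical helper describing the behaviour I am after: an atomic 
//     minimum on a float, sketched as an atomicCAS loop on the raw 32-bit word. 
__device__ float atomicMinFloat(float* address, float val) { 
    int* address_as_int = (int*)address; 
    int  old = *address_as_int; 
    while (__int_as_float(old) > val) { 
        int assumed = old; 
        old = atomicCAS(address_as_int, assumed, __float_as_int(val)); 
        if (old == assumed) break; 
    } 
    return __int_as_float(old); // --- previous value at address 
}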


Why are you setting tid = blockIdx.x;? Shouldn't you be using something like tid = (blockIdx.x * blockDim.x) + threadIdx.x;? As far as I can tell, all of your threads execute exactly the same code. Is that your intent? What does your kernel launch call look like? –


Hi Robert, there are as many blocks in the grid as there are vertices, and only one thread per block. So each vertex is processed by the single thread of its block. The CUDA kernel call looks like cuda_kernel<<<dimGrid, dimBlock>>>(argument list..), where dimGrid = (numVers, 1) and dimBlock = (1, 1). –


"Only one thread per block" --- that will not be efficient. Unless you have a really good reason to use CUDA this way - don't! – CygnusX1

Answer


There is a "classical" (or, at least, commonly cited) implementation of Dijkstra's Single-Source Shortest Path (SSSP) algorithm on the GPU, contained in the paper

Accelerating large graph algorithms on the GPU using CUDA by Pawan Harish and P.J. Narayanan

However, the implementation in that paper has been recognized to be bugged; see

CUDA Solutions for the SSSP Problem by Pedro J. Martín, Roberto Torres, and Antonio Gavilanes

Below I report an implementation of the approach proposed in the first paper, fixed according to the remarks of the second. The code also contains a sequential CPU version for comparison.

#include <sstream> 
#include <vector> 
#include <iostream> 
#include <stdio.h> 
#include <stdlib.h>    // --- malloc / free / rand / exit 
#include <limits.h>    // --- INT_MAX 
#include <float.h>     // --- FLT_MAX 

#include "Utilities.cuh" 

#define NUM_ASYNCHRONOUS_ITERATIONS 20 // Number of async loop iterations before attempting to read results back 

#define BLOCK_SIZE 16 
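Utilities.cuh is not reproduced here; the code relies on it only for the gpuErrchk error-checking macro, the iDivUp grid-sizing helper, and a float overload of atomicMin (on which more at Kernel1 below). A minimal sketch of plausible definitions for the first two:

// --- Minimal stand-ins for what "Utilities.cuh" is assumed to provide 
//     (a sketch, not the original header) 

// --- CUDA error checking 
inline void gpuAssert(cudaError_t code, const char *file, int line) { 
    if (code != cudaSuccess) { 
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
        exit(code); 
    } 
} 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 

// --- Round-up integer division, used below to size the kernel grids 
inline int iDivUp(int a, int b) { return (a + b - 1) / b; } 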

/***********************/ 
/* GRAPHDATA STRUCTURE */ 
/***********************/ 
// --- The graph data structure is an adjacency list. 
typedef struct { 

    // --- Contains the integer offset to point to the edge list for each vertex 
    int *vertexArray; 

    // --- Overall number of vertices 
    int numVertices; 

    // --- Contains the "destination" vertices each edge is attached to 
    int *edgeArray; 

    // --- Overall number of edges 
    int numEdges; 

    // --- Contains the weight of each edge 
    float *weightArray; 

} GraphData; 
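As an illustration of this layout, here is a made-up 3-vertex example (not part of the original code): vertex 0 has edges to 1 and 2, and vertices 1 and 2 each have a single edge back to 0.

// --- Hypothetical 3-vertex instance of the adjacency list above: 
//     vertexArray = {0, 2, 3};                 // vertex 0's edges start at index 0, vertex 1's at 2, vertex 2's at 3 
//     edgeArray   = {1, 2, 0, 0};              // destinations: 0->1, 0->2, 1->0, 2->0 
//     weightArray = {0.5f, 1.0f, 0.5f, 1.0f};  // one weight per edge 
//     numVertices = 3; numEdges = 4; 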

/**********************************/ 
/* GENERATE RANDOM GRAPH FUNCTION */ 
/**********************************/ 
void generateRandomGraph(GraphData *graph, int numVertices, int neighborsPerVertex) { 

    graph -> numVertices = numVertices; 
    graph -> vertexArray = (int *)malloc(graph -> numVertices * sizeof(int)); 
    graph -> numEdges  = numVertices * neighborsPerVertex; 
    graph -> edgeArray  = (int *)malloc(graph -> numEdges * sizeof(int)); 
    graph -> weightArray = (float *)malloc(graph -> numEdges * sizeof(float)); 

    for (int i = 0; i < graph -> numVertices; i++) graph -> vertexArray[i] = i * neighborsPerVertex; 

    int *tempArray = (int *)malloc(neighborsPerVertex * sizeof(int)); 
    for (int k = 0; k < numVertices; k++) { 
     for (int l = 0; l < neighborsPerVertex; l++) tempArray[l] = INT_MAX; 
     for (int l = 0; l < neighborsPerVertex; l++) { 
      bool goOn = false; 
      int temp; 
      while (goOn == false) { 
       goOn = true; 
       temp = (rand() % graph->numVertices); 
       for (int t = 0; t < neighborsPerVertex; t++) 
        if (temp == tempArray[t]) goOn = false; 
       if (temp == k) goOn = false; 
       if (goOn == true) tempArray[l] = temp; 
      } 
      graph -> edgeArray [k * neighborsPerVertex + l] = temp; 
      graph -> weightArray[k * neighborsPerVertex + l] = (float)(rand() % 1000)/1000.0f; 
     } 
    } 
} 
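Note that the rejection loop above only terminates if there are enough distinct neighbors to draw, i.e., neighborsPerVertex must be at most numVertices - 1 (which holds for the sizes used in main below).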

/************************/ 
/* minDistance FUNCTION */ 
/************************/ 
// --- Finds the vertex with minimum distance value, from the set of vertices not yet included in shortest path tree 
int minDistance(float *shortestDistances, bool *finalizedVertices, const int sourceVertex, const int N) { 

    // --- Initialize minimum value 
    int minIndex = sourceVertex; 
    float min = FLT_MAX; 

    for (int v = 0; v < N; v++) 
     if (finalizedVertices[v] == false && shortestDistances[v] <= min) min = shortestDistances[v], minIndex = v; 

    return minIndex; 
} 

/************************/ 
/* dijkstraCPU FUNCTION */ 
/************************/ 
void dijkstraCPU(float *graph, float *h_shortestDistances, int sourceVertex, const int N) { 

    // --- h_finalizedVertices[i] is true if vertex i is included in the shortest path tree 
    //  or the shortest distance from the source node to i is finalized 
    bool *h_finalizedVertices = (bool *)malloc(N * sizeof(bool)); 

    // --- Initialize h_shortestDistances as infinite and h_finalizedVertices as false 
    for (int i = 0; i < N; i++) h_shortestDistances[i] = FLT_MAX, h_finalizedVertices[i] = false; 

    // --- The shortest distance of the source vertex from itself is always 0 
    h_shortestDistances[sourceVertex] = 0.f; 

    // --- Dijkstra iterations 
    for (int iterCount = 0; iterCount < N - 1; iterCount++) { 

     // --- Selecting the minimum distance vertex from the set of vertices not yet 
     //  processed. currentVertex is always equal to sourceVertex in the first iteration. 
     int currentVertex = minDistance(h_shortestDistances, h_finalizedVertices, sourceVertex, N); 

     // --- Mark the current vertex as processed 
     h_finalizedVertices[currentVertex] = true; 

     // --- Relaxation loop 
     for (int v = 0; v < N; v++) { 

      // --- Update dist[v] only if it is not in h_finalizedVertices, there is an edge 
      //  from u to v, and the cost of the path from the source vertex to v through 
      //  currentVertex is smaller than the current value of h_shortestDistances[v] 
      if (!h_finalizedVertices[v] && 
       graph[currentVertex * N + v] && 
       h_shortestDistances[currentVertex] != FLT_MAX && 
       h_shortestDistances[currentVertex] + graph[currentVertex * N + v] < h_shortestDistances[v]) 

       h_shortestDistances[v] = h_shortestDistances[currentVertex] + graph[currentVertex * N + v]; 
     } 
    } 
} 

/***************************/ 
/* MASKARRAYEMPTY FUNCTION */ 
/***************************/ 
// --- Check whether the mask array is empty, i.e., whether no vertex is left flagged for relaxation. This tells the algorithm whether it needs to continue running or not. 
bool allFinalizedVertices(bool *finalizedVertices, int numVertices) { 

    for (int i = 0; i < numVertices; i++) if (finalizedVertices[i] == true) { return false; } 

    return true; 
} 

/*************************/ 
/* ARRAY INITIALIZATIONS */ 
/*************************/ 
__global__ void initializeArrays(bool * __restrict__ d_finalizedVertices, float* __restrict__ d_shortestDistances, float* __restrict__ d_updatingShortestDistances, 
           const int sourceVertex, const int numVertices) { 

    int tid = blockIdx.x * blockDim.x + threadIdx.x; 

    if (tid < numVertices) { 

     if (sourceVertex == tid) { 

      d_finalizedVertices[tid]   = true; 
      d_shortestDistances[tid]   = 0.f; 
      d_updatingShortestDistances[tid] = 0.f; } 

     else { 

      d_finalizedVertices[tid]   = false; 
      d_shortestDistances[tid]   = FLT_MAX; 
      d_updatingShortestDistances[tid] = FLT_MAX; 
     } 
    } 
} 

/**************************/ 
/* DIJKSTRA GPU KERNEL #1 */ 
/**************************/ 
__global__ void Kernel1(const int * __restrict__ vertexArray, const int* __restrict__ edgeArray, 
         const float * __restrict__ weightArray, bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances, 
         float * __restrict__ updatingShortestDistances, const int numVertices, const int numEdges) { 

    int tid = blockIdx.x*blockDim.x + threadIdx.x; 

    if (tid < numVertices) { 

     if (finalizedVertices[tid] == true) { 

      finalizedVertices[tid] = false; 

      int edgeStart = vertexArray[tid], edgeEnd; 

      if (tid + 1 < (numVertices)) edgeEnd = vertexArray[tid + 1]; 
      else       edgeEnd = numEdges; 

      for (int edge = edgeStart; edge < edgeEnd; edge++) { 
       int nid = edgeArray[edge]; 
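       // --- CUDA's built-in atomicMin exists only for integer types; the call below 
       //     assumes a float overload of atomicMin provided by Utilities.cuh, which can 
       //     be built with an atomicCAS loop like the sketch in the question above. 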
       atomicMin(&updatingShortestDistances[nid], shortestDistances[tid] + weightArray[edge]); 
      } 
     } 
    } 
} 

/**************************/ 
/* DIJKSTRA GPU KERNEL #2 */ 
/**************************/ 
__global__ void Kernel2(const int * __restrict__ vertexArray, const int * __restrict__ edgeArray, const float* __restrict__ weightArray, 
         bool * __restrict__ finalizedVertices, float* __restrict__ shortestDistances, float* __restrict__ updatingShortestDistances, 
         const int numVertices) { 

    int tid = blockIdx.x * blockDim.x + threadIdx.x; 

    if (tid < numVertices) { 

     if (shortestDistances[tid] > updatingShortestDistances[tid]) { 
      shortestDistances[tid] = updatingShortestDistances[tid]; 
      finalizedVertices[tid] = true; } 

     updatingShortestDistances[tid] = shortestDistances[tid]; 
    } 
} 
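Note how the race condition of the original question is avoided here: Kernel1 only writes, atomically, to the separate updatingShortestDistances buffer, while Kernel2, which runs only after Kernel1 has completed, is the sole writer of shortestDistances; it commits the tentative values and re-flags exactly those vertices whose distance has improved.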

/************************/ 
/* dijkstraGPU FUNCTION */ 
/************************/ 
void dijkstraGPU(GraphData *graph, const int sourceVertex, float * __restrict__ h_shortestDistances) { 

    // --- Create device-side adjacency-list, namely, vertex array Va, edge array Ea and weight array Wa from G(V,E,W) 
    int  *d_vertexArray;   gpuErrchk(cudaMalloc(&d_vertexArray, sizeof(int) * graph -> numVertices)); 
    int  *d_edgeArray;   gpuErrchk(cudaMalloc(&d_edgeArray, sizeof(int) * graph -> numEdges)); 
    float *d_weightArray;   gpuErrchk(cudaMalloc(&d_weightArray, sizeof(float) * graph -> numEdges)); 

    // --- Copy adjacency-list to the device 
    gpuErrchk(cudaMemcpy(d_vertexArray, graph -> vertexArray, sizeof(int) * graph -> numVertices, cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_edgeArray, graph -> edgeArray, sizeof(int) * graph -> numEdges, cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(d_weightArray, graph -> weightArray, sizeof(float) * graph -> numEdges, cudaMemcpyHostToDevice)); 

    // --- Create mask array Ma, cost array Ca and updating cost array Ua of size V 
    bool *d_finalizedVertices;   gpuErrchk(cudaMalloc(&d_finalizedVertices,  sizeof(bool) * graph->numVertices)); 
    float *d_shortestDistances;   gpuErrchk(cudaMalloc(&d_shortestDistances,  sizeof(float) * graph->numVertices)); 
    float *d_updatingShortestDistances; gpuErrchk(cudaMalloc(&d_updatingShortestDistances, sizeof(float) * graph->numVertices)); 

    bool *h_finalizedVertices = (bool *)malloc(sizeof(bool) * graph->numVertices); 

    // --- Initialize mask Ma to false, cost array Ca and updating cost array Ua to ∞ 
    initializeArrays <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_finalizedVertices, d_shortestDistances, 
                  d_updatingShortestDistances, sourceVertex, graph -> numVertices); 
    gpuErrchk(cudaPeekAtLastError()); 
    gpuErrchk(cudaDeviceSynchronize()); 

    // --- Read mask array from device -> host 
    gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost)); 

    while (!allFinalizedVertices(h_finalizedVertices, graph->numVertices)) { 

     // --- In order to improve performance, we run some number of iterations without reading the results. This might result 
     //  in running more iterations than necessary at times, but it will in most cases be faster because we are doing less 
     //  stalling of the GPU waiting for results. 
     for (int asyncIter = 0; asyncIter < NUM_ASYNCHRONOUS_ITERATIONS; asyncIter++) { 

      Kernel1 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances, 
                  d_updatingShortestDistances, graph->numVertices, graph->numEdges); 
      gpuErrchk(cudaPeekAtLastError()); 
      gpuErrchk(cudaDeviceSynchronize()); 
      Kernel2 <<<iDivUp(graph->numVertices, BLOCK_SIZE), BLOCK_SIZE >>>(d_vertexArray, d_edgeArray, d_weightArray, d_finalizedVertices, d_shortestDistances, d_updatingShortestDistances, 
                  graph->numVertices); 
      gpuErrchk(cudaPeekAtLastError()); 
      gpuErrchk(cudaDeviceSynchronize()); 
     } 

     gpuErrchk(cudaMemcpy(h_finalizedVertices, d_finalizedVertices, sizeof(bool) * graph->numVertices, cudaMemcpyDeviceToHost)); 
    } 

    // --- Copy the result to host 
    gpuErrchk(cudaMemcpy(h_shortestDistances, d_shortestDistances, sizeof(float) * graph->numVertices, cudaMemcpyDeviceToHost)); 

    free(h_finalizedVertices); 

    gpuErrchk(cudaFree(d_vertexArray)); 
    gpuErrchk(cudaFree(d_edgeArray)); 
    gpuErrchk(cudaFree(d_weightArray)); 
    gpuErrchk(cudaFree(d_finalizedVertices)); 
    gpuErrchk(cudaFree(d_shortestDistances)); 
    gpuErrchk(cudaFree(d_updatingShortestDistances)); 
} 

/****************/ 
/* MAIN PROGRAM */ 
/****************/ 
int main() { 

    // --- Number of graph vertices 
    int numVertices = 8; 

    // --- Number of edges per graph vertex 
    int neighborsPerVertex = 6; 

    // --- Source vertex 
    int sourceVertex = 0; 

    // --- Allocate memory for arrays 
    GraphData graph; 
    generateRandomGraph(&graph, numVertices, neighborsPerVertex); 

    // --- From adjacency list to adjacency matrix. 
    //  Initializing the adjacency matrix 
    float *weightMatrix = (float *)malloc(numVertices * numVertices * sizeof(float)); 
    for (int k = 0; k < numVertices * numVertices; k++) weightMatrix[k] = FLT_MAX; 

    // --- Displaying the adjacency list and constructing the adjacency matrix 
    printf("Adjacency list\n"); 
    for (int k = 0; k < numVertices; k++) weightMatrix[k * numVertices + k] = 0.f; 
    for (int k = 0; k < numVertices; k++) 
     for (int l = 0; l < neighborsPerVertex; l++) { 
      weightMatrix[k * numVertices + graph.edgeArray[graph.vertexArray[k] + l]] = graph.weightArray[graph.vertexArray[k] + l]; 
      printf("Vertex nr. %i; Edge nr. %i; Weight = %f\n", k, graph.edgeArray[graph.vertexArray[k] + l], 
                    graph.weightArray[graph.vertexArray[k] + l]); 
     } 

    for (int k = 0; k < numVertices * neighborsPerVertex; k++) 
     printf("%i %i %f\n", k, graph.edgeArray[k], graph.weightArray[k]); 

    // --- Displaying the adjacency matrix 
    printf("\nAdjacency matrix\n"); 
    for (int k = 0; k < numVertices; k++) { 
     for (int l = 0; l < numVertices; l++) 
      if (weightMatrix[k * numVertices + l] < FLT_MAX) 
       printf("%1.3f\t", weightMatrix[k * numVertices + l]); 
      else 
       printf("--\t"); 
     printf("\n"); 
    } 

    // --- Running Dijkstra on the CPU 
    float *h_shortestDistancesCPU = (float *)malloc(numVertices * sizeof(float)); 
    dijkstraCPU(weightMatrix, h_shortestDistancesCPU, sourceVertex, numVertices); 

    printf("\nCPU results\n"); 
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesCPU[k]); 

    // --- Allocate space for the h_shortestDistancesGPU 
    float *h_shortestDistancesGPU = (float*)malloc(sizeof(float) * graph.numVertices); 
    dijkstraGPU(&graph, sourceVertex, h_shortestDistancesGPU); 

    printf("\nGPU results\n"); 
    for (int k = 0; k < numVertices; k++) printf("From vertex %i to vertex %i = %f\n", sourceVertex, k, h_shortestDistancesGPU[k]); 

    free(h_shortestDistancesCPU); 
    free(h_shortestDistancesGPU); 

    return 0; 
}
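A note on building: this is a single .cu file (plus Utilities.cuh, sketched above); something like nvcc -O2 dijkstra.cu -o dijkstra should work, with an architecture switch such as -arch=sm_20 matching your GPU (the file name and flags here are mine).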