2016-04-22 45 views
-3

我在 CUDA 中有下面的内核(问题:CUDA 的 idx 无法正确索引矩阵):

// Copies the p_t1 field of ingoing[idx] into outgoing[idx], one element per
// thread, where idx is the thread's flat 1D global index.
// Threads with idx == 0 or idx >= N do nothing, so outgoing[0] is never
// written by this version of the kernel.
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
    // j is only referenced by the commented-out loop below
    int j; 
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    // guard: accepts indices 1 .. N-1 (index 0 is excluded)
    if ((idx > 0) && (idx < N)){ 
     //for(j=0;j<N;j++){ 
     // outgoing[j].p_t1=ingoing[j].p_t1; 
     //} 
     outgoing[idx].p_t1=ingoing[idx].p_t1; 

    } 
} 

这样行不通。而下面的版本可以正常工作:

// Copies the p_t1 field of every element of ingoing[] into outgoing[].
// Each thread whose flat 1D global index idx satisfies 0 < idx < N runs the
// whole j = 0 .. N-1 loop, so the same full copy is repeated by every such
// thread (all of them write identical values).
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
     int j; 
     int idx = threadIdx.x + blockIdx.x * blockDim.x; 
     // guard: accepts indices 1 .. N-1 (the thread with idx == 0 does nothing)
     if ((idx > 0) && (idx < N)){ 
      // the loop starts at j = 0, so outgoing[0] is written here
      for(j=0;j<N;j++){ 
      outgoing[j].p_t1=ingoing[j].p_t1; 
      } 
      //outgoing[idx].p_t1=ingoing[idx].p_t1; 

     } 
    } 

出了什么问题?为什么idx不会正确地对矩阵进行索引?

完整代码写在下面,理解起来并不那么容易。问题是:当我在 main 函数末尾打印 outgoing[idx].p_t1 字段时,如果内核中使用的是

// copies p_t1 for the single element idx
outgoing[idx].p_t1=ingoing[idx].p_t1; 

打印出的值全是 0;但如果使用的是下面的循环,结果就是正确的:

// copies p_t1 for every element j in [0, N)
for(j=0;j<N;j++){ 
    outgoing[j].p_t1=ingoing[j].p_t1; 
} 

请问我错在哪里?

/******************** Includes - Defines ****************/ 
#include "pagerank_serial.h" 
#include <stdio.h> 
#include <stdlib.h> 
#include <time.h> 
#include <math.h> 
#include <assert.h> 
#include <string.h> 
#include <sys/time.h> 
#include <fcntl.h> 
#include <cuda.h> 
#include "string.h" 

/******************** Defines ****************/ 
// Number of nodes 
int N; 

// Convergence threashold and algorithm's parameter d 
double threshold, d; 

// Table of node's data 
Node *Nodes; 

// Copies the p_t1 field of every element of ingoing[] into outgoing[].
// Each thread whose flat 1D global index idx satisfies 0 < idx < N runs the
// whole j = 0 .. N-1 loop, so the same full copy is repeated by every such
// thread (all of them write identical values).
__global__ void pagerank(Node *ingoing, Node *outgoing, int N) { 
     int j; 
     int idx = threadIdx.x + blockIdx.x * blockDim.x; 
     // guard: accepts indices 1 .. N-1 (the thread with idx == 0 does nothing)
     if ((idx > 0) && (idx < N)){ 
      // the loop starts at j = 0, so outgoing[0] is written here
      for(j=0;j<N;j++){ 
      outgoing[j].p_t1=ingoing[j].p_t1; 
      } 
      //outgoing[idx].p_t1=ingoing[idx].p_t1; 

     } 
    } 
/***** Read graph connections from txt file *****/ 

/*
 * Reads "from<TAB>to" integer pairs from the text file `filename` and records
 * each pair in the global Nodes table: the con_size of Nodes[from] is
 * incremented and `to` is stored in the next free slot of its To_id array.
 *
 * If the file cannot be opened an error is printed and nothing is read.
 * Lines that do not contain two integers are skipped. Pairs whose source
 * index lies outside [0, N) are skipped so that Nodes is never indexed out
 * of bounds.
 *
 * NOTE(review): Node is declared in pagerank_serial.h, so the capacity of
 * To_id is not visible here and is not checked -- confirm that con_size can
 * never exceed it for the input graphs.
 */
void Read_from_txt_file(char* filename)
{
    FILE *fid;
    int from_idx, to_idx;
    int temp_size;
    int fields;
    int c;

    fid = fopen(filename, "r");
    if (fid == NULL){
        printf("Error opening data file\n");
        /* Nothing to read; the original fell through and used a NULL FILE*. */
        return;
    }

    for (;;)
    {
        /* fscanf returns the number of converted fields, or EOF at end of input. */
        fields = fscanf(fid, "%d\t%d\n", &from_idx, &to_idx);
        if (fields == EOF)
            break;

        if (fields != 2)
        {
            /* Malformed line: discard the rest of it, otherwise the same
               unreadable characters would be retried forever. */
            while ((c = fgetc(fid)) != '\n' && c != EOF)
            {
            }
            continue;
        }

        if (from_idx < 0 || from_idx >= N)
        {
            printf("Skipping connection with out-of-range source node %d\n", from_idx);
            continue;
        }

        Nodes[from_idx].con_size++;
        temp_size = Nodes[from_idx].con_size;
        Nodes[from_idx].To_id[temp_size - 1] = to_idx;
    }

    //printf("End of connections insertion!\n");

    fclose(fid);
}

/***** Read P vector from txt file*****/  

/*
 * Reads floating-point values from "P.txt" and stores them, in file order,
 * in the p_t1 field of successive entries of the global Nodes table.
 *
 * If the file cannot be opened an error is printed and nothing is read.
 * Reading stops at end of file, at the first token that is not a number, or
 * after N values, whichever comes first.
 */
void Read_P_from_txt_file()
{
    FILE *fid;
    double temp_P;
    int index = 0;

    fid = fopen("P.txt", "r");
    if (fid == NULL)
    {
        printf("Error opening the Probabilities file\n");
        /* Nothing to read; the original fell through and used a NULL FILE*. */
        return;
    }

    // P's values are double!
    // The format is the plain "%lf\n" also used by Read_E_from_txt_file; the
    // previous format string contained stray literal text that no numeric
    // file could match.
    // fscanf returns 1 only when a value was actually converted (EOF is -1).
    while (index < N && fscanf(fid, "%lf\n", &temp_P) == 1)
    {
        Nodes[index].p_t1 = temp_P;
        index++;
    }
    //printf("End of P insertion!");

    fclose(fid);
}


/***** Read E vector from txt file*****/  

/*
 * Reads floating-point values from "E.txt" and stores them, in file order,
 * in the e field of successive entries of the global Nodes table.
 *
 * If the file cannot be opened an error is printed and nothing is read.
 * Reading stops at end of file, at the first token that is not a number, or
 * after N values, whichever comes first.
 */
void Read_E_from_txt_file()
{
    FILE *fid;
    double temp_E;
    int index = 0;

    fid = fopen("E.txt", "r");
    if (fid == NULL)
    {
        printf("Error opening the E file\n");
        /* Nothing to read; the original fell through and used a NULL FILE*. */
        return;
    }

    // E's values are double!
    // fscanf returns 1 only when a value was actually converted (EOF is -1).
    while (index < N && fscanf(fid, "%lf\n", &temp_E) == 1)
    {
        Nodes[index].e = temp_E;
        index++;
    }
    //printf("End of E insertion!");

    fclose(fid);
}

/***** Create P and E with equal probability *****/ 

/*
 * Initialises every entry of the global Nodes table with equal probabilities:
 * p_t0 = 0, p_t1 = 1/N and e = 1/N, then asserts that the p_t1 values and the
 * e values each sum to 1 (within a small floating-point tolerance).
 *
 * Does nothing when N <= 0, because there are no entries to initialise.
 */
void Random_P_E()
{
    int i;
    // Sum of P (it must be =1)
    double sum_P_1 = 0;
    // Sum of E (it must be =1)
    double sum_E_1 = 0;

    if (N <= 0)
        return;

    // Arrays initialization
    for (i = 0; i < N; i++)
    {
        Nodes[i].p_t0 = 0;

        Nodes[i].p_t1 = 1.0 / N;
        sum_P_1 = sum_P_1 + Nodes[i].p_t1;

        Nodes[i].e = 1.0 / N;
        sum_E_1 = sum_E_1 + Nodes[i].e;
    }

    // Assert sum of probabilities is =1.
    // The previous checks used `=` (assignment), which always succeeds.
    // N rounded terms rarely add up to exactly 1.0, so compare with a
    // tolerance instead of `==`.

    // Print sum of P (it must be =1)
    //printf("Sum of P = %f\n",sum_P_1);

    // Exit if sum of P is !=1
    assert(fabs(sum_P_1 - 1.0) < 1e-6);

    // Print sum of E (it must be =1)
    //printf("Sum of E = %f\n",sum_E_1);

    // Exit if sum of E is !=1
    assert(fabs(sum_E_1 - 1.0) < 1e-6);
}


/***** Main function *****/ 

/* Prints a message and terminates the program when a CUDA runtime call failed. */
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Program entry point.
 *
 * Arguments: graph filename, N, threshold, d.
 *
 * Builds the host Nodes table from the graph file, initialises the
 * probabilities, copies the table to the device, runs the pagerank kernel
 * once while timing it with CUDA events, copies the result back and prints
 * the elapsed time followed by the first (at most 100) p_t1 values.
 *
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on bad arguments or on a
 * failed host allocation; CUDA failures terminate through check_cuda().
 */
int main(int argc, char** argv)
{

    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
    int gridSize;    // The actual grid size needed, based on input size
    int i;
    int print_count;

    // Check input arguments
    if (argc < 5)
    {
        printf("Error in arguments! Four arguments required: graph filename, N, threshold and d\n");
        return (EXIT_FAILURE);
    }

    // get arguments (argv[1] is used directly, so no fixed-size copy can overflow)
    N = atoi(argv[2]);
    threshold = atof(argv[3]);
    d = atof(argv[4]);

    if (N <= 0)
    {
        printf("Error in arguments! N must be a positive integer\n");
        return (EXIT_FAILURE);
    }

    // Allocate memory for N nodes
    Nodes = (Node*) malloc(N * sizeof(Node));
    if (Nodes == NULL)
    {
        printf("Error allocating memory for %d nodes\n", N);
        return (EXIT_FAILURE);
    }

    for (i = 0; i < N; i++)
    {
        Nodes[i].con_size = 0;
    }

    Read_from_txt_file(argv[1]);

    // set equal initial probabilities
    Random_P_E();

    Node *h_ingoing = Nodes;
    Node *h_outgoing = (Node *)calloc(N, sizeof *h_outgoing);
    if (h_outgoing == NULL)
    {
        printf("Error allocating memory for the output nodes\n");
        free(Nodes);
        return (EXIT_FAILURE);
    }

    Node *d_ingoing = NULL;
    Node *d_outgoing = NULL;
    size_t bytes = (size_t) N * sizeof(Node);

    check_cuda(cudaMalloc(&d_ingoing, bytes), "cudaMalloc d_ingoing");
    check_cuda(cudaMalloc(&d_outgoing, bytes), "cudaMalloc d_outgoing");

    check_cuda(cudaMemcpy(d_ingoing, h_ingoing, bytes, cudaMemcpyHostToDevice), "copy ingoing to device");
    check_cuda(cudaMemcpy(d_outgoing, h_outgoing, bytes, cudaMemcpyHostToDevice), "copy outgoing to device");

    float elapsed_ms;
    cudaEvent_t begin, end;

    // The last argument is the block size limit, not the element count;
    // 0 means "no limit".
    check_cuda(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, pagerank, 0, 0),
               "cudaOccupancyMaxPotentialBlockSize");

    // Round up according to array size
    gridSize = (N + blockSize - 1)/blockSize;
    printf("Gridsize, blockzise : %d , %d \n", gridSize, blockSize);

    check_cuda(cudaEventCreate(&begin), "cudaEventCreate begin");
    check_cuda(cudaEventCreate(&end), "cudaEventCreate end");
    check_cuda(cudaEventRecord(begin, 0), "cudaEventRecord begin");

    // The kernel takes exactly three parameters; threshold and d are not
    // kernel arguments.
    pagerank<<<gridSize, blockSize>>>(d_ingoing, d_outgoing, N);
    // A kernel launch returns nothing, so launch errors must be queried explicitly.
    check_cuda(cudaGetLastError(), "pagerank launch");

    check_cuda(cudaEventRecord(end, 0), "cudaEventRecord end");
    // Waiting on the end event also reports errors raised while the kernel ran.
    check_cuda(cudaEventSynchronize(end), "cudaEventSynchronize end");

    check_cuda(cudaEventElapsedTime(&elapsed_ms, begin, end), "cudaEventElapsedTime");

    check_cuda(cudaMemcpy(h_outgoing, d_outgoing, bytes, cudaMemcpyDeviceToHost), "copy outgoing to host");

    printf("%f\n", elapsed_ms) ;

    printf("\n");

    // Print final probabilities (never more entries than were allocated)
    print_count = (N < 100) ? N : 100;
    for (i = 0; i < print_count; i++)
    {
        printf("P_t1[%d] = %f\n",i,h_outgoing[i].p_t1);
    }
    printf("\n");

    // Release device and host resources
    check_cuda(cudaEventDestroy(begin), "cudaEventDestroy begin");
    check_cuda(cudaEventDestroy(end), "cudaEventDestroy end");
    check_cuda(cudaFree(d_ingoing), "cudaFree d_ingoing");
    check_cuda(cudaFree(d_outgoing), "cudaFree d_outgoing");
    free(h_outgoing);
    free(Nodes);
    Nodes = NULL;

    printf("End of program!\n");

    return (EXIT_SUCCESS);
}
+0

你得到的错误是什么?我不是一个活的编译器... –

+0

我相信我理解了这个问题,虽然没有调用全局内核的代码,但很难知道发生了什么。 –

+0

我写了整个代码,你现在可以解释一下吗?谢谢 – Haris

回答

1

当你说"在 main 函数末尾它们打印 0"时,我假设你指的是所有条目,而不仅仅是索引 0。事实上,你代码的第一个版本不会处理索引 0:条件 (idx > 0) && (idx < N) 在 idx=0 时为假。

在代码中进一步说明,我们缺少Node类型的定义。这是强制性的,以便更好地理解代码中可能出现的错误。

根据您在编译中使用的Node的大小,其内容和结构包装,主机端的大小可能与设备上的Node大小不同。使用printf来验证这将是有用的,或使用调试器。

此外,你似乎没有检查内核启动时的错误。在内核调用之后,你应该添加 cudaPeekAtLastError 和 cudaDeviceSynchronize,以确保没有错误发生。(CUDA Runtime API 的其他任何函数调用也可能返回错误,而你的代码都没有检查。)

编辑 试图重现,我写了以下内容,尽可能接近您的代码。我没有足够内存的卡,因此节点数量较少。

// Per-node record used by both the host code and the pagerank kernel.
// Field order and types determine the layout copied between host and device.
typedef struct 
{ 
    // NOTE(review): presumably the probability from the previous iteration -- confirm
    double p_t0; 
    // the value the pagerank kernel copies from ingoing[] to outgoing[]
    double p_t1; 
    // NOTE(review): presumably this node's entry of the E vector -- confirm
    double e; 
    // fixed-capacity array of 460 ints
    // NOTE(review): presumably the destination node ids of this node -- confirm
    int To_id[460]; 
    // NOTE(review): presumably the number of To_id entries in use -- confirm
    int con_size; 
} Node ; 

// Copies the p_t1 field of ingoing[idx] into outgoing[idx], one element per
// thread, where idx is the thread's flat 1D global index.
// Threads with idx == 0 or idx >= N do nothing, so outgoing[0] is never
// written by this kernel.
__global__ void pagerank(Node* ingoing, Node* outgoing, int N) 
{ 
    int idx = threadIdx.x + blockIdx.x * blockDim.x ; 
    // guard: accepts indices 1 .. N-1 (index 0 is excluded)
    if ((idx > 0) && (idx < N)) 
     outgoing[idx].p_t1 = ingoing[idx].p_t1; 
} 

#include <cstdlib> 

// Evaluates the CUDA runtime call `a` once; if it did not return cudaSuccess,
// prints the file, line, numeric code and error string, then exits with
// status 1.
// Wrapped in do { } while (0) so that `cudaCheck(x);` behaves as a single
// statement, including inside an unbraced if/else. The argument is
// parenthesized and the enum is cast to int to match the %d conversion.
#define cudaCheck(a)                                                          \
    do {                                                                      \
        cudaError_t cuerr = (a) ;                                             \
        if (cuerr != cudaSuccess) {                                           \
            printf("[ERROR @ %s : %d ] : (%d) - %s\n", __FILE__, __LINE__,    \
                   (int)cuerr, cudaGetErrorString(cuerr)) ;                   \
            ::exit(1) ;                                                       \
        }                                                                     \
    } while (0)

// Reproduction driver: fills p_t1 of N host nodes with i+1, copies them to the
// device, runs the pagerank kernel once, copies the result back, prints the
// first (at most 100) p_t1 values and reports every index whose value differs
// from i+1.
// Returns 0 on success and 1 if a host allocation fails; CUDA failures
// terminate the program through cudaCheck.
int main()
{
    // int N = 916428 ; // does not fit on my GPU
    int N = 400000 ;

    int blockSize;
    int minGridSize;
    int gridSize;

    Node* Nodes = (Node*)malloc(N * sizeof (Node)) ;
    Node* h_outgoing = (Node*)calloc(N, sizeof *h_outgoing) ;
    if (Nodes == NULL || h_outgoing == NULL)
    {
        printf ("Host allocation failed\n") ;
        free (Nodes) ;       // free(NULL) is a no-op
        free (h_outgoing) ;
        return 1 ;
    }

    for (int i = 0 ; i < N ; ++i)
        Nodes[i].p_t1 = (double)i+1;

    Node* h_ingoing = Nodes;

    Node* d_ingoing ;
    Node* d_outgoing ;

    cudaCheck (cudaMalloc(&d_ingoing, N * sizeof *d_ingoing));
    cudaCheck (cudaMalloc(&d_outgoing, N * sizeof *d_outgoing));

    cudaCheck (cudaMemcpy (d_ingoing, h_ingoing, N * sizeof *h_ingoing, cudaMemcpyHostToDevice));
    cudaCheck (cudaMemcpy (d_outgoing, h_outgoing, N * sizeof *h_outgoing, cudaMemcpyHostToDevice));

    cudaEvent_t begin, end ;

    //blockSize = 256 ;
    // The last argument is the block size limit, not the element count;
    // 0 means "no limit".
    cudaCheck (cudaOccupancyMaxPotentialBlockSize<> (&minGridSize, &blockSize, pagerank, 0, 0)) ;
    gridSize = (N + blockSize -1)/blockSize ;

    printf ("Configuration = <<< %d , %d >>>\n", gridSize, blockSize) ;

    cudaCheck (cudaEventCreate (&begin)) ;
    cudaCheck (cudaEventCreate (&end)) ;

    cudaCheck (cudaEventRecord (begin, 0)) ;

    pagerank <<< gridSize, blockSize >>> (d_ingoing, d_outgoing, N) ;
    // A kernel launch returns nothing, so launch errors must be queried explicitly.
    cudaCheck (cudaGetLastError ()) ;

    cudaCheck (cudaEventRecord (end, 0)) ;

    // Waiting on the end event also reports errors raised while the kernel ran.
    cudaCheck (cudaEventSynchronize (end)) ;

    cudaCheck (cudaMemcpy (h_outgoing, d_outgoing, N * sizeof *h_outgoing, cudaMemcpyDeviceToHost)) ;

    // Never print more entries than were allocated.
    int shown = (N < 100) ? N : 100 ;
    for (int i = 0 ; i < shown ; ++i)
    {
        printf ("P_t1[%d] = %f\n", i, h_outgoing[i].p_t1) ;
    }

    // Index 0 is reported here because the kernel's guard skips idx == 0.
    for (int i = 0 ; i < N ; ++i)
    {
        if (h_outgoing[i].p_t1 != (double)(i+1))
            printf ("Error @ %d : %lf <> %lf\n", i, h_outgoing[i].p_t1, (double)(i+1));
    }

    // Release device and host resources.
    cudaCheck (cudaEventDestroy (begin)) ;
    cudaCheck (cudaEventDestroy (end)) ;
    cudaCheck (cudaFree (d_ingoing)) ;
    cudaCheck (cudaFree (d_outgoing)) ;
    free (h_outgoing) ;
    free (Nodes) ;

    return 0 ;
}

除了回答开头提到的索引 0 的问题之外,每个输出都是正确的。

+0

谢谢。我在下面添加了节点描述。这会改变什么吗? – Haris

+0

节点是一个大型结构。你没有得到任何运行时错误? –

+0

实际上节点数组(共 916428 个节点)的总大小是 1.7Gbit。我是在 Tesla K20m 上运行的。没有,我没有收到任何错误,至少在不使用调试工具的情况下没有。 – Haris