2017-10-04 413 views
2

在Matlab中,当我输入复数的一维数组时,我输出的数组具有相同大小和相同维数的实数。 试图在CUDA C中重复此操作,但具有不同的输出。 你能帮忙吗?在Matlab中,当我进入IFFT(阵列)如何:CUDA IFFT

我arrayOfComplexNmbers:

[4.6500 + 0.0000i 0.5964 - 1.4325i 0.4905 - 0.5637i 0.4286 - 0.2976i 0.4345 - 0.1512i 0.4500 + 0.0000i 0.4345 + 0.1512i 0.4286 + 0.2976i 0.4905 + 0.5637i 0.5964 + 1.4325i] 

我arrayOfRealNumbers:

[ 0.9000 0.8000 0.7000 0.6000 0.5000 0.4000 0.3000 0.2000 0.1500 0.1000] 

当我在Matlab进入ifft(arrayOfComplexNmbers),我的输出是arrayOfRealNumbers。 谢谢! 这是该我的CUDA代码:

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 
#include <cufft.h> 
#include "device_launch_parameters.h" 
#include "device_functions.h" 

#define NX 256 
#define NY 128 
#define NRANK 2 
#define BATCH 1 
#define SIGNAL_SIZE 10 

typedef float2 Complex; 
__global__ void printCUDAVariables_1(cufftComplex *cudaSignal){ 
int index = threadIdx.x + blockIdx.x*blockDim.x;  
printf("COMPLEX CUDA %d %f %f \n", index, cudaSignal[index].x, cudaSignal[index].y); 
} 

__global__ void printCUDAVariables_2(cufftReal *cudaSignal){ 
int index = threadIdx.x + blockIdx.x*blockDim.x; 
printf("REAL CUDA %d %f \n", index, cudaSignal); 
} 


int main() { 
cufftHandle plan; 
//int n[NRANK] = { NX, NY }; 
Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE); 
float *r_signal = 0; 
if (r_signal != 0){ 
    r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float)); 
} 
else{ 
    r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float)); 
} 
int mem_size = sizeof(Complex)* SIGNAL_SIZE * 2; 

h_signal[0].x = (float)4.65; 
h_signal[0].y = (float)0; 

h_signal[1].x = (float)0.5964; 
h_signal[1].y = (float)0; 

h_signal[2].x = (float)4.65; 
h_signal[2].y = (float)-1.4325; 

h_signal[3].x = (float)0.4905; 
h_signal[3].y = (float)0.5637; 

h_signal[4].x = (float)0.4286; 
h_signal[4].y = (float)-0.2976; 

h_signal[5].x = (float)0.4345; 
h_signal[5].y = (float)-0.1512; 

h_signal[6].x = (float)0.45; 
h_signal[6].y = (float)0; 

h_signal[7].x = (float)0.4345; 
h_signal[7].y = (float)-0.1512; 

h_signal[8].x = (float)0.4286; 
h_signal[8].y = (float)0.2976; 

h_signal[9].x = (float)0.4905; 
h_signal[9].y = (float)-0.5637; 

h_signal[10].x = (float)0.5964; 
h_signal[10].y = (float)1.4325; 
//for (int i = 0; i < SIGNAL_SIZE; i++){ 
// printf("RAW %f %f\n", h_signal[i].x, h_signal[i].y); 
//} 
//allocate device memory for signal 
cufftComplex *d_signal, *d_signal_out; 
cudaMalloc(&d_signal, mem_size);  
cudaMalloc(&d_signal_out, mem_size); 
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice); 
printCUDAVariables_1 << <10, 1 >> >(d_signal); 
//cufftReal *odata; 
//cudaMalloc((void **)&odata, sizeof(cufftReal)*NX*(NY/2 + 1)); 

//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);  
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH); 
cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE); 
//cufftExecC2R(plan, d_signal, odata); 
cudaDeviceSynchronize(); 
printCUDAVariables_1 << <10, 1 >> >(d_signal_out); 
//printCUDAVariables_2 << <10, 1 >> >(odata); 
//cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*2*sizeof(float), cudaMemcpyDeviceToHost); 

cufftDestroy(plan); 
cudaFree(d_signal); 
cudaFree(d_signal_out); 

return 0; 
} 
+0

大多数并发系统的不指定线程执行的顺序。这是一个基本概念,与CUDA无关。具体而言,假设线程将按线程ID的顺序执行(例如0,1,2,...,n_threads)是错误的。 (1)将数据复制到主机并在那里打印; (2)实现[适当的错误检查CUDA](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime- api)和[cuFFT](http://docs.nvidia.com/cuda/cufft/index.html#cufftresult)API。然后我们可以开始谈生意了。 – Drop

+0

谢谢,但我的CUDA输出与MATLAB完全不同。请运行我的代码。这种情况不是关于订单,而是关于后检测结果。 – Talgat

回答

4

当计算ifft用MATLAB,默认行为如下:

  • 没有零填充输入信号
  • 输出的无缩放的信号

您的CUFFT代码在流程中是正确的,但与MATLAB相比有点不同的参数会导致当前输出吨。

  • NX常数为特定导致的输入信号是 填零到256的长度为实现MATLAB的行为,保持NX等于SIGNAL_SIZE
  • CUFFT将输出信号值与输入信号的长度相乘。您必须将输出值除以SIGNAL_SIZE以获得实际值。
  • 另一个重要问题是您在初始化输入信号时正在执行超出边界的访问。信号长度为10,但是您正在初始化第10个索引处的值超出限制。我认为这可能是由于基于1的MATLAB索引引起的混淆。输入信号必须从0初始化为SIGNAL_SIZE-1指数。
  • 不建议使用CUDA内核可视化信号,因为打印可能无序。您应该将结果复制回主机并连续打印。

下面是提供与MATLAB相同输出的固定代码。

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 
#include <cufft.h> 
#include "device_launch_parameters.h" 
#include "device_functions.h" 

#define NX 10 
#define NY 1 
#define NRANK 1 
#define BATCH 1 
#define SIGNAL_SIZE 10 

typedef float2 Complex; 

int main() 
{ 
cufftHandle plan; 
//int n[NRANK] = { NX, NY }; 
Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE); 
float *r_signal = 0; 
if (r_signal != 0) 
{ 
    r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float)); 
} 
else 
{ 
    r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float)); 
} 
int mem_size = sizeof(Complex)* SIGNAL_SIZE; 

h_signal[0].x = (float)4.65; 
h_signal[0].y = (float)0; 

h_signal[1].x = (float)0.5964; 
h_signal[1].y = (float)-1.4325; 

h_signal[2].x = (float)0.4905; 
h_signal[2].y = (float)-0.5637; 

h_signal[3].x = (float)0.4286; 
h_signal[3].y = (float)-0.2976; 

h_signal[4].x = (float)0.4345; 
h_signal[4].y = (float)-0.1512; 

h_signal[5].x = (float)0.45; 
h_signal[5].y = (float)0.0; 

h_signal[6].x = (float)0.4345; 
h_signal[6].y = (float)0.1512; 

h_signal[7].x = (float)0.4286; 
h_signal[7].y = (float)0.2976; 

h_signal[8].x = (float)0.4905; 
h_signal[8].y = (float)0.5637; 

h_signal[9].x = (float)0.5964; 
h_signal[9].y = (float)1.4325; 

printf("\nInput:\n"); 
for(int i=0; i<SIGNAL_SIZE; i++) 
{ 
    char op = h_signal[i].y < 0 ? '-' : '+'; 
    printf("%f %c %fi\n", h_signal[i].x/SIGNAL_SIZE, op, fabsf(h_signal[i].y/SIGNAL_SIZE)); 
} 

//allocate device memory for signal 
cufftComplex *d_signal, *d_signal_out; 
cudaMalloc(&d_signal, mem_size);  
cudaMalloc(&d_signal_out, mem_size); 
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice); 


//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);  
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH); 

cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE); 

cudaDeviceSynchronize(); 

cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*sizeof(Complex), cudaMemcpyDeviceToHost); 


printf("\n\n-------------------------------\n\n"); 
printf("Output:\n"); 
for(int i=0; i<SIGNAL_SIZE; i++) 
{ 
    char op = h_signal[i].y < 0 ? '-' : '+'; 
    printf("%f %c %fi\n", h_signal[i].x/SIGNAL_SIZE, op, fabsf(h_signal[i].y/SIGNAL_SIZE)); 
} 

cufftDestroy(plan); 
cudaFree(d_signal); 
cudaFree(d_signal_out); 

return 0; 
} 

输出仍然是复杂的形式,但虚数分量是接近于零。另外,真实组件的精度差异是因为MATLAB默认使用双精度,而此代码基于单精度值。


编译和下面的命令在Ubuntu 14.04测试,CUDA 8.0:

NVCC -o IFFT IFFT。cu -arch = sm_61 -lcufft

将输出与MATLAB 2017a进行比较。

程序的输出:

Input: 
0.465000 + 0.000000i 
0.059640 - 0.143250i 
0.049050 - 0.056370i 
0.042860 - 0.029760i 
0.043450 - 0.015120i 
0.045000 + 0.000000i 
0.043450 + 0.015120i 
0.042860 + 0.029760i 
0.049050 + 0.056370i 
0.059640 + 0.143250i 


------------------------------- 

Output: 
0.900000 - 0.000000i 
0.800026 - 0.000000i 
0.699999 - 0.000000i 
0.599964 - 0.000000i 
0.500011 + 0.000000i 
0.400000 + 0.000000i 
0.299990 + 0.000000i 
0.199993 + 0.000000i 
0.150000 + 0.000000i 
0.100018 - 0.000000i