加速FFTW修剪以避免大量零填充

假设我有一个序列x(n)，它的长度为K * N，并且只有第一个N元素与零不同。我假设N << K，例如，N = 10和K = 100000。我想通过FFTW计算这样一个序列的FFT。这相当于具有长度为N的序列并具有零填充为K * N。由于N和K可能是“大”，我有一个重要的零填充。我在探索是否可以节省一些计算时间，避免显式零填充。加速FFTW修剪以避免大量零填充

的情况下K = 2

让我们首先考虑的情况下K = 2。在这种情况下，x(n)的DFT可以写成

如果k是偶数，即k = 2 * m，然后

这意味着该DFT的这样的值，可以计算通过长度为N的序列的FFT，而不是K * N。

如果k为奇数，即k = 2 * m + 1，然后

这意味着该DFT的这样的值可以通过长度N的序列的FFT再次计算的，而不是K * N。

因此，总而言之，我可以用长度为N的长度为2 * N的长度为2的FFT交换单个FFT。

的任意K

的情况下，在这种情况下，我们有

上书写k = m * K + t，我们有

所以，综上所述，我可以交换nge长度为K * N的单个FFT与K长度为N的FFT。由于FFTW有fftw_plan_many_dft，我预计会有一些对付单个FFT的情况。

为了验证这一点，我已经设置了下面的代码

#include <stdio.h> 
#include <stdlib.h>  /* srand, rand */ 
#include <time.h>  /* time */ 
#include <math.h> 
#include <fstream> 

#include <fftw3.h> 

#include "TimingCPU.h" 

#define PI_d   3.141592653589793 

void main() { 

    const int N = 10; 
    const int K = 100000; 

    fftw_plan plan_zp; 

    fftw_complex *h_x = (fftw_complex *)malloc(N  * sizeof(fftw_complex)); 
    fftw_complex *h_xzp = (fftw_complex *)calloc(N * K, sizeof(fftw_complex)); 
    fftw_complex *h_xpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    fftw_complex *h_xhatpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    fftw_complex *h_xhatpruning_temp = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    fftw_complex *h_xhat = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 

    // --- Random number generation of the data sequence 
    srand(time(NULL)); 
    for (int k = 0; k < N; k++) { 
     h_x[k][0] = (double)rand()/(double)RAND_MAX; 
     h_x[k][1] = (double)rand()/(double)RAND_MAX; 
    } 

    memcpy(h_xzp, h_x, N * sizeof(fftw_complex)); 

    plan_zp = fftw_plan_dft_1d(N * K, h_xzp, h_xhat, FFTW_FORWARD, FFTW_ESTIMATE); 
    fftw_plan plan_pruning = fftw_plan_many_dft(1, &N, K, h_xpruning, NULL, 1, N, h_xhatpruning_temp, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE); 

    TimingCPU timerCPU; 
    timerCPU.StartCounter(); 
    fftw_execute(plan_zp); 
    printf("Stadard %f\n", timerCPU.GetCounter()); 

    timerCPU.StartCounter(); 
    double factor = -2. * PI_d/(K * N); 
    for (int k = 0; k < K; k++) { 
     double arg1 = factor * k; 
     for (int n = 0; n < N; n++) { 
      double arg = arg1 * n; 
      double cosarg = cos(arg); 
      double sinarg = sin(arg); 
      h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg; 
      h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg; 
     } 
    } 
    printf("Optimized first step %f\n", timerCPU.GetCounter()); 

    timerCPU.StartCounter(); 
    fftw_execute(plan_pruning); 
    printf("Optimized second step %f\n", timerCPU.GetCounter()); 
    timerCPU.StartCounter(); 
    for (int k = 0; k < K; k++) { 
     for (int p = 0; p < N; p++) { 
      h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; 
      h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; 
     } 
    } 
    printf("Optimized third step %f\n", timerCPU.GetCounter()); 

    double rmserror = 0., norm = 0.; 
    for (int n = 0; n < N; n++) { 
     rmserror = rmserror + (h_xhatpruning[n][0] - h_xhat[n][0]) * (h_xhatpruning[n][0] - h_xhat[n][0]) + (h_xhatpruning[n][1] - h_xhat[n][1]) * (h_xhatpruning[n][1] - h_xhat[n][1]); 
     norm = norm + h_xhat[n][0] * h_xhat[n][0] + h_xhat[n][1] * h_xhat[n][1]; 
    } 
    printf("rmserror %f\n", 100. * sqrt(rmserror/norm)); 

    fftw_destroy_plan(plan_zp); 

}

我已经开发的方法包括三个步骤：

通过“玩弄”复指数相乘的输入序列;
执行fftw_many;
重组结果。

fftw_many比在K * N输入点上的单个FFTW快。然而，步骤＃1和＃3完全摧毁了这样的收益。我希望步骤＃1和＃3在计算上比步骤＃2轻得多。

我的问题是：

这怎么可能，步骤＃1和＃3这样计算量比第2步的要求更高？
如何改善步骤＃1和＃3以获得与“标准”方法相比的净收益？

非常感谢您的任何提示。

编辑

我与Visual Studio 2013的工作，并在Release模式下进行编译。

来源

2016-11-16 JackOLantern

你编译优化测试代码启用，例如'gcc -O3 ...'？ –

@PaulR我在发布模式下使用Visual Studio 2013进行编译。我已经相应地编辑了这篇文章。 – JackOLantern

对于你可能想尝试切换循环的顺序第三步：

for (int p = 0; p < N; p++) { 
    for (int k = 0; k < K; k++) { 
     h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; 
     h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; 
    } 
}

，因为它通常是更有益的商店地址比加载地址连续的。

无论哪种方式，你有一个缓存不友好的访问模式，但。您可以尝试使用块来改善这一点，例如假设N是4的倍数：

for (int p = 0; p < N; p += 4) { 
    for (int k = 0; k < K; k++) { 
     for (int p0 = 0; p0 < 4; p0++) { 
      h_xhatpruning[(p + p0) * K + k][0] = h_xhatpruning_temp[(p + p0) + k * N][0]; 
      h_xhatpruning[(p + p0) * K + k][1] = h_xhatpruning_temp[(p + p0) + k * N][1]; 
     } 
    } 
}

这应该有助于减少缓存行的流失。如果确实如此，那么也可以尝试4以外的块大小来查看是否存在“最佳点”。

来源

2016-11-17 09:38:50

您的第一个建议可以显着提高第三步的时间，请参阅我的帖子的编辑＃3。循环展开无助于进一步改进结果。非常感谢您的回答。 – JackOLantern

对于您建议的成功的'for'循环交换，多线程会使结果恶化。 – JackOLantern

最后，我已经成功地改进了我的代码，并且遵循了您的建议。我在下面提供了一个完整的答案。感谢您的关注。 – JackOLantern

几个选项来运行速度更快：

运行多线程的，如果你只是运行单线程和有多个内核可用。
创建并保存FFTW智慧文件，尤其是在预先知道FFT尺寸的情况下。使用FFTW_EXHAUSTIVE，并重新加载FFTW的智慧，而不是每次重新计算它。如果你希望你的结果一致，这也很重要。由于FFTW可能用不同的计算智慧来计算FFT，智慧结果不一定总是相同，当两个进程得到相同的输入数据时，进程的不同运行可能产生不同的结果。
如果您使用x86，请运行64位。 FFTW算法的注册密集度非常高，并且以64位模式运行的x86 CPU具有比在32位模式下运行时多得多的通用寄存器。
由于FFTW算法是如此密集的登记，我有很好的成功通过编译FFTW与防止使用预取和防止职能隐含内联编译器选项改善FFTW性能。

来源

2016-11-16 16:12:06

非常感谢您的回答。然而，从概念上讲，我主要关心加快两个'for'循环，而不是'fftw'。我编辑了我的帖子，为两个'for'循环添加了一个OpenMP解决方案，但是我感觉我正在做一个不公平的比较：两个FFTW是顺序的，而'for'循环是多线程的。 – JackOLantern

我终于能够改进我的代码。我发布了一个完整的答案。感谢您的关注。 – JackOLantern

也遵循Paul R的评论，我改进了我的代码。现在，替代方法比标准（零填充）更快。以下是完整的C++脚本。对于第一步和第三步，我已经评论过其他尝试过的解决方案，这些解决方案显示出比未注释的解决方案更慢或更快。考虑到未来更简单的CUDA并行化，我拥有特殊的非嵌套for循环。我还没有使用FFTW的多线程。

#include <stdio.h> 
#include <stdlib.h>  /* srand, rand */ 
#include <time.h>  /* time */ 
#include <math.h> 
#include <fstream> 

#include <omp.h> 

#include <fftw3.h> 

#include "TimingCPU.h" 

#define PI_d   3.141592653589793 

/******************/ 
/* STEP #1 ON CPU */ 
/******************/ 
void step1CPU(fftw_complex * __restrict h_xpruning, const fftw_complex * __restrict h_x, const int N, const int K) { 

// double factor = -2. * PI_d/(K * N); 
// int n; 
// omp_set_nested(1); 
//#pragma omp parallel for private(n) num_threads(4) 
// for (int k = 0; k < K; k++) { 
//  double arg1 = factor * k; 
//#pragma omp parallel for num_threads(4) 
//  for (n = 0; n < N; n++) { 
//   double arg = arg1 * n; 
//   double cosarg = cos(arg); 
//   double sinarg = sin(arg); 
//   h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg; 
//   h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg; 
//  } 
// } 

    //double factor = -2. * PI_d/(K * N); 
    //int k; 
    //omp_set_nested(1); 
    //#pragma omp parallel for private(k) num_threads(4) 
    //for (int n = 0; n < N; n++) { 
    // double arg1 = factor * n; 
    // #pragma omp parallel for num_threads(4) 
    // for (k = 0; k < K; k++) { 
    //  double arg = arg1 * k; 
    //  double cosarg = cos(arg); 
    //  double sinarg = sin(arg); 
    //  h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg; 
    //  h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg; 
    // } 
    //} 

    //double factor = -2. * PI_d/(K * N); 
    //for (int k = 0; k < K; k++) { 
    // double arg1 = factor * k; 
    // for (int n = 0; n < N; n++) { 
    //  double arg = arg1 * n; 
    //  double cosarg = cos(arg); 
    //  double sinarg = sin(arg); 
    //  h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg; 
    //  h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg; 
    // } 
    //} 

    //double factor = -2. * PI_d/(K * N); 
    //for (int n = 0; n < N; n++) { 
    // double arg1 = factor * n; 
    // for (int k = 0; k < K; k++) { 
    //  double arg = arg1 * k; 
    //  double cosarg = cos(arg); 
    //  double sinarg = sin(arg); 
    //  h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg; 
    //  h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg; 
    // } 
    //} 

    double factor = -2. * PI_d/(K * N); 
    #pragma omp parallel for num_threads(8) 
    for (int n = 0; n < K * N; n++) { 
     int row = n/N; 
     int col = n % N; 
     double arg = factor * row * col; 
     double cosarg = cos(arg); 
     double sinarg = sin(arg); 
     h_xpruning[n][0] = h_x[col][0] * cosarg - h_x[col][1] * sinarg; 
     h_xpruning[n][1] = h_x[col][0] * sinarg + h_x[col][1] * cosarg; 
    } 
} 

/******************/ 
/* STEP #3 ON CPU */ 
/******************/ 
void step3CPU(fftw_complex * __restrict h_xhatpruning, const fftw_complex * __restrict h_xhatpruning_temp, const int N, const int K) { 

    //int k; 
    //omp_set_nested(1); 
    //#pragma omp parallel for private(k) num_threads(4) 
    //for (int p = 0; p < N; p++) { 
    // #pragma omp parallel for num_threads(4) 
    // for (k = 0; k < K; k++) { 
    //  h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; 
    //  h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; 
    // } 
    //} 

    //int p; 
    //omp_set_nested(1); 
    //#pragma omp parallel for private(p) num_threads(4) 
    //for (int k = 0; k < K; k++) { 
    // #pragma omp parallel for num_threads(4) 
    // for (p = 0; p < N; p++) { 
    //  h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; 
    //  h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; 
    // } 
    //} 

    //for (int p = 0; p < N; p++) { 
    // for (int k = 0; k < K; k++) { 
    //  h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; 
    //  h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; 
    // } 
    //} 

    //for (int k = 0; k < K; k++) { 
    // for (int p = 0; p < N; p++) { 
    //  h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; 
    //  h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; 
    // } 
    //} 

    #pragma omp parallel for num_threads(8) 
    for (int p = 0; p < K * N; p++) { 
     int col = p % N; 
     int row = p/K; 
     h_xhatpruning[col * K + row][0] = h_xhatpruning_temp[col + row * N][0]; 
     h_xhatpruning[col * K + row][1] = h_xhatpruning_temp[col + row * N][1]; 
    } 

    //for (int p = 0; p < N; p += 2) { 
    // for (int k = 0; k < K; k++) { 
    //  for (int p0 = 0; p0 < 2; p0++) { 
    //   h_xhatpruning[(p + p0) * K + k][0] = h_xhatpruning_temp[(p + p0) + k * N][0]; 
    //   h_xhatpruning[(p + p0) * K + k][1] = h_xhatpruning_temp[(p + p0) + k * N][1]; 
    //  } 
    // } 
    //} 

} 

/********/ 
/* MAIN */ 
/********/ 
void main() { 

    int N = 10; 
    int K = 100000; 

    // --- CPU memory allocations 
    fftw_complex *h_x = (fftw_complex *)malloc(N  * sizeof(fftw_complex)); 
    fftw_complex *h_xzp = (fftw_complex *)calloc(N * K, sizeof(fftw_complex)); 
    fftw_complex *h_xpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    fftw_complex *h_xhatpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    fftw_complex *h_xhatpruning_temp = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    fftw_complex *h_xhat = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); 
    //double2  *h_xhatGPU = (double2 *)malloc(N * K * sizeof(double2)); 


    // --- Random number generation of the data sequence on the CPU - moving the data from CPU to GPU 
    srand(time(NULL)); 
    for (int k = 0; k < N; k++) { 
     h_x[k][0] = (double)rand()/(double)RAND_MAX; 
     h_x[k][1] = (double)rand()/(double)RAND_MAX; 
    } 
    //gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double2), cudaMemcpyHostToDevice)); 

    memcpy(h_xzp, h_x, N * sizeof(fftw_complex)); 

    // --- FFTW and cuFFT plans 
    fftw_plan h_plan_zp  = fftw_plan_dft_1d(N * K, h_xzp, h_xhat, FFTW_FORWARD, FFTW_ESTIMATE); 
    fftw_plan h_plan_pruning = fftw_plan_many_dft(1, &N, K, h_xpruning, NULL, 1, N, h_xhatpruning_temp, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE); 

    double totalTimeCPU = 0., totalTimeGPU = 0.; 
    double partialTimeCPU, partialTimeGPU; 

    /****************************/ 
    /* STANDARD APPROACH ON CPU */ 
    /****************************/ 
    printf("Number of processors available = %i\n", omp_get_num_procs()); 
    printf("Number of threads    = %i\n", omp_get_max_threads()); 

    TimingCPU timerCPU; 
    timerCPU.StartCounter(); 
    fftw_execute(h_plan_zp); 
    printf("\nStadard on CPU: \t \t %f\n", timerCPU.GetCounter()); 

    /******************/ 
    /* STEP #1 ON CPU */ 
    /******************/ 
    timerCPU.StartCounter(); 
    step1CPU(h_xpruning, h_x, N, K); 
    partialTimeCPU = timerCPU.GetCounter(); 
    totalTimeCPU = totalTimeCPU + partialTimeCPU; 
    printf("\nOptimized first step CPU: \t %f\n", totalTimeCPU); 

    /******************/ 
    /* STEP #2 ON CPU */ 
    /******************/ 
    timerCPU.StartCounter(); 
    fftw_execute(h_plan_pruning); 
    partialTimeCPU = timerCPU.GetCounter(); 
    totalTimeCPU = totalTimeCPU + partialTimeCPU; 
    printf("Optimized second step CPU: \t %f\n", timerCPU.GetCounter()); 

    /******************/ 
    /* STEP #3 ON CPU */ 
    /******************/ 
    timerCPU.StartCounter(); 
    step3CPU(h_xhatpruning, h_xhatpruning_temp, N, K); 
    partialTimeCPU = timerCPU.GetCounter(); 
    totalTimeCPU = totalTimeCPU + partialTimeCPU; 
    printf("Optimized third step CPU: \t %f\n", partialTimeCPU); 

    printf("Total time CPU: \t \t %f\n", totalTimeCPU); 

    double rmserror = 0., norm = 0.; 
    for (int n = 0; n < N; n++) { 
     rmserror = rmserror + (h_xhatpruning[n][0] - h_xhat[n][0]) * (h_xhatpruning[n][0] - h_xhat[n][0]) + (h_xhatpruning[n][1] - h_xhat[n][1]) * (h_xhatpruning[n][1] - h_xhat[n][1]); 
     norm = norm + h_xhat[n][0] * h_xhat[n][0] + h_xhat[n][1] * h_xhat[n][1]; 
    } 
    printf("\nrmserror %f\n", 100. * sqrt(rmserror/norm)); 

    fftw_destroy_plan(h_plan_zp); 

}

对于情况

N = 10 
K = 100000

我的时间是以下

Stadard on CPU:     23.895417 

Optimized first step CPU:  4.472087 
Optimized second step CPU:  4.926603 
Optimized third step CPU:  2.394958 
Total time CPU:     11.793648

来源

2016-11-18 15:39:12 JackOLantern

加速FFTW修剪以避免大量零填充

回答

相关问题