C＃TPL比C++ PPL更快吗？

我编写了一个非常简单的应用程序所使用的斐波那契fonction比较TPL的Parallel.ForEach VS PPL的parallel_for_each，结果真的很奇怪，在PC上有8个内核，C＃的是11秒更快那么C++。C＃TPL比C++ PPL更快吗？

两个VS2010和VS 2011的预览相同的结果。

C＃代码：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Collections.Concurrent; 
using System.Threading.Tasks; 
using System.Diagnostics; 

namespace ConsoleApplication1 
{ 
    class Program 
    { 

      static void Main(string[] args) 
      { 
       var ll = new ConcurrentQueue<Tuple<int, int>>(); 
       var a = new int[12] { 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 }; 

       long elapsed = time_call(() => 
       { 
        Parallel.ForEach(a, (n) => { ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); }); 
       }); 

       Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r"); 
       foreach (var ss in ll) 
       { 
        Console.WriteLine(String.Format("fib<{0}>: {1}", ss.Item1, +ss.Item2)); 
       } 

       Console.ReadLine(); 
      } 

      static long time_call(Action f) 
      { 
       var p = Stopwatch.StartNew(); 
       p.Start(); 
       f(); 
       p.Stop(); 
       return p.ElapsedMilliseconds; 
      } 

      Computes the nth Fibonacci number. 
      static int fibonacci(int n) 
      { 
       if (n < 2) return n; 
       return fibonacci(n - 1) + fibonacci(n - 2); 
      } 
     } 
    }

C++代码：

#include <windows.h> 
#include <ppl.h> 
#include <concurrent_vector.h> 
#include <array> 
#include <tuple> 
#include <algorithm> 
#include <iostream> 

using namespace Concurrency; 
using namespace std; 

template <class Function> 
__int64 time_call(Function&& f) { 
    __int64 begin = GetTickCount(); 
    f(); 
    return GetTickCount() - begin; 
} 

// Computes the nth Fibonacci number. 
int fibonacci(int n) { 
    if (n < 2) return n; 
    return fibonacci(n-1) + fibonacci(n-2); 
} 

int wmain() { 
    __int64 elapsed; 
    array<int, 12> a ={ 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 }; 
    concurrent_vector<tuple<int,int>> results2; 

    elapsed = time_call([&]{ 
     parallel_for_each(a.begin(), a.end(), [&](int n) { 
      results2.push_back(make_tuple(n, fibonacci(n))); 
     }); 
    }); 

    wcout << L"PPL time: " << elapsed << L" ms" << endl << endl; 
    for_each (results2.begin(), results2.end(), [](tuple<int,int>& pair) { 
     wcout << L"fib(" << get<0>(pair) << L"): " << get<1>(pair) << endl; 
    }); 

    cin.ignore(); 
}

能否请你点我，在这里我的C++的一部分代码，我错了？

宽度group_task我具有相同的时间如C＃代码：

task_group tasks; 
    elapsed = time_call([&] 
    { 
     for_each(begin(a), end(a), [&](int n) 
     { 
      tasks.run([&,n]{results2.push_back(make_tuple(n, fibonacci(n)));}); 
     }); 
     tasks.wait();

来源

2012-01-03 user1127781

您能否发布格式正确的源代码？这些陈述之间的空白联系尤其令人恼火，因为它们迫使我滚动很多。 – 2012-01-03 12:28:10

对我而言，跳出的是您在示例中使用的不同集合。 C＃版本使用队列，而C++版本使用向量。 – 2012-01-03 13:00:35

另外，你是否只在每个例子中定时调用'fibonacci'函数？ – 2012-01-03 13:01:43

的GetTickCount（http://msdn.microsoft.com/en-us/library/windows/desktop/ms724408%28v=vs。 85％29.aspx）用于测量在本地传递的时间的函数并不准确。该描述是这样说的：

“GetTickCount函数的分辨率被限制为系统定时器，这是通常在10毫秒到16毫秒的范围内的分辨率。”

从我的经验API使用GetSystemTime产生在Windows Vista更好的效果和最多（在Win XP中有还挺相同的分辨率，如果剔计数我没记错的话）。或者更好的是，您可以使用细粒度测量，其他API提供小于mils的分辨率。可能在您的情况下，构建大型数据集会更有帮助，从而获得一些有意义的数据。

来源

2012-01-03 14:14:23 Ghita

我认为这个问题既不GetTickCount函数也不是并发容器，如果我将parallel_for_each C++代码更改为 task_group我有同样的时间喜欢c＃代码。 – user1127781 2012-01-03 16:42:52

根据OP，两者之间有11秒的差异，所以它可能与滴答计数的准确性无关。 – 2013-06-05 21:42:54

这里是由拉胡尔v帕蒂尔微软团队

你好的解释，

感谢提出这个问题。事实上，您已经确定了与*的默认并行相关联的开销 - 尤其是当迭代次数很少且工作大小可变时。默认并行开始通过将工作分解为8 块（在8个内核上）。工作结束后，工作将动态地进行，负载平衡。默认工作在大多数情况下，大（大量迭代），并在每次迭代的基础工作不顺利了解（假设你叫成库） - 但它不来在某些情况下无法接受的开销。

该解决方案是完全在您的备用 implemtnation已经确定了什么。为此，我们将有一个平行的分区称为在Visual Studio的下一个版本的“简单”，这将是类似于你描述的替代实现，将有更好的性能。

PS：对于每个实现的C＃和C++并行使用略有不同算法，他们如何去通过重复 - 因此你会看到根据工作量稍有不同的性能特征。

问候

来源

2012-01-04 08:12:42 user1127781

有一些问题与您的代码，让我们解决这些问题一个接一个：

使用递归计算斐波纳契使进程使用的内存过多的，因为它是使用调用堆栈来计算结果。递归斐波那契意味着你没有比较C＃/ C++并行框架，你正在比较调用栈机制。你可以更快地计算斐波那契：

int fibonacci(int n) 
{ 
    int curr = 1, prev = 0, total = 0; 
    for (int i = 0; i < n; i++) 
    { 
     int pc = curr; 
     curr += prev; 
     total += curr; 
     prev = pc; 
    } 
    return total; 
}

有了这个函数，我必须运行至少100万次才能获得可测量的值。

使用GetTickCount64代替的GetTickCount：

template <class Function> 
__int64 time_call(Function&& f) 
{ 
    __int64 begin = ::GetTickCount64(); 
    f(); 
    return GetTickCount64() - begin; 
}

运行parallel_for时有这样的小身体可能实际上会降低性能。最好使用更细粒度的函子。

有了这些特质，这里是C++代码：

// ParallelFibo.cpp : Defines the entry point for the console application. 
// 

#include "stdafx.h" 
#include <windows.h> 
#include <ppl.h> 
#include <concurrent_vector.h> 
#include <array> 
#include <tuple> 
#include <algorithm> 
#include <iostream> 
#include <random> 

using namespace Concurrency; 
using namespace std; 

template <class Function> 
__int64 time_call(Function&& f) 
{ 
    __int64 begin = ::GetTickCount64(); 
    f(); 
    return GetTickCount64() - begin; 
} 

// Computes the nth Fibonacci number. 
inline int fibonacci(int n) 
{ 
    int curr = 1, prev = 0, total = 0; 
    for (int i = 0; i < n; i++) 
    { 
     int pc = curr; 
     curr += prev; 
     total += curr; 
     prev = pc; 
    } 
    return total; 
} 

#define NUMBER_OF_REPETITIONS 1000000 
#define MIN_FIBO    37 
#define MAX_FIBO    49 

int main() 
{ 
    __int64 elapsed; 
    vector<int> v; 
    for (int i = MIN_FIBO; i < MAX_FIBO; i++) 
    { 
     v.emplace_back(i); 
    } 

    concurrent_vector<tuple<int, int>> results; 
    elapsed = time_call([&] { 
     parallel_for(MIN_FIBO, MAX_FIBO, [&](int n) { 
      for (int i = 0; i < NUMBER_OF_REPETITIONS; i++) 
      { 
       results.push_back(make_tuple(n, fibonacci(n))); 
      } 
     }); 
    }); 
    wcout << L"PPL time: " << elapsed << L" ms" << endl << endl; 
    cin.ignore(); 
    return 0; 
}

这里是在C＃代码：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Threading.Tasks; 
using System.Collections.Concurrent; 
using System.Diagnostics; 

namespace ParallelFiboCS 
{ 
    class Program 
    { 
     const int NUMBER_OF_REPETITIONS = 1000000; 
     const int MIN_FIBO = 37; 
     const int MAX_FIBO = 49; 
     static void Main(string[] args) 
     { 
      var ll = new ConcurrentQueue<Tuple<int, int>>(); 

      var a = new int[MAX_FIBO - MIN_FIBO]; 
      for (int i = MIN_FIBO; i < MAX_FIBO; i++) 
      { 
       a[i - MIN_FIBO] = i; 
      } 

      long elapsed = time_call(() => 
      { 
       Parallel.ForEach(a, (n) => 
       { 
        for (int i = 0; i < NUMBER_OF_REPETITIONS; i++) 
        { 
         ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); 
        } 
       }); 
      }); 

      Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r"); 

      Console.ReadLine(); 
     } 

     static long time_call(Action f) 
     { 
      var p = Stopwatch.StartNew(); 
      p.Start(); 
      f(); 
      p.Stop(); 
      return p.ElapsedMilliseconds; 
     } 

     static int fibonacci(int n) 
     { 
      int curr = 1, prev = 0, total = 0; 
      for (int i = 0; i < n; i++) 
      { 
       int pc = curr; 
       curr += prev; 
       total += curr; 
       prev = pc; 
      } 
      return total; 
     } 
    } 
}

平均时间运行1200万个之间的数字斐波那契数计算37和49：

C++：513ms

C＃：2527ms

来源

2015-07-17 22:59:59

C＃TPL比C++ PPL更快吗？

回答

相关问题