2016-08-21 209 views
1

我尝试用并发的方式计算积分,但程序最终比用普通 for 循环计算还要慢。我究竟做错了什么?(Golang 中的并发积分计算)

package main 

import (
    "fmt" 
    "math" 
    "sync" 
    "time" 
) 

// Result holds a running floating-point sum together with the lock that
// guards it while several goroutines add to it.
type Result struct { 
    // result is the accumulated sum of the sampled function values.
    result float64 
    // lock serializes updates to result; this program only calls Lock/Unlock.
    lock sync.RWMutex 
} 

// wg counts the goroutines started by main; every call to f signals Done on it.
var wg sync.WaitGroup 
// result is the shared accumulator that every call to f adds its sample to.
var result Result 

// main estimates the integral over [a, b] with the midpoint rule, launching
// one goroutine per sample point, then prints the estimate followed by the
// elapsed wall-clock time.
func main() {
	started := time.Now()

	a, b := 0.0, 1.0
	n := 100000.0
	deltax := (b - a) / n

	// Register every goroutine before starting any of them so that Wait
	// cannot return while samples are still outstanding.
	samples := int(n)
	wg.Add(samples)
	for k := 0; k < samples; k++ {
		go f(a, deltax, float64(k))
	}
	wg.Wait()

	// result.result holds the plain sum of samples; scale by the step width.
	fmt.Println(deltax * result.result)
	fmt.Println(time.Since(started))
}

func f(a float64, deltax float64, i float64) { 
    fx := math.Sqrt(a + deltax * (i + 0.5)) 
    result.lock.Lock() 
    result.result += fx 
    result.lock.Unlock() 
    wg.Done() 
} 

回答

2

除非 goroutine 中实际工作所花的时间远多于切换上下文、调度任务以及用互斥锁更新共享值所需的开销,否则串行执行会更快。

看看稍微修改后的版本。我所做的只是在f()函数中添加1微秒的延迟。

package main 

import (
    "fmt" 
    "math" 
    "sync" 
    "time" 
) 

// Result holds a running floating-point sum together with the lock that
// guards it when goroutines update it concurrently.
type Result struct { 
    // result is the accumulated sum of the sampled function values.
    result float64 
    // lock is taken by f only when it runs in concurrent mode.
    lock sync.RWMutex 
} 

// wg counts the goroutines started by concurrent(); f signals Done on it
// only when called in concurrent mode.
var wg sync.WaitGroup 
// result is the shared accumulator used by both the concurrent and the
// serial run; main resets it to zero between the two.
var result Result 

// main runs the concurrent benchmark and then the serial one, printing a
// label before each and clearing the shared accumulator in between.
func main() {
	runs := []struct {
		label string
		run   func()
	}{
		{"concurrent", concurrent},
		{"serial", serial},
	}

	for k, r := range runs {
		if k > 0 {
			// Start each later run from a zero sum so the results are
			// independent of the previous run.
			result.result = 0
		}
		fmt.Println(r.label)
		r.run()
	}
}

// concurrent estimates the integral over [a, b] with the midpoint rule using
// one goroutine per sample point, then prints the estimate and the elapsed
// wall-clock time.
func concurrent() {
	started := time.Now()

	a, b := 0.0, 1.0
	n := 100000.0
	deltax := (b - a) / n

	// Register all goroutines before launching them so Wait cannot return
	// early.
	samples := int(n)
	wg.Add(samples)
	for k := 0; k < samples; k++ {
		go f(a, deltax, float64(k), true)
	}
	wg.Wait()

	fmt.Println(deltax * result.result)
	fmt.Println(time.Since(started))
}

// serial estimates the same integral as concurrent but evaluates every sample
// point in the calling goroutine, then prints the estimate and the elapsed
// wall-clock time.
func serial() {
	started := time.Now()

	a, b := 0.0, 1.0
	n := 100000.0
	deltax := (b - a) / n

	samples := int(n)
	for k := 0; k < samples; k++ {
		// false: f adds to the shared sum without locking or touching wg.
		f(a, deltax, float64(k), false)
	}

	fmt.Println(deltax * result.result)
	fmt.Println(time.Since(started))
}

// f sleeps for one microsecond to simulate per-sample work, evaluates sqrt at
// the midpoint of the i-th sub-interval (of width deltax, starting from a),
// and adds the value to the shared result.
//
// When concurrent is true the update is made under result.lock and wg is
// signalled afterwards; when false the sum is updated directly.
func f(a, deltax, i float64, concurrent bool) {
	time.Sleep(1 * time.Microsecond)
	sample := math.Sqrt(a + deltax*(i+0.5))

	if !concurrent {
		// Single-goroutine path: no synchronization is performed.
		result.result += sample
		return
	}

	result.lock.Lock()
	result.result += sample
	result.lock.Unlock()
	wg.Done()
}

有了延迟,结果如下(并发版本快得多):

concurrent 
0.6666666685900424 
624.914165ms 

serial 
0.6666666685900422 
5.609195767s 

没有延迟:

concurrent 
0.6666666685900428 
50.771275ms 

serial 
0.6666666685900422 
749.166µs 

正如你所看到的,单个任务耗时越长,(在可行的情况下)并发执行它就越有意义。

2

3- 为了提升性能,可以按 CPU 核心数划分任务,而无需使用 lock sync.RWMutex:

使用 channel 和 runtime.NumCPU() 可获得 30 倍以上的优化:在 2 核上耗时 2ms,在 8 核上耗时 993µs;而你的示例代码在 2 核上需要 61ms,在 8 核上需要 40ms:

请看下面可运行的示例代码及其输出:

package main 

import (
    "fmt" 
    "math" 
    "runtime" 
    "time" 
) 

// main estimates the integral over [a, b] with the midpoint rule, splitting
// the n sample points into one contiguous range per CPU. Each range is summed
// by its own goroutine (see f) and the partial sums are collected over a
// buffered channel. It prints the CPU count, the elapsed time and the
// estimate.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	// Buffer one slot per worker so no worker blocks when sending its result.
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000.0
	deltax := (b - a) / n

	// Partition the sample indices [0, n) with integer arithmetic. Advancing
	// a float by n/nCPU produces fractional bounds whenever nCPU does not
	// divide n, which shifts the sample points off the midpoints, evaluates
	// extra samples, and can even start an additional goroutine because of
	// rounding. Integer bounds give exactly nCPU workers that cover every
	// index exactly once.
	steps, workers := int64(n), int64(nCPU)
	for w := int64(0); w < workers; w++ {
		lo := steps * w / workers
		hi := steps * (w + 1) / workers
		go f(float64(lo), float64(hi), a, deltax, ch)
	}

	// Exactly nCPU workers were started, so exactly nCPU results arrive.
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}

	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}

// f sums sqrt at the midpoints of the sub-intervals whose indices run from
// start (inclusive) up to stop (exclusive) in steps of one, where each
// sub-interval has width deltax and the first one begins at a. The partial
// sum is sent on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	var partial float64
	for k := start; k < stop; k++ {
		partial += math.Sqrt(a + deltax*(k+0.5))
	}
	ch <- partial
}

2 核上的输出:

nCPU = 2 
2.0001ms 
0.6666666685900485 

8 核上的输出:

nCPU = 8 
993µs 
0.6666666685900456 

你的示例代码在 2 核上的输出:

0.6666666685900424 
61.0035ms 

你的示例代码在 8 核上的输出:

0.6666666685900415 
40.9964ms 

2- 为了获得良好的基准统计,使用大量的样本(大N):

如下所示,当 n := 10000000.0 时,使用 2 个核心耗时 110ms,而在同一颗 CPU 上只使用 1 个核心则耗时 215ms。

使用 n := 10000000.0 和单个 goroutine 的可运行示例代码:

package main 

import (
    "fmt" 
    "math" 
    "time" 
) 

// main estimates the integral of sqrt over [a, b] with the midpoint rule in a
// single goroutine, then prints the elapsed time followed by the estimate.
func main() {
	started := time.Now()

	a, b := 0.0, 1.0
	n := 10000000.0
	deltax := (b - a) / n

	var sum float64
	for k := 0.0; k < n; k++ {
		// Sample at the midpoint of the k-th sub-interval.
		sum += math.Sqrt(a + deltax*(k+0.5))
	}

	fmt.Println(time.Since(started))
	fmt.Println(deltax * sum)
}

输出:

215.0123ms 
0.6666666666685884 

使用 n := 10000000.0 和 2 个 goroutine 的可运行示例代码:

package main 

import (
    "fmt" 
    "math" 
    "runtime" 
    "time" 
) 

// main estimates the integral over [a, b] with the midpoint rule using
// n = 10,000,000 sample points, split into one contiguous range per CPU.
// Each range is summed by its own goroutine (see f) and the partial sums are
// collected over a buffered channel. It prints the CPU count, the elapsed
// time and the estimate.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	// Buffer one slot per worker so no worker blocks when sending its result.
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 10000000.0
	deltax := (b - a) / n

	// Use integer range bounds: stepping a float by n/nCPU yields fractional
	// bounds when nCPU does not divide n, so samples drift off the midpoints,
	// extra samples are evaluated, and rounding may start one goroutine more
	// than the receive loop below expects.
	steps, workers := int64(n), int64(nCPU)
	for w := int64(0); w < workers; w++ {
		lo := steps * w / workers
		hi := steps * (w + 1) / workers
		go f(float64(lo), float64(hi), a, deltax, ch)
	}

	// Exactly nCPU workers were started, so exactly nCPU results arrive.
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}

	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}

// f accumulates sqrt evaluated at the midpoint of every sub-interval index
// from start (inclusive) to stop (exclusive), advancing by one each time;
// sub-intervals have width deltax and begin at a. The total is sent on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	sum := 0.0
	i := start
	for i < stop {
		sum += math.Sqrt(a + deltax*(i+0.5))
		i++
	}
	ch <- sum
}

输出:

nCPU = 2 
110.0063ms 
0.6666666666686073 

1- goroutine 的数量存在一个最佳点,超过这个点之后再增加 goroutine 的数量并不会缩短程序的执行时间:

在 2 核 CPU 上运行下面的代码,结果如下:

nCPU: 1,   2,   4,   8,   16 
Time: 2.16s, 1.1220642s, 1.1060633s, 1.1140637s, 1.1380651s 

可以看到,从 nCPU=1 到 nCPU=2 时耗时大幅减少,但此后几乎不再下降。因此对这段示例代码而言,nCPU=2 就是 2 核 CPU 上的最佳点,在这里使用 nCPU := runtime.NumCPU() 就足够了。

package main 

import (
    "fmt" 
    "math" 
    "time" 
) 

// main estimates the integral over [a, b] with the midpoint rule using
// n = 100,000,000 sample points and a hand-picked number of worker
// goroutines, so the effect of the worker count on run time can be measured.
// It prints the worker count, the elapsed time and the estimate.
func main() {
	// Worker count under test. Timings reported for a 2-core CPU:
	// 2.16s@1 1.1220642s@2 1.1060633s@4 1.1140637s@8 1.1380651s@16
	nCPU := 2
	fmt.Println("nCPU =", nCPU)
	// Buffer one slot per worker so no worker blocks when sending its result.
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	n := 100000000.0
	deltax := (b - a) / n

	// Use integer range bounds: stepping a float by n/nCPU yields fractional
	// bounds when nCPU does not divide n, so samples drift off the midpoints,
	// extra samples are evaluated, and rounding may start one goroutine more
	// than the receive loop below expects.
	steps, workers := int64(n), int64(nCPU)
	for w := int64(0); w < workers; w++ {
		lo := steps * w / workers
		hi := steps * (w + 1) / workers
		go f(float64(lo), float64(hi), a, deltax, ch)
	}

	// Exactly nCPU workers were started, so exactly nCPU results arrive.
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}

	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}

// f computes one worker's share of the midpoint-rule sum: for every index
// from start (inclusive) to stop (exclusive), stepping by one, it adds sqrt
// at the midpoint of that sub-interval (width deltax, origin a). The share is
// sent on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	share := 0.0
	for idx := start; idx < stop; idx += 1 {
		midpoint := a + deltax*(idx+0.5)
		share += math.Sqrt(midpoint)
	}
	ch <- share
}
+0

@ husain-al-marzooq我希望这会有所帮助。 – 2016-08-21 16:43:16