3- 为了提升性能,可以按 CPU 核心划分任务,而无需使用锁 sync.RWMutex:
使用信道和 runtime.NumCPU() 进行优化后,速度提升 30 倍以上:
在 2 核上耗时 2ms,在 8 核上耗时 993µs,
而你的示例代码在 2 核上耗时 61ms,在 8 核上耗时 40ms。
参见下面可运行的示例代码及其输出:
package main
import (
"fmt"
"math"
"runtime"
"time"
)
// main approximates the integral of sqrt(x) over [a, b] with the midpoint
// rule. The n sample points are split across one goroutine per CPU; the
// elapsed time and the resulting integral are printed.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	// Buffered so every worker can deliver its partial sum without blocking.
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	const n = 100000 // number of midpoint samples
	deltax := (b - a) / n
	// Chunk boundaries are computed with integers so that exactly nCPU
	// workers are started and every sample index in [0, n) is visited exactly
	// once, even when nCPU does not divide n. Fractional float boundaries
	// would shift the sample points and make the workers overlap.
	workers := int64(nCPU)
	for w := int64(0); w < workers; w++ {
		start := w * n / workers
		stop := (w + 1) * n / workers
		go f(float64(start), float64(stop), a, deltax, ch)
	}
	// Collect one partial sum per worker.
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f sums math.Sqrt(a + deltax*(i+0.5)) for i = start, start+1, ... while
// i < stop, and sends the total on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	var sum float64
	x := start
	for x < stop {
		sum += math.Sqrt(a + deltax*(x+0.5))
		x++
	}
	ch <- sum
}
2 核上的输出:
nCPU = 2
2.0001ms
0.6666666685900485
8 核上的输出:
nCPU = 8
993µs
0.6666666685900456
你的示例代码在 2 核上的输出:
0.6666666685900424
61.0035ms
你的示例代码在 8 核上的输出:
0.6666666685900415
40.9964ms
2- 为了获得可靠的基准测试统计,请使用大量样本(较大的 n):
如下所示,当 n := 10000000.0 时,使用 2 个核心耗时 110ms,
而在同一 CPU 上只使用 1 个核心则耗时 215ms。
当 n := 10000000.0 且只使用单个 goroutine 时,参见下面可运行的示例代码:
package main
import (
"fmt"
"math"
"time"
)
// main computes the midpoint-rule sum of sqrt(x) over [0, 1] with 10000000
// sample points on a single goroutine, then prints the elapsed time and the
// resulting integral.
func main() {
	began := time.Now()
	lo, hi := 0.0, 1.0
	samples := 10000000.0
	width := (hi - lo) / samples
	var sum float64
	x := 0.0
	for x < samples {
		sum += math.Sqrt(lo + width*(x+0.5))
		x++
	}
	fmt.Println(time.Since(began))
	fmt.Println(width * sum)
}
输出:
215.0123ms
0.6666666666685884
当 n := 10000000.0 且使用 2 个 goroutine 时,参见下面可运行的示例代码:
package main
import (
"fmt"
"math"
"runtime"
"time"
)
// main approximates the integral of sqrt(x) over [a, b] with the midpoint
// rule. The n sample points are split across one goroutine per CPU; the
// elapsed time and the resulting integral are printed.
func main() {
	nCPU := runtime.NumCPU()
	fmt.Println("nCPU =", nCPU)
	// Buffered so every worker can deliver its partial sum without blocking.
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	const n = 10000000 // number of midpoint samples
	deltax := (b - a) / n
	// Chunk boundaries are computed with integers so that exactly nCPU
	// workers are started and every sample index in [0, n) is visited exactly
	// once, even when nCPU does not divide n. Fractional float boundaries
	// would shift the sample points and make the workers overlap.
	workers := int64(nCPU)
	for w := int64(0); w < workers; w++ {
		start := w * n / workers
		stop := (w + 1) * n / workers
		go f(float64(start), float64(stop), a, deltax, ch)
	}
	// Collect one partial sum per worker.
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f accumulates math.Sqrt(a + deltax*(i+0.5)) over i = start, start+1, ...
// for as long as i < stop, then delivers the accumulated value on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	partial := 0.0
	for idx := start; idx < stop; idx += 1 {
		partial += math.Sqrt(a + deltax*(idx+0.5))
	}
	ch <- partial
}
输出:
nCPU = 2
110.0063ms
0.6666666666686073
1- goroutine 的数量存在一个最佳点,超过该点后再增加 goroutine 的数量并不会减少程序执行时间:
在 2 核 CPU 上,使用下面的代码,结果如下:
nCPU: 1, 2, 4, 8, 16
Time: 2.16s, 1.1220642s, 1.1060633s, 1.1140637s, 1.1380651s
从 nCPU=1 到 nCPU=2,耗时的减少足够明显,但在此之后变化不大,
因此在 2 核 CPU 上 nCPU=2 就是此示例代码的最佳点,
所以这里使用 nCPU := runtime.NumCPU() 就足够了。
package main
import (
"fmt"
"math"
"time"
)
// main approximates the integral of sqrt(x) over [a, b] with the midpoint
// rule. The n sample points are split across nCPU goroutines; the elapsed
// time and the resulting integral are printed.
func main() {
	// Timings on a 2-core CPU by worker count:
	// 2.16s@1 1.1220642s@2 1.1060633s@4 1.1140637s@8 1.1380651s@16
	nCPU := 2
	fmt.Println("nCPU =", nCPU)
	// Buffered so every worker can deliver its partial sum without blocking.
	ch := make(chan float64, nCPU)
	startTime := time.Now()
	a := 0.0
	b := 1.0
	const n = 100000000 // number of midpoint samples
	deltax := (b - a) / n
	// Chunk boundaries are computed with 64-bit integers so that exactly nCPU
	// workers are started and every sample index in [0, n) is visited exactly
	// once, even when nCPU does not divide n. Fractional float boundaries
	// would shift the sample points and make the workers overlap.
	workers := int64(nCPU)
	for w := int64(0); w < workers; w++ {
		start := w * n / workers
		stop := (w + 1) * n / workers
		go f(float64(start), float64(stop), a, deltax, ch)
	}
	// Collect one partial sum per worker.
	integral := 0.0
	for i := 0; i < nCPU; i++ {
		integral += <-ch
	}
	fmt.Println(time.Since(startTime))
	fmt.Println(deltax * integral)
}
// f adds up math.Sqrt(a + deltax*(i+0.5)) for every i reached by stepping
// from start in increments of one while i stays below stop, and sends the
// total on ch.
func f(start, stop, a, deltax float64, ch chan float64) {
	total := 0.0
	pos := start
	for {
		if !(pos < stop) {
			break
		}
		total += math.Sqrt(a + deltax*(pos+0.5))
		pos++
	}
	ch <- total
}
@ husain-al-marzooq我希望这会有所帮助。 – 2016-08-21 16:43:16