如何让Julia代码更高效？目前它的表现甚至比Python差

我是Julia的新手，并且一直在试验它，因为我知道它有惊人的表现。但我仍然要体验那些承诺的表演。我已经尝试了很多提高“JULIA HIGH PERFORMANCE”一书中描述的性能的方法，这使得代码的可读性略差。但是，我的python代码比我的Julia代码快得多，至少比基准情况快了3倍。要么我在做一些非常错误的代码，这些代码在Julia中一定是罪恶，或者Julia无法做到。请稍后证明我错了。如何让Julia代码更高效？目前它的表现甚至比Python差

我想在代码中做的是将不同的球分配到不同的盒子中，每个盒子的容量有最大和最小限制。球放置在盒子中的顺序也很重要。我需要在尽可能短的时间内用给定的约束条件生成所有可能的作业。

Python代码：

import itertools 
import time 

max_balls = 5 
min_balls = 0 

def get_assignments(balls, boxes, assignments=[[]]): 
    all_assignments = [] 
    upper_ball_limit = len(balls) 
    if upper_ball_limit > max_balls: 
     upper_ball_limit = max_balls 

    n_boxes = len(boxes) 
    lower_ball_limit = len(balls) - upper_ball_limit * (n_boxes - 1) 
    if lower_ball_limit < min_balls: 
     lower_ball_limit = min_balls 

    if len(boxes) == 0: 
     raise Exception("No delivery boys found") 

    elif len(boxes) == 1: 
     for strategy in itertools.permutations(balls, upper_ball_limit): 
      #    valid = evaluate_strategy(strategy, db_id) 
      for subplan in assignments: 
       subplan_copy = subplan[:] 
       box_id = boxes[0] 
       subplan_copy.append((box_id, strategy)) 
       all_assignments.append(subplan_copy) 
     return all_assignments 
    else: 
     box_id = boxes[0] 
     for i in range(lower_ball_limit, upper_ball_limit+ 1): 
      for strategy in itertools.permutations(balls, i): 
       temp_plans = [] 
       for subplan in assignments: 
        subplan_copy = subplan[:] 
        subplan_copy.append((box_id, strategy)) 
        temp_plans.append(subplan_copy) 
        remaining_balls = set(balls).difference(strategy) 
        remaining_boxes = list(set(boxes).difference([box_id])) 
        if remaining_balls: 
         all_assignments.extend(get_assignments(remaining_balls, remaining_boxes, temp_plans)) 
        else: 
         all_assignments.extend(temp_plans) 
     return all_assignments 

balls = range(1, 9) 
boxes = [1, 2, 3, 4] 

t = time.time() 
all_assignments = get_assignments(balls, boxes) 

print('Number of assignments: %s' % len(all_assignments)) 
print('Time taken: %s' % (time.time()-t))

这里是我写的JULIA CODE为上述的尝试。

#!/usr/bin/env julia 

using Combinatorics 

const max_balls=5 
const min_balls=0 

function plan_assignments(balls::Vector{Int32}, boxes ; plans=[Vector{Tuple{Int32,Array{Int32,1}}}(length(boxes))]) 
const n_boxes = length(boxes) 
const n_balls = length(balls) 
const n_plans = length(plans) 

if n_boxes*max_balls < n_balls 
    print("Invalid Inputs: Number of balls exceed the number of boxes.") 
end 

all_plans = Vector{Tuple{Int32,Array{Int32,1}}}[] 
upper_box_limit = n_balls 
if upper_box_limit > max_balls 
    upper_box_limit = max_balls 
end 
lower_box_limit = n_balls - upper_box_limit * (n_boxes-1) 

if lower_box_limit < min_balls 
    lower_box_limit = min_balls 
end 

if n_boxes == 1 
    box_id = boxes[1] 
    @inbounds for strategy in Combinatorics.permutations(balls, upper_box_limit) 
     @inbounds for subplan in plans 
      subplan = subplan[:] 
      subplan[tn_boxes - n_boxes + 1] = (box_id, strategy) 
      all_plans = push!(all_plans, subplan) 
     end 
    end 
    return all_plans 

else 
    box_id = boxes[1] 
    @inbounds for i in lower_box_limit:upper_box_limit 
     @inbounds for strategy in Combinatorics.permutations(balls, i) 
      temp_plans = Array{Vector{Tuple{Int32,Array{Int32,1}}},1}(n_plans) 
      # temp_plans = [] 

      @inbounds for (i,subplan) in zip(1:n_plans, plans) 
       subplan = subplan[:] 
       subplan[tn_boxes - n_boxes + 1] = (box_id, strategy) 
       temp_plans[i] = subplan 
       # subplan = push!(subplan, (db_id, strategy)) 
       # temp_plans = push!(temp_plans, subplan) 

       remaining_balls = filter((x) -> !(x in strategy), balls) 
       remaining_boxes = filter((x) -> x != box_id , boxes) 

       if length(remaining_balls) > 0 
        @inbounds for plan in plan_assignments(remaining_balls, remaining_boxes, plans=temp_plans) 
        push!(all_plans, plan) 
        end 
        # append!(all_plans, plan_assignments(remaining_orders, remaining_delivery_boys, plans=temp_plans)) 
       else 
        @inbounds for plan in temp_plans 
        push!(all_plans, plan) 
        end 
        # append!(all_plans, temp_plans) 

       end 
      end 
     end 
    end 
    return all_plans 
end 
end 

balls = Int32[1,2,3,4,5,6,7,8] 
boxes = Int32[1,2,3,4] 

const tn_boxes = length(boxes) 

@timev all_plans = plan_assignments(balls, boxes) 
print(length(all_plans))

我的基准时机如下：

对于Python：

Number of assignments: 5040000 
Time taken: 22.5003659725

对于朱莉娅：（这是当贴现的编译时间）

76.986338 seconds (122.94 M allocations: 5.793 GB, 77.01% gc time) 
elapsed time (ns): 76986338257 
gc time (ns):  59287603693 
bytes allocated: 6220111360 
pool allocs:  122932049 
non-pool GC allocs:10587 
malloc() calls: 11 
realloc() calls: 18 
GC pauses:   270 
full collections: 28

来源

2016-11-18 manofsins

在垃圾收集上花费了77％的时间......看起来问题在于您正在创建大量对象。 CPython通过与GC一起使用引用计数来避免这么大的打击。请注意，77秒中的23％约为17秒，因此如果您不计算在gc中花费的时间，Julia会比CPython快。不幸的是，我不知道朱莉娅，所以我不能告诉你为什么这么多时间花在gc上。 – Bakuriu

'all_plans = push！（all_plans，subplan）'做你想做的事情吗？ – daycaster

您是否需要在'subplan = subplan [：]'中获取子计划副本？ –

这是Julia中的另一个版本，更具惯用性并被修改以避免递归和一些分配：

using Iterators 
using Combinatorics 

histograms(n,m) = [diff([0;x;n+m]).-1 for x in combinations([1:n+m-1;],m-1)] 

good_histograms(n,m,minval,maxval) = 
    [h for h in histograms(n,m) if all(maxval.>=h.>=minval)] 

typealias PlanGrid Matrix{SubArray{Int,1,Array{Int,1},Tuple{UnitRange{Int}},true}} 

function assignmentplans(balls,boxes,minballs,maxballs) 
    nballs, nboxes = length(balls),length(boxes) 
    nperms = factorial(nballs) 
    partslist = good_histograms(nballs,nboxes,minballs,maxballs) 
    plans = PlanGrid(nboxes,nperms*length(partslist)) 
    permutationvector = vec([balls[p[i]] for i=1:nballs,p in permutations(balls)]) 
    i1 = 1 
    for parts in partslist 
    i2 = 0 
    breaks = [(x[1]+1:x[2]) for x in partition(cumsum([0;parts]),2,1)] 
    for i=1:nperms 
     for j=1:nboxes 
     plans[j,i1] = view(permutationvector,breaks[j]+i2) 
     end 
     i1 += 1 
     i2 += nballs 
    end 
    end 
    return plans 
end

对于时间我们得到：

julia> assignmentplans([1,2],[1,2],0,2) # a simple example 
2×6 Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},2}: 
Int64[] Int64[] [1] [2] [1,2] [2,1] 
[1,2] [2,1] [2] [1] Int64[] Int64[] 

julia> @time plans = assignmentplans([1:8;],[1:4;],0,5); 
    8.279623 seconds (82.28 M allocations: 2.618 GB, 14.07% gc time) 

julia> size(plans) 
(4,5040000) 

julia> plans[:,1000000]     # sample ball configuration 
4-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}: 
Int64[]  
[7,3,8,2,5] 
[4]   
[6,1]

计时，当然，每设置各不相同，但是这应该是要快得多。这不完全是苹果比较，但它计算相同的东西。在评论中欢迎海报（或其他人）机器的时间安排。

来源

2016-11-19 06:57:30

感谢您的解决方案。我想这可能是因为，如果以正确的方式书写，朱莉娅只有很快。这并不是普遍无视你写作的方式。虽然我仍然必须写出你已经给出的解决方案，并在Python中进行比较，以给出最终结论。顺带一提，我在MacBook Pro 2015上获得了14.367281秒（82.29 M分配：2.618 GB，52.42％gc时间），并带有16 GB内存和i7处理器。当你只有14％的时候，不要为什么'gc time'太多了。任何线索？ – manofsins

@manofsins是的，我认为其他的看法是，语言之间的翻译应该寻找惯用的等值而不是文字转换，改进算法可以产生比试图加速效率低的好处更多的好处。（我仍然认为你的原始版本做了太多的任务，但是......） – daycaster

“如果以正确的方式书写，朱莉娅只有很快”，是和不是。 *不良*代码会在任何*语言上遇到瓶颈，并且存在所有有效的，非标准的，特定于语言的优化。有一个着名的博文“使用python与许多神秘的python黑客”击败“julia。Julia的说法是* julia * * standard *，unvectorised代码比* standard * python/matlab代码更快（实际上，推荐！）（矢量化或没有）。但是，是的，标准良好实践仍然很重要（例如避免昂贵的重新分配），并且不会，故意低效的代码不会神奇地“加速”。 –

我对Dan Getz的代码做了一些小的修改，以便使其类型稳定。主要的问题是，Combinatorics.permutations和Iterators.partition不是类型稳定，所以我不得不写的这些类型的稳定版本，以及（我改变Combinatorics.combinations实际上是不必要的）

import Base: start, next, done, eltype, length, size 
import Base: iteratorsize, SizeUnknown, IsInfinite, HasLength 

import Combinatorics: factorial, combinations 

# Copied from Combinatorics 
# Only one small change in the `nextpermutation` method 

immutable Permutations{T} 
    a::T 
    t::Int 
end 

eltype{T}(::Type{Permutations{T}}) = Vector{eltype(T)} 

length(p::Permutations) = (0 <= p.t <= length(p.a))?factorial(length(p.a), length(p.a)-p.t):0 

""" 
Generate all permutations of an indexable object. Because the number of permutations can be very large, this function returns an iterator object. Use `collec 
t(permutations(array))` to get an array of all permutations. 
""" 
permutations(a) = Permutations(a, length(a)) 

""" 
Generate all size t permutations of an indexable object. 
""" 
function permutations(a, t::Integer) 
    if t < 0 
     t = length(a) + 1 
    end 
    Permutations(a, t) 
end 

start(p::Permutations) = [1:length(p.a);] 
next(p::Permutations, s) = nextpermutation(p.a, p.t ,s) 

function nextpermutation(m, t, state) 
    s = copy(state)      # was s = copy(s) a few lines down 
    perm = [m[s[i]] for i in 1:t] 
    n = length(s) 
    if t <= 0 
     return(perm, [n+1]) 
    end 
    if t < n 
     j = t + 1 
     while j <= n && s[t] >= s[j]; j+=1; end 
    end 
    if t < n && j <= n 
     s[t], s[j] = s[j], s[t] 
    else 
     if t < n 
      reverse!(s, t+1) 
     end 
     i = t - 1 
     while i>=1 && s[i] >= s[i+1]; i -= 1; end 
     if i > 0 
      j = n 
      while j>i && s[i] >= s[j]; j -= 1; end 
      s[i], s[j] = s[j], s[i] 
      reverse!(s, i+1) 
     else 
      s[1] = n+1 
     end 
    end 
    return(perm, s) 
end 

done(p::Permutations, s) = !isempty(s) && max(s[1], p.t) > length(p.a) || (isempty(s) && p.t > 0) 

# Copied from Iterators 
# Returns an `Array` of `Array`s instead of `Tuple`s now 

immutable Partition{I} 
    xs::I 
    step::Int 
    n::Int 

end 

iteratorsize{T<:Partition}(::Type{T}) = SizeUnknown() 

eltype{I}(::Type{Partition{I}}) = Vector{eltype(I)} 

function partition{I}(xs::I, n::Int, step::Int = n) 
    Partition(xs, step, n) 
end 

function start{I}(it::Partition{I}) 
    N = it.n 
    p = Vector{eltype(I)}(N) 
    s = start(it.xs) 
    for i in 1:(N - 1) 
     if done(it.xs, s) 
      break 
     end 
     (p[i], s) = next(it.xs, s) 
    end 
    (s, p) 
end 

function next{I}(it::Partition{I}, state) 
    N = it.n 
    (s, p0) = state 
    (x, s) = next(it.xs, s) 
    ans = p0; ans[end] = x 

    p = similar(p0) 
    overlap = max(0, N - it.step) 
    for i in 1:overlap 
     p[i] = ans[it.step + i] 
    end 

    # when step > n, skip over some elements 
    for i in 1:max(0, it.step - N) 
     if done(it.xs, s) 
      break 
     end 
     (x, s) = next(it.xs, s) 
    end 

    for i in (overlap + 1):(N - 1) 
     if done(it.xs, s) 
      break 
     end 

     (x, s) = next(it.xs, s) 
     p[i] = x 
    end 

    (ans, (s, p)) 
end 

done(it::Partition, state) = done(it.xs, state[1]) 

# Copied from the answer of Dan Getz 
# Added types to comprehensions and used Vector{Int} instead of Int in vcat  

typealias PlanGrid Matrix{SubArray{Int,1,Array{Int,1},Tuple{UnitRange{Int}},true}} 

histograms(n,m) = [diff(vcat([0],x,[n+m])).-1 for x in combinations([1:n+m-1;],m-1)] 

good_histograms(n,m,minval,maxval) = 
    Vector{Int}[h for h in histograms(n,m) if all(maxval.>=h.>=minval)] 

minballs = 0 
maxballs = 5 

function assignmentplans(balls,boxes,minballs,maxballs) 
    nballs, nboxes = length(balls),length(boxes) 
    nperms = factorial(nballs) 
    partslist = good_histograms(nballs,nboxes,minballs,maxballs) 
    plans = PlanGrid(nboxes,nperms*length(partslist)) 
    permutationvector = vec([balls[p[i]] for i=1:nballs,p in permutations(balls)]) 
    i1 = 1 
    for parts in partslist 
    i2 = 0 
    breaks = UnitRange{Int64}[(x[1]+1:x[2]) for x in partition(cumsum(vcat([0],parts)),2,1)] 
    for i=1:nperms 
     for j=1:nboxes 
     plans[j,i1] = view(permutationvector,breaks[j]+i2) 
     end 
     i1 += 1 
     i2 += nballs 
    end 
    end 
    return plans 
end 

@time plans = assignmentplans(1:8, 1:4, 0, 5)

在第一次运行的结果是（由于gc，时间变化很大）

1.589867 seconds (22.02 M allocations: 1.127 GB, 46.50% gc time) 
4×5040000 Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},2}: 
Int64[]  Int64[]  Int64[]  Int64[]  Int64[]  Int64[]  … [8,7,6,5,4] [8,7,6,5,4] [8,7,6,5,4] [8,7,6,5,4] [8,7,6,5,4] 
Int64[]  Int64[]  Int64[]  Int64[]  Int64[]  Int64[]   [1,3,2]  [2,1,3]  [2,3,1]  [3,1,2]  [3,2,1]  
[1,2,3]  [1,2,3]  [1,2,3]  [1,2,3]  [1,2,3]  [1,2,3]   Int64[]  Int64[]  Int64[]  Int64[]  Int64[]  
[4,5,6,7,8] [4,5,6,8,7] [4,5,7,6,8] [4,5,7,8,6] [4,5,8,6,7] [4,5,8,7,6]  Int64[]  Int64[]  Int64[]  Int64[]  Int64[]

我没有彻底测试这些变化。另外，我不明白，为什么s = copy(s)在combinations工作，但不在permutations。有趣的是，如果我试图使原始版本稳定（仍然> 40秒，85％gc时间），那么改进仅有微乎其微的改进。

来源

2016-11-20 21:03:12 tim

谢谢蒂姆。有没有可能将标准软件包中类型稳定的组合部分做成部分？感觉像高优先级的东西。 –

如果您可以编写一些测试来演示您的版本相对于当前标准的优越性，那么可以考虑在Combinatorics.jl回购中提交请求！ –

我只是用户，不太了解朱莉娅的发展。我想，增加更多的功能也很重要，一些当前的类型不稳定性会随着基础的改进而消失，例如，已经有一个公共固定'vcat'用于混合参数类型。而对于其他方面来说，报告这些问题或者甚至将它们解决为用户可能是一个好主意。我只是想出了后者与'PkgDev.submit（pkgname）'是多么的舒服。 – tim

如何让Julia代码更高效？目前它的表现甚至比Python差

回答

相关问题