2010-12-01 52 views
22

说我有一个列表,像这样:拆分列表为嵌套的列表上的值

[1, 4, None, 6, 9, None, 3, 9, 4 ] 

我决定把它们分割嵌套的列表上None,得到这个:

[ [ 1, 4 ], [ 6, 9 ], [ 3, 9, 4 ] ] 

中当然,我也一直想做到这一点就在这种情况下(9, None),我们会得到:

[ [ 1, 4 ], [ 6 ], [ 3 ], [ 4 ] ] 

这为t rivial使用列表附加迭代(在一个for循环)

我很想知道这是否可以做更快的事情 - 就像列表理解?

如果不是,为什么不(例如,列表理解不能返回每次迭代多个列表元素?)

+0

:-D。没有与它真的。我只是问自己这个问题,不能快速提出一些问题。 – PoorLuzer 2010-12-01 09:35:40

回答

32
>>> def isplit(iterable,splitters): 
    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k] 

>>> isplit(L,(None,)) 
[[1, 4], [6, 9], [3, 9, 4]] 
>>> isplit(L,(None,9)) 
[[1, 4], [6], [3], [4]] 

基准码:上

import timeit  

kabie=("isplit_kabie", 
""" 
import itertools 
def isplit_kabie(iterable,splitters): 
    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k] 
""") 

ssplit=("ssplit", 
""" 
def ssplit(seq,splitters): 
    seq=list(seq) 
    if splitters and seq: 
     result=[] 
     begin=0 
     for end in range(len(seq)): 
      if seq[end] in splitters: 
       if end > begin: 
        result.append(seq[begin:end]) 
       begin=end+1 
     if begin<len(seq): 
      result.append(seq[begin:]) 
     return result 
    return [seq] 
""") 

ssplit2=("ssplit2", 
""" 
def ssplit2(seq,splitters): 
    seq=list(seq) 
    if splitters and seq: 
     splitters=set(splitters).intersection(seq) 
     if splitters: 
      result=[] 
      begin=0 
      for end in range(len(seq)): 
       if seq[end] in splitters: 
        if end > begin: 
         result.append(seq[begin:end]) 
        begin=end+1 
      if begin<len(seq): 
       result.append(seq[begin:]) 
      return result 
    return [seq] 
""") 

emile=("magicsplit", 
""" 
def _itersplit(l, *splitters): 
    current = [] 
    for item in l: 
     if item in splitters: 
      yield current 
      current = [] 
     else: 
      current.append(item) 
    yield current 

def magicsplit(l, splitters): 
    return [subl for subl in _itersplit(l, *splitters) if subl] 
""") 

emile_improved=("magicsplit2", 
""" 
def _itersplit(l, *splitters): 
    current = [] 
    for item in l: 
     if item in splitters: 
      if current: 
       yield current 
       current = [] 
     else: 
      current.append(item) 
    if current: 
     yield current 

def magicsplit2(l, splitters): 
    if splitters and l: 
     return [i for i in _itersplit(l, *splitters)] 
    return [list(l)] 
""") 

karl=("ssplit_karl", 
""" 
def ssplit_karl(original,splitters): 
    indices = [i for (i, x) in enumerate(original) if x in splitters] 
    ends = indices + [len(original)] 
    begins = [0] + [x + 1 for x in indices] 
    return [original[begin:end] for (begin, end) in zip(begins, ends)] 
""") 

ryan=("split_on", 
""" 
from functools import reduce 
def split_on (seq, delims, remove_empty=True): 
    '''Split seq into lists using delims as a delimiting elements. 

    For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ]. 

    delims can be either a single delimiting element or a list or 
    tuple of multiple delimiting elements. If you wish to use a list 
    or tuple as a delimiter, you must enclose it in another list or 
    tuple. 

    If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist''' 
    delims=set(delims) 
    def reduce_fun(lists, elem): 
     if elem in delims: 
      if remove_empty and lists[-1] == []: 
       # Avoid adding multiple empty lists 
       pass 
      else: 
       lists.append([]) 
     else: 
      lists[-1].append(elem) 
     return lists 
    result_list = reduce(reduce_fun, seq, [ [], ]) 
    # Maybe remove trailing empty list 
    if remove_empty and result_list[-1] == []: 
     result_list.pop() 
    return result_list 
""") 

cases=(kabie, emile, emile_improved, ssplit ,ssplit2 ,ryan) 

data=(
    ([1, 4, None, 6, 9, None, 3, 9, 4 ],(None,)), 
    ([1, 4, None, 6, 9, None, 3, 9, 4 ]*5,{None,9,7}), 
    ((),()), 
    (range(1000),()), 
    ("Split me",('','')), 
    ("split me "*100,' '), 
    ("split me,"*100,' ,'*20), 
    ("split me, please!"*100,' ,!'), 
    (range(100),range(100)), 
    (range(100),range(101,1000)), 
    (range(100),range(50,150)), 
    (list(range(100))*30,(99,)), 
    ) 

params="seq,splitters" 

def benchmark(func,code,data,params='',times=10000,rounds=3,debug=''): 
    assert(func.isidentifier()) 
    tester = timeit.Timer(stmt='{func}({params})'.format(
           func=func,params=params), 
          setup="{code}\n".format(code=code)+ 
      (params and "{params}={data}\n".format(params=params,data=data)) + 
      (debug and """ret=repr({func}({params})) 
print({func}.__name__.rjust(16),":",ret[:30]+"..."+ret[-15:] if len(ret)>50 else ret) 
         """.format(func=func,params=params))) 
    results = [tester.timeit(times) for i in range(rounds)] 
    if not debug: 
     print("{:>16s} takes:{:6.4f},avg:{:.2e},best:{:.4f},worst:{:.4f}".format(
      func,sum(results),sum(results)/times/rounds,min(results),max(results))) 

def testAll(cases,data,params='',times=10000,rounds=3,debug=''): 
    if debug: 
     times,rounds = 1,1 
    for dat in data: 
     sdat = tuple(map(repr,dat)) 
     print("{}x{} times:".format(times,rounds), 
       ','.join("{}".format(d[:8]+"..."+d[-5:] if len(d)>16 else d)for d in map(repr,dat))) 
     for func,code in cases: 
      benchmark(func,code,dat,params,times,rounds,debug) 

if __name__=='__main__': 
    testAll(cases,data,params,500,10)#,debug=True) 

输出i3-530处理器,Windows7中的Python 3.1.2:

500x10 times: [1, 4, N...9, 4],(None,) 
    isplit_kabie takes:0.0605,avg:1.21e-05,best:0.0032,worst:0.0074 
     magicsplit takes:0.0287,avg:5.74e-06,best:0.0016,worst:0.0036 
    magicsplit2 takes:0.0174,avg:3.49e-06,best:0.0017,worst:0.0018 
      ssplit takes:0.0149,avg:2.99e-06,best:0.0015,worst:0.0016 
     ssplit2 takes:0.0198,avg:3.96e-06,best:0.0019,worst:0.0021 
     split_on takes:0.0229,avg:4.59e-06,best:0.0023,worst:0.0024 
500x10 times: [1, 4, N...9, 4],{9, None, 7} 
    isplit_kabie takes:0.1448,avg:2.90e-05,best:0.0144,worst:0.0146 
     magicsplit takes:0.0636,avg:1.27e-05,best:0.0063,worst:0.0065 
    magicsplit2 takes:0.0891,avg:1.78e-05,best:0.0064,worst:0.0162 
      ssplit takes:0.0593,avg:1.19e-05,best:0.0058,worst:0.0061 
     ssplit2 takes:0.1004,avg:2.01e-05,best:0.0069,worst:0.0142 
     split_on takes:0.0929,avg:1.86e-05,best:0.0090,worst:0.0096 
500x10 times:(),() 
    isplit_kabie takes:0.0041,avg:8.14e-07,best:0.0004,worst:0.0004 
     magicsplit takes:0.0040,avg:8.04e-07,best:0.0004,worst:0.0004 
    magicsplit2 takes:0.0022,avg:4.35e-07,best:0.0002,worst:0.0002 
      ssplit takes:0.0023,avg:4.59e-07,best:0.0002,worst:0.0003 
     ssplit2 takes:0.0023,avg:4.53e-07,best:0.0002,worst:0.0002 
     split_on takes:0.0072,avg:1.45e-06,best:0.0007,worst:0.0009 
500x10 times: range(0, 1000),() 
    isplit_kabie takes:0.8892,avg:1.78e-04,best:0.0881,worst:0.0895 
     magicsplit takes:0.6614,avg:1.32e-04,best:0.0654,worst:0.0673 
    magicsplit2 takes:0.0958,avg:1.92e-05,best:0.0094,worst:0.0099 
      ssplit takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0095 
     ssplit2 takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0096 
     split_on takes:1.3348,avg:2.67e-04,best:0.1328,worst:0.1340 
500x10 times: 'Split me',('', '') 
    isplit_kabie takes:0.0234,avg:4.68e-06,best:0.0023,worst:0.0024 
     magicsplit takes:0.0126,avg:2.52e-06,best:0.0012,worst:0.0013 
    magicsplit2 takes:0.0138,avg:2.76e-06,best:0.0013,worst:0.0015 
      ssplit takes:0.0119,avg:2.39e-06,best:0.0012,worst:0.0012 
     ssplit2 takes:0.0075,avg:1.50e-06,best:0.0007,worst:0.0008 
     split_on takes:0.0191,avg:3.83e-06,best:0.0018,worst:0.0023 
500x10 times: 'split m... me ',' ' 
    isplit_kabie takes:2.0803,avg:4.16e-04,best:0.2060,worst:0.2098 
     magicsplit takes:0.9219,avg:1.84e-04,best:0.0920,worst:0.0925 
    magicsplit2 takes:1.0221,avg:2.04e-04,best:0.1018,worst:0.1034 
      ssplit takes:0.8294,avg:1.66e-04,best:0.0818,worst:0.0834 
     ssplit2 takes:0.9911,avg:1.98e-04,best:0.0983,worst:0.1014 
     split_on takes:1.5672,avg:3.13e-04,best:0.1543,worst:0.1694 
500x10 times: 'split m... me,',' , , , ... , ,' 
    isplit_kabie takes:2.1847,avg:4.37e-04,best:0.2164,worst:0.2275 
     magicsplit takes:3.7135,avg:7.43e-04,best:0.3693,worst:0.3783 
    magicsplit2 takes:3.8104,avg:7.62e-04,best:0.3795,worst:0.3884 
      ssplit takes:0.9522,avg:1.90e-04,best:0.0939,worst:0.0956 
     ssplit2 takes:1.0140,avg:2.03e-04,best:0.1009,worst:0.1023 
     split_on takes:1.5747,avg:3.15e-04,best:0.1563,worst:0.1615 
500x10 times: 'split m...ase!',' ,!' 
    isplit_kabie takes:3.3443,avg:6.69e-04,best:0.3324,worst:0.3380 
     magicsplit takes:2.0594,avg:4.12e-04,best:0.2054,worst:0.2076 
    magicsplit2 takes:2.1850,avg:4.37e-04,best:0.2180,worst:0.2191 
      ssplit takes:1.4881,avg:2.98e-04,best:0.1484,worst:0.1493 
     ssplit2 takes:1.8779,avg:3.76e-04,best:0.1868,worst:0.1920 
     split_on takes:2.9596,avg:5.92e-04,best:0.2946,worst:0.2980 
500x10 times: range(0, 100),range(0, 100) 
    isplit_kabie takes:0.9445,avg:1.89e-04,best:0.0933,worst:0.1023 
     magicsplit takes:0.5878,avg:1.18e-04,best:0.0583,worst:0.0593 
    magicsplit2 takes:0.5597,avg:1.12e-04,best:0.0554,worst:0.0588 
      ssplit takes:0.8568,avg:1.71e-04,best:0.0852,worst:0.0874 
     ssplit2 takes:0.1399,avg:2.80e-05,best:0.0121,worst:0.0242 
     split_on takes:0.1462,avg:2.92e-05,best:0.0145,worst:0.0148 
500x10 times: range(0, 100),range(101, 1000) 
    isplit_kabie takes:19.9749,avg:3.99e-03,best:1.9789,worst:2.0330 
     magicsplit takes:9.4997,avg:1.90e-03,best:0.9369,worst:0.9640 
    magicsplit2 takes:9.4394,avg:1.89e-03,best:0.9267,worst:0.9665 
      ssplit takes:19.2363,avg:3.85e-03,best:1.8936,worst:1.9516 
     ssplit2 takes:0.2032,avg:4.06e-05,best:0.0201,worst:0.0205 
     split_on takes:0.3329,avg:6.66e-05,best:0.0323,worst:0.0344 
500x10 times: range(0, 100),range(50, 150) 
    isplit_kabie takes:1.1394,avg:2.28e-04,best:0.1130,worst:0.1153 
     magicsplit takes:0.7288,avg:1.46e-04,best:0.0721,worst:0.0760 
    magicsplit2 takes:0.7220,avg:1.44e-04,best:0.0705,worst:0.0774 
      ssplit takes:1.0835,avg:2.17e-04,best:0.1059,worst:0.1116 
     ssplit2 takes:0.1092,avg:2.18e-05,best:0.0105,worst:0.0116 
     split_on takes:0.1639,avg:3.28e-05,best:0.0162,worst:0.0168 
500x10 times: [0, 1, 2..., 99],(99,) 
    isplit_kabie takes:3.2579,avg:6.52e-04,best:0.3225,worst:0.3360 
     magicsplit takes:2.2937,avg:4.59e-04,best:0.2274,worst:0.2344 
    magicsplit2 takes:2.6054,avg:5.21e-04,best:0.2587,worst:0.2642 
      ssplit takes:1.5251,avg:3.05e-04,best:0.1495,worst:0.1729 
     ssplit2 takes:1.7298,avg:3.46e-04,best:0.1696,worst:0.1858 
     split_on takes:4.1041,avg:8.21e-04,best:0.4033,worst:0.4291 

稍加修改Ryan的代码,希望大家不要介意。 ssplit是基于卡尔的想法。添加了一些处理特殊情况的语句,成为ssplit2,这是我可能提供的最佳解决方案。

+1

+1优雅。 – 2010-12-01 09:27:01

+0

很好! +1 – mshsayem 2010-12-01 09:28:02

2

你可以找到“分隔符”元素的索引。例如,None的索引是2和5(从零开始)。 您可以使用这些以及列表的长度来构造表示子列表的开始索引和结束索引的元组列表[ (0,1), (3,4), (6,8) ]。然后你可以使用列表理解来覆盖这个元组列表来提取子列表。

5

事情是这样的:

def _itersplit(l, splitters): 
    current = [] 
    for item in l: 
     if item in splitters: 
      yield current 
      current = [] 
     else: 
      current.append(item) 
    yield current 

def magicsplit(l, *splitters): 
    return [subl for subl in _itersplit(l, splitters) if subl] 

打动了我:

>>> l = [1, 4, None, 6, 9, None, 3, 9, 4 ] 
>>> magicsplit(l, None) 
[[1, 4], [6, 9], [3, 9, 4]] 
>>> magicsplit(l, None, 9) 
[[1, 4], [6], [3], [4]] 
1

与试图用一个列表理解这样做的问题是,理解本质上是无状态的,你需要的状态进行分裂操作。特别是,您需要记住,从一个元素到下一个元素,在先前的“分割”标记之后找到了哪些元素。

但是,您可以使用一个列表理解来提取拆分元素的索引,然后使用这些索引来切分列表。我们需要将分割指数转化为(开始,结束)指数以确定必要的“部分”。我们要做的是将分割索引列表转换为两个单独的“开始”和“结束”列表,然后将它们压缩在一起。

整个事情是这样的:

splitters = (9, None) # for example 
indices = [i for (i, x) in enumerate(original) if x in splitters] 
ends = indices + [len(original)] 
begins = [0] + [x + 1 for x in indices] 
result = [original[begin:end] for (begin, end) in zip(begins, ends)] 
1

下面是一个使用减少的实现,有一些花里胡哨:

#!/usr/bin/env python 

def split_on (delims, seq, remove_empty=True): 
    '''Split seq into lists using delims as a delimiting elements. 

    For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ]. 

    delims can be either a single delimiting element or a list or 
    tuple of multiple delimiting elements. If you wish to use a list 
    or tuple as a delimiter, you must enclose it in another list or 
    tuple. 

    If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist''' 
    if type(delims) not in (type(list()), type(tuple())): 
     delims = (delims,) 
    def reduce_fun(lists, elem): 
     if elem in delims: 
      if remove_empty and lists[-1] == []: 
       # Avoid adding multiple empty lists 
       pass 
      else: 
       lists.append([]) 
     else: 
      lists[-1].append(elem) 
     return lists 
    result_list = reduce(reduce_fun, seq, [ [], ]) 
    # Maybe remove trailing empty list 
    if remove_empty and result_list[-1] == []: 
     result_list.pop() 
    return result_list 

mylist = [1, 4, None, 6, 9, None, 3, 9, 4 ] 

print(split_on(None, mylist)) 
print(split_on((9, None), mylist))