2011-09-29 38 views
0

标题:Python:尝试重构(DRY 化)一段很长的控制流程

我通过SQL查询获取大量数据,而这个查询需要很长时间才能运行。正因为查询很慢,我只以最细粒度的形式从数据库中取一次数据,然后对这些数据循环一遍,把它聚合成对我有用的形式。

我的问题是,我一遍又一遍地重复自己。但是,我不确定重构此控制流的最佳方式。提前致谢!

def processClickOutData(cls, raw_data):
    """Aggregate raw click-out rows into per-gap and per-position totals.

    Each row of ``raw_data`` is read by index as
    ``(gap, count, tid, prefered, channel, position)``.  ``count`` is summed
    into six groupings: by gap, by gap and tid, by gap and channel, by
    position, by position and tid, and by position and channel.  A second
    set of the same six groupings (the ``*_true`` names) is updated only for
    rows where ``prefered == 0``.

    ``cls`` is accepted but never used in the body.

    Returns a dict with the keys ``singles``, ``singles_true``, ``total``,
    ``total_true``, ``absolute_total``, ``absolute_total_true``,
    ``channel_totals``, ``channel_totals_true``, ``list_channels``,
    ``list_tids``, ``total_position``, ``total_position_true``,
    ``tid_position``, ``tid_position_true``, ``channel_position`` and
    ``channel_position_true``.

    NOTE(review): ``dict.has_key()`` exists only in Python 2; under
    Python 3 every ``has_key`` call below raises AttributeError.
    """
    # Accumulators over all rows.
    singles = {}          # gap -> tid -> summed count
    total={}              # gap -> summed count
    absolute_total = 0    # sum of count over every row
    channels = {}         # gap -> channel -> summed count

    # Same shapes as above, but only fed by rows with prefered == 0.
    singles_true = {}
    total_true={}
    channels_true = {}
    absolute_total_true = 0

    # Distinct channel and tid values seen in the input.
    list_channels = set([])
    list_tids = set([])


    # Position-keyed counterparts of the gap-keyed accumulators.
    total_position = {}           # position -> summed count
    total_position_true = {}
    tid_position = {}             # position -> tid -> summed count
    channel_position = {}         # position -> channel -> summed count
    channel_position_true = {}
    tid_position_true = {}

    for row in raw_data:
     # Positional row layout; see the docstring.
     gap=row[0]
     count=row[1]
     tid=row[2]
     prefered=row[3]
     channel=row[4]
     position=row[5]

     list_channels.add(channel)
     list_tids.add(tid)


     # NOTE(review): this is the only accumulator that converts count with
     # int(); all the others add count unconverted.
     absolute_total += int(count)

     # total: gap -> count
     if total.has_key(gap):
      total[gap] += count
     else:
      total[gap] = count

     # singles: gap -> tid -> count (inner dict created on first use)
     if singles.has_key(gap) and singles[gap].has_key(tid):
      singles[gap][tid] += count
     elif singles.has_key(gap):
      singles[gap][tid] = count
     else:
      singles[gap] = {}
      singles[gap][tid] = count

     # channels: gap -> channel -> count
     if channels.has_key(gap) and channels[gap].has_key(channel):
      channels[gap][channel] += count
     elif channels.has_key(gap):
      channels[gap][channel] = count
     else:
      channels[gap] = {}
      channels[gap][channel] = count
     # total_position: position -> count
     if total_position.has_key(position):
      total_position[position] += count
     else:
      total_position[position] = count
     # tid_position: position -> tid -> count
     if tid_position.has_key(position) and tid_position[position].has_key(tid):
      tid_position[position][tid] += count
     elif tid_position.has_key(position):
      tid_position[position][tid] = count
     else:
      tid_position[position] = {}
      tid_position[position][tid] = count

     # channel_position: position -> channel -> count
     if channel_position.has_key(position) and channel_position[position].has_key(channel):
      channel_position[position][channel] += count
     elif channel_position.has_key(position):
      channel_position[position][channel] = count
     else:
      channel_position[position] = {}
      channel_position[position][channel] = count

     # Repeat all six aggregations into the *_true accumulators, but only
     # for rows whose prefered field equals 0.
     if prefered == 0:
      absolute_total_true += count
      if total_true.has_key(gap):
       total_true[gap] += count
      else:
       total_true[gap] = count

      if singles_true.has_key(gap) and singles_true[gap].has_key(tid):
       singles_true[gap][tid] += count
      elif singles_true.has_key(gap):
       singles_true[gap][tid] = count
      else:
       singles_true[gap] = {}
       singles_true[gap][tid] = count

      if channels_true.has_key(gap) and channels_true[gap].has_key(channel):
       channels_true[gap][channel] += count
      elif channels_true.has_key(gap):
       channels_true[gap][channel] = count
      else:
       channels_true[gap] = {}
       channels_true[gap][channel] = count

      if total_position_true.has_key(position):
       total_position_true[position] += count
      else:
       total_position_true[position] = count

      if tid_position_true.has_key(position) and tid_position_true[position].has_key(tid):
       tid_position_true[position][tid] += count
      elif tid_position_true.has_key(position):
       tid_position_true[position][tid] = count
      else:
       tid_position_true[position] = {}
       tid_position_true[position][tid] = count

      if channel_position_true.has_key(position) and channel_position_true[position].has_key(channel):
       channel_position_true[position][channel] += count
      elif channel_position_true.has_key(position):
       channel_position_true[position][channel] = count
      else:
       channel_position_true[position] = {}
       channel_position_true[position][channel] = count




    # Bundle every accumulator under the key names callers look up.
    final_values = {"singles" : singles, "singles_true" : singles_true, "total" : total, "total_true": total_true, "absolute_total": absolute_total, "absolute_total_true": absolute_total_true, "channel_totals" : channels, "list_channels" : list_channels, "list_tids" : list_tids, "channel_totals_true" : channels_true,
        "total_position" : total_position, "total_position_true" : total_position_true, "tid_position" : tid_position, "channel_position" : channel_position, "tid_position_true" : tid_position_true, "channel_position_true" : channel_position_true }
    return final_values

回答

1

您用来存储数据的整个结构可能是有问题的,但由于我不知道您是如何使用这些数据的,因此在这一点上我无法帮助您。

您可以通过使用collections.defaultdict来摆脱全部这些has_key()调用。注意thedict.has_key(key)已被弃用,您应该使用key in thedict来代替。

另外请注意我对for循环的改动——你可以直接在for语句中把每一行解包赋值给各个变量名,无需再单独逐个赋值。

from collections import defaultdict 

def processClickOutData(cls, raw_data):
    """Aggregate raw click-out rows into per-gap and per-position totals.

    Args:
        cls: Unused; kept so existing callers keep working.
        raw_data: Iterable of 6-item rows
            ``(gap, count, tid, prefered, channel, position)``.

    Returns:
        A dict of accumulators.  ``absolute_total`` / ``absolute_total_true``
        are plain sums of ``count``; ``list_channels`` / ``list_tids`` are
        sets of the distinct values seen; every other entry is a
        ``defaultdict`` keyed by gap or position (one or two levels deep).
        The ``*_true`` entries only include rows where ``prefered == 0``.

    Raises:
        ValueError: If a row does not contain exactly six items.
    """
    absolute_total = 0
    absolute_total_true = 0

    list_channels = set()
    list_tids = set()

    # One-level counters: a missing key starts at int() == 0.
    total = defaultdict(int)
    total_true = defaultdict(int)
    total_position = defaultdict(int)
    total_position_true = defaultdict(int)

    def defaultdict_int():
        # Factory for two-level counters: a missing outer key gets a fresh
        # inner counter whose own missing keys start at 0.
        return defaultdict(int)

    singles = defaultdict(defaultdict_int)
    singles_true = defaultdict(defaultdict_int)
    channels = defaultdict(defaultdict_int)
    channels_true = defaultdict(defaultdict_int)
    tid_position = defaultdict(defaultdict_int)
    tid_position_true = defaultdict(defaultdict_int)
    channel_position = defaultdict(defaultdict_int)
    channel_position_true = defaultdict(defaultdict_int)

    # Rows carry six fields; tid is the third one and must be unpacked here,
    # otherwise the unpacking fails and tid is undefined below.
    for gap, count, tid, prefered, channel, position in raw_data:
        list_channels.add(channel)
        list_tids.add(tid)

        absolute_total += count
        total[gap] += count
        singles[gap][tid] += count
        channels[gap][channel] += count
        total_position[position] += count
        tid_position[position][tid] += count
        channel_position[position][channel] += count

        # The *_true accumulators only see rows flagged with prefered == 0.
        if prefered == 0:
            absolute_total_true += count
            total_true[gap] += count
            singles_true[gap][tid] += count
            channels_true[gap][channel] += count
            total_position_true[position] += count
            tid_position_true[position][tid] += count
            channel_position_true[position][channel] += count

    final_values = {
        "singles": singles,
        "singles_true": singles_true,
        "total": total,
        "total_true": total_true,
        "absolute_total": absolute_total,
        "absolute_total_true": absolute_total_true,
        "channel_totals": channels,
        "list_channels": list_channels,
        "list_tids": list_tids,
        "channel_totals_true": channels_true,
        "total_position": total_position,
        "total_position_true": total_position_true,
        "tid_position": tid_position,
        "channel_position": channel_position,
        "tid_position_true": tid_position_true,
        "channel_position_true": channel_position_true,
    }
    return final_values

这样做的话,当键不存在时会自动填入正确的默认值。这里有两种情况:如果你累加的是int,你希望在键不存在时从0开始——int()正好返回0,因此使用defaultdict(int);如果值本身是一个用来累加int的字典,则需要一个返回defaultdict(int)的函数,这正是defaultdict_int的作用。

编辑:建议的替代字典结构:

# Suggested alternative layout: two nested counters instead of six flat dicts.
#   position_totals[position]['total']            -> summed count
#   position_totals[position]['tid'][tid]         -> summed count
#   position_totals[position]['channel'][channel] -> summed count
# gap_totals has the same shape, keyed by gap.
#
# The containers are deliberately NOT named `position` / `gap`: the loop
# below binds those names to the per-row values, which would overwrite the
# containers on the first iteration.
position_totals = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
gap_totals = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
absolute_total = 0

# Rows carry six fields; tid is the third and is needed for the 'tid' counters.
for gap, count, tid, prefered, channel, position in raw_data:
    absolute_total += count

    for bucket in (position_totals[position], gap_totals[gap]):
        # 'total' holds a plain int, unlike the nested 'tid'/'channel'
        # counters, so seed it with .get() instead of relying on the
        # defaultdict factory (which would create a nested counter).
        bucket['total'] = bucket.get('total', 0) + count
        bucket['tid'][tid] += count
        bucket['channel'][channel] += count

对_true版本也做同样的处理,这样你就从12个dict减少到了4个。

+0

你说我用来存储数据的整个结构是错误的,是什么意思?输出是正确的——我已经用执行相同功能的SQL查询核对过这些数据。 – Spencer

+0

@Peter我已经在我的答案中添加了一个例子。基本上,用十二个这样的字典非常混乱,而用四个就能达到同样的效果。 – agf

+0

谢谢!我对Python和编程一般都很陌生,所以这非常有帮助。我投了票,现在会接受。 – Spencer