2014-09-19 62 views
1

我有一个准备迁移到Access的Excel电子表格,其中日期列的条目有多种格式,例如:1963 to 1969、Aug. 1968 to Sept. 1968、1972、Mar-73、24-Jul、Oct. 2 1980、Aug 29, 1980、July 1946 等,还有“undated”(未注明日期)。我把作为键的列(地图编号)和日期列导出到csv,然后再写回csv。我能提取出4位数的年份,但处理不了范围。我也不知道怎样才能在不手工重新格式化的情况下提取“日”和两位数的年份。我的代码不是很优雅,可能也不是最佳做法:日期到类别

import csv, xlwt, re 

# create new Excel document and add sheet 
# from tempfile import TemporaryFile 
from xlwt import Workbook 
# Compiled once; matches the first run of four consecutive digits.
FOUR_DIGITS = re.compile(r'\d\d\d\d')


def year_from_entry(date_raw):
    """Return the value to store in the "Year" column for one raw date entry.

    date_raw -- the date text exactly as it was entered in the spreadsheet.

    Returns '0000' for the literal entry 'undated', 'NoEntry' for an empty
    entry, the first four-digit run when one exists, and 'Format' otherwise
    (a flag meaning the entry needs manual clean-up).

    Exactly one value is returned per entry, so 'undated' and blank entries
    are no longer also reported as 'Format'.
    """
    if date_raw == 'undated':
        return '0000'
    if date_raw == '':
        return 'NoEntry'
    found = FOUR_DIGITS.search(date_raw)
    if found is None:
        return 'Format'
    return found.group()


def main():
    """Read map number / raw date pairs from the CSV and write the .xls sheet."""
    # create new Excel document and add sheet
    book = Workbook()
    sheet1 = book.add_sheet('Sheet 1')

    # populate first row with header
    for col, title in enumerate(("Year", "Map", "As Entered")):
        sheet1.write(0, col, title)

    # index of the last sheet row written (row 0 is the header)
    row_count = 0

    # Raw strings keep the backslashes in the Windows paths literal.
    # 'rb' is the mode the Python 2 csv module expects.
    with open(r'C:\dateTestMSDOs.csv', 'rb') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for a blank line; nothing to record
                continue

            map_number = row[0]  # first column is the map number
            # second column is the raw date; a missing column counts as blank
            date_raw = row[1] if len(row) > 1 else ''

            row_count += 1
            sheet1.write(row_count, 0, year_from_entry(date_raw))
            sheet1.write(row_count, 1, map_number)
            sheet1.write(row_count, 2, date_raw)

    book.save(r'D:\dateProperty.xls')
    print("Done!")


if __name__ == "__main__":
    main()

我还想把日和月写入额外的列,并提取范围条目中的第二个4位数年份。

+0

你看过用于处理日期格式的[datetime](https://docs.python.org/2/library/datetime.html)模块吗? – N1B4 2014-09-19 22:50:00

+0

[Pandas](http://pandas.pydata.org)具有强大的日期分析功能。我会看看这个。 – b10n 2014-09-19 23:17:43

回答

1

您可以尝试使用dateutil。我认为你仍然需要以不同的方式处理一些更难的格式。请参阅下面的实现:

代码:

import dateutil.parser as dateparser 

# Sample raw entries, one per date layout seen in the spreadsheet.
date_list = ['1963 to 1969',
             'Aug. 1968 to Sept. 1968',
             'Mar-73',
             '24-Jul',
             'Oct. 2 1980',
             'Aug 29, 1980',
             'July 1946',
             'undated']

for d in date_list:
    # Split on ' to ' with the surrounding spaces: a bare 'to' would also
    # match inside words such as "October".
    if ' to ' in d:
        a, b = d.split(' to ', 1)
        # Get the higher number. Use min to get lower of two.
        print(max(dateparser.parse(a.strip()).year, dateparser.parse(b.strip()).year))
    elif d == 'undated':
        print('0000')
    else:
        # Components missing from the string are supplied by the parser
        # (see the 2014 printed for '24-Jul').
        yr = dateparser.parse(d).year
        print(yr)

结果:

1969 
1968 
1973 
2014 
1980 
1980 
1946 
0000 
[Finished in 0.4s] 

我能看到的唯一突出问题是24-Jul会返回2014年的日期,因为解析器会用当前日期的日、月或年来代替缺失的部分;例如,如果今天是本月20日,Mar-73就会被解析为1973-03-20,等等。

+0

绝对是我见过的最好的处理方式,比我的代码更好。我希望当我们将焦点改变为不使日期和月份成为可搜索参数时,我会记得您的帖子。 – 2014-11-20 15:05:55

+0

不可否认,我会用'pandas'来代替,但是如果没有访问你的数据,这是我能想到的最好的。 – Manhattan 2014-11-20 15:08:35

0

不完全确定这是不是你想要的,但我只是用了一个“简单”的正则表达式进行搜索,然后遍历各组捕获分组,对匹配上的那一组调用对应的函数。如果找到匹配,被调用的函数(定义在regex_groups变量中)会返回带有以下键的字典:start_day, start_month, start_year, end_day, end_month, end_year

然后,您可以对这些值进行任何操作。据我所知,绝对不是最干净的解决方案,但它起作用。

#!/usr/local/bin/python2.7 

import re 

# Crazy regex: one alternation branch per supported date layout.  Capture
# groups are numbered left to right across the whole pattern; those numbers
# are the keys of regex_groups below.  Raw string so the regex escapes reach
# the re module untouched.
regex_pattern = r'(?:(\d{4}) to (\d{4}))|(?:(\w+)\. (\d{4}) to (\w+)\. (\d{4}))|(?:(\w+)-(\d{2}))|(?:(\d{2})-(\w+))|(?:(\w+)\. (\d+), (\d{4}))|(?:(\w+) (\d+), (\d{4}))|(?:(\w+) (\d{4}))|(?:(\d{4}))'

date_strings = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    '1972',
    'Mar-73',
    '24-Jul',
    'Oct. 2, 1980',
    'Aug 29, 1980',
    'July 1946',
]

# Here you set the group matching functions that will be called for a matching group.
# Key: the capture-group numbers belonging to one branch of regex_pattern.
# Value: a function taking the list of captured strings for those groups (in
# key order) and returning a dictionary with the keys start_day, start_month,
# start_year, end_day, end_month and end_year.  A field the branch does not
# provide is set to a blank string.
regex_groups = {
    (1, 2): lambda group_matches: {
        'start_day': '', 'start_month': '', 'start_year': group_matches[0],
        'end_day': '', 'end_month': '', 'end_year': group_matches[1]
    },
    (3, 4, 5, 6): lambda group_matches: {
        'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
        'end_day': '', 'end_month': group_matches[2], 'end_year': group_matches[3]
    },
    (7, 8): lambda group_matches: {
        'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    # NOTE(review): for '24-Jul' group 9 captures '24' and group 10 captures
    # 'Jul', so the digits land in start_year and the month name in start_day.
    # Left unchanged because the sample output published with this snippet
    # reflects it -- confirm the intended mapping before relying on it.
    (9, 10): lambda group_matches: {
        'start_day': group_matches[1], 'start_month': '', 'start_year': group_matches[0],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (11, 12, 13): lambda group_matches: {
        'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (14, 15, 16): lambda group_matches: {
        'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (17, 18): lambda group_matches: {
        'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
    (19,): lambda group_matches: {
        'start_day': '', 'start_month': '', 'start_year': group_matches[0],
        'end_day': '', 'end_month': '', 'end_year': ''
    },
}


def parse_date_string(ds):
    """Return the field dictionary for the date string *ds*, or None.

    The string is searched with regex_pattern; the entry of regex_groups
    whose capture groups all matched builds the result.  None is returned
    when no branch of the pattern matches (for example 'undated').
    """
    matches = re.search(regex_pattern, ds)
    if matches is None:
        return None

    for regex_group, group_func in regex_groups.items():
        group_matches = [matches.group(sub_group_num) for sub_group_num in regex_group]
        # Only the branch that actually matched has every group set.
        if all(group_matches):
            return group_func(group_matches)
    return None


for ds in date_strings:
    match_data = parse_date_string(ds)
    if match_data is None:
        # Report instead of crashing on a layout the pattern does not cover.
        print('')
        print('Unmatched: %s' % ds)
        continue

    start = '-'.join([match_data['start_day'], match_data['start_month'], match_data['start_year']])
    end = '-'.join([match_data['end_day'], match_data['end_month'], match_data['end_year']])

    # Single-argument print calls give the same output on Python 2 and 3.
    print('')
    print('Matched: %s' % ds)
    print('%s to %s' % (start, end))

输出:

Matched: 1963 to 1969 
--1963 to --1969 

Matched: Aug. 1968 to Sept. 1968 
-Aug-1968 to -Sept-1968 

Matched: 1972 
--1972 to -- 

Matched: Mar-73 
-Mar-73 to -- 

Matched: 24-Jul 
Jul--24 to -- 

Matched: Oct. 2, 1980 
2-Oct-1980 to -- 

Matched: Aug 29, 1980 
29-Aug-1980 to -- 

Matched: July 1946 
-July-1946 to -- 
+0

谢谢布莱斯。在时间不足之前,这是我想去的地方,工作参数改变了焦点。不过,我会保留这些代码。我有一堆令人讨厌的数据要在几个月内清理干净,而不是更有效的方法,这将完全适应。 – 2014-11-20 14:47:58

0

你可以用正则表达式定义日期所有可能的情况,像这样:

import re 
# Sample raw date strings, one per layout, used to exercise the extraction helpers.
s = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    '1972',
    'Mar-73',
    '03-Jun',
    '24-Jul',
    'Oct. 2, 1980',
    'Oct. 26, 1980',
    'Aug 29 1980',
    'July 1946',
]


def get_year(date):
    """Return the year token(s) found in *date* as a list of strings, or None.

    Every run of four digits is returned, so a range such as
    '1963 to 1969' yields two entries.  When there is no four-digit run, a
    two-digit year that follows a ``word-`` prefix (as in 'Mar-73') is
    returned as a one-element list.  None means no year was recognised.
    """
    # Raw strings: "\d" in a normal literal is an invalid escape sequence.
    four_digit = re.findall(r"\d{4}", date)
    if four_digit:
        return four_digit

    two_digit = re.search(r"\w+-(\d{2})", date)
    if two_digit:
        return [two_digit.group(1)]

    return None

def get_month(date):
    """Return every capitalised word in *date* as a list, or None if there is none.

    A "capitalised word" is one upper-case letter followed by one or more
    lower-case letters; the caller treats these as month names.
    """
    return re.findall("[A-Z][a-z]+", date) or None

def get_day(date):
    """Return the day of the month found in *date* as a one-element list, or None.

    Two layouts are recognised, tried in this order:
      * one or two digits followed by '-' and a capitalised word ('24-Jul');
      * a capitalised word, dots/spaces, then one or two digits followed by
        a comma ('Oct. 2, 1980').

    The comma in the second layout is required, so 'Aug 29 1980' yields None.
    """
    # Raw strings: "\d" in a normal literal is an invalid escape sequence.
    day_patterns = (
        r"(\d{1,2})-[A-Z][a-z]+",
        r"[A-Z][a-z]+[. ]+(\d{1,2}),",
    )
    for pattern in day_patterns:
        found = re.search(pattern, date)
        if found:
            return [found.group(1)]
    return None

# Day, month and year tokens for every sample string, keyed by the string's
# position in s.  Each value is a list of strings, or None when nothing matched.
d = {idx: get_day(date) for idx, date in enumerate(s)}
m = {idx: get_month(date) for idx, date in enumerate(s)}
y = {idx: get_year(date) for idx, date in enumerate(s)}

# Single-argument print calls behave the same on Python 2 and Python 3; the
# two spaces after the colon reproduce what `print "Label: ", value` emitted.
print("Year Dict:  %s" % (y,))
print("Month Dict:  %s" % (m,))
print("Day Dict:  %s" % (d,))

这样就得到了日、月、年三个字典,可以用它们来填充各行。

输出:

Year Dict: {0: ['1963', '1969'], 1: ['1968', '1968'], 2: ['1972'], 3: ['73'], 4: None, 5: None, 6: ['1980'], 7: ['1980'], 8: ['1980'], 9: ['1946']} 
Month Dict: {0: None, 1: ['Aug', 'Sept'], 2: None, 3: ['Mar'], 4: ['Jun'], 5: ['Jul'], 6: ['Oct'], 7: ['Oct'], 8: ['Aug'], 9: ['July']} 
Day Dict: {0: None, 1: None, 2: None, 3: None, 4: ['03'], 5: ['24'], 6: ['2'], 7: ['26'], 8: None, 9: None} 
+0

嘿,谢谢你的建议!我想我还是把我那段笨拙但能用的代码贴出来。输出的.xls作为LU表(查找表)导入到我们的数据库中,工作正常。缺点是任何空格或多余的字符都会破坏它,但好处是它强制原始数据录入符合数据库中单一可搜索年份类别的输出格式(这是我们必须解决的问题,因为我们涉及范围、日期、月份等)。 – 2014-11-20 04:45:09

0

谢谢你的创新建议。经过考虑,我们决定从数据库中的可搜索内容中删除日期和月份,因为只有相对少量的数据具有该详细程度。这里是我用来从一个漫长而杂乱的列表中提取和生成我需要的数据的代码。

import csv, xlwt, re 
# create new Excel document and add sheet 
from xlwt import Workbook 
# Compiled once; matches the first run of four consecutive digits.
FOUR_DIGIT_YEAR = re.compile(r'\d\d\d\d')


def parse_map_year(date_raw):
    """Return the value for the "MapYear_(Parsed)" column of one raw date entry.

    date_raw -- the date text exactly as it was entered.

    Returns 'undated' for the literal entry 'undated', 'NoEntry' for an empty
    entry, the first four-digit run when one exists, and 'Format' otherwise
    (a flag meaning the entry needs manual clean-up).

    The result depends only on date_raw.  No state is carried over from the
    previous row, so a blank entry always yields exactly one 'NoEntry' value.
    """
    if date_raw == 'undated':
        return 'undated'
    if date_raw == '':
        return 'NoEntry'
    found = FOUR_DIGIT_YEAR.search(date_raw)
    return found.group() if found else 'Format'


def main():
    """Read map number / raw date pairs from the CSV and write the .xls sheet."""
    book = Workbook()
    sheet1 = book.add_sheet('Sheet 1')

    # populate first row with header
    for col, title in enumerate(("MapYear_(Parsed)", "Map_Number", "As_Entered")):
        sheet1.write(0, col, title)

    # index of the last sheet row written (row 0 is the header)
    row_count = 0

    # Raw strings keep the backslashes in the Windows paths literal.
    # 'rb' is the mode the Python 2 csv module expects.
    with open(r'C:\mapsDateFix.csv', 'rb') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for a blank line; nothing to record
                continue

            map_number = row[0]  # first column is the map number
            # second column is the raw date; a missing column counts as blank
            date_raw = row[1] if len(row) > 1 else ''

            row_count += 1
            sheet1.write(row_count, 0, parse_map_year(date_raw))
            sheet1.write(row_count, 1, map_number)
            sheet1.write(row_count, 2, date_raw)

    book.save(r'D:\dateProperty.xls')
    print("Done!")


if __name__ == "__main__":
    main()