我正在清理 pandas DataFrame 中的“PERCENTAGE_AFFECTED”列。该列包含整数范围(例如:“70-80”、“70 and 80”、“65 to 70”)。问题:检查 pandas DataFrame 中的字符串是否包含某个子字符串,并将其删除。
我想创建一个函数来清理所有这些值,并为每个条目计算出一个整数平均值。
下面这个函数可以正常工作 >>>
def _average_or_leading_value(parts):
    """Average the integer pieces of a split range, or fall back to the first piece.

    Args:
        parts: The strings obtained by splitting a cleaned value on a range
            separator (always at least two items).

    Returns:
        The truncated integer mean when every piece parses as an int.
        Otherwise the text before the first "+" of the first piece, or 0 when
        that text is purely alphabetic.
    """
    try:
        result = int(sum(int(part) for part in parts) / len(parts))
    except ValueError:
        # At least one piece is not a plain integer (e.g. "12.2+" coming from
        # "12.2+-5.2"), so keep what precedes the first "+" of the first piece.
        result = parts[0].split('+')[0]
    if str(result).isalpha():
        result = 0
    return result


def clean_split_range(row):
    """Reduce a messy PERCENTAGE_AFFECTED entry to a single estimate.

    Args:
        row: Any mapping-like object supporting ``row['PERCENTAGE_AFFECTED']``.

    Returns:
        An ``int`` when the value is a range joined by "-", "and" or "to"
        whose pieces are all integers (truncated mean), or when the relevant
        text is purely alphabetic (0). Otherwise a ``str``: the cleaned value,
        or the part of it selected by the "±" / "over" / "around" rules.
    """
    noise = '<>!,?":;() '
    # Drop punctuation and spaces first so that "70 and 80" becomes "70and80".
    initial_perc = ''.join(
        ch for ch in str(row['PERCENTAGE_AFFECTED']) if ch not in noise
    )

    # Separators are tried in this order; the first one present decides.
    for separator in ('-', 'and', 'to'):
        parts = initial_perc.split(separator)
        if len(parts) > 1:
            return _average_or_leading_value(parts)

    if '±' in initial_perc:
        # "12.2±5.2" -> keep the central value.
        return initial_perc.split('±')[0]
    if initial_perc.startswith('over'):
        return initial_perc.split('over')[1]
    if 'around' in initial_perc:
        return initial_perc.split('around')[1]
    if initial_perc.isalpha():
        return 0
    # Nothing special was recognised: hand back the cleaned text unchanged.
    return initial_perc
但是:我想简化这段代码,让它统一处理条目中包含“-”、“and”、“to”字符串的情况。我创建了一个列表(split_list),其中包含我希望用来拆分并删除的子字符串:
def new_clean_split_range(row):
    """Reduce a messy PERCENTAGE_AFFECTED entry to a single estimate.

    Args:
        row: Any mapping-like object supporting ``row['PERCENTAGE_AFFECTED']``.

    Returns:
        An ``int`` when the value is a range joined by one of the separators
        in ``split_list`` and every piece is an integer (truncated mean), or
        when the relevant text is purely alphabetic (0). Otherwise a ``str``:
        the cleaned value, or the part of it selected by the "±" / "over" /
        "around" rules.
    """
    chars = '<>!,?":;() '
    split_list = ['-', 'and', 'to']

    # Strip noise characters before any splitting, so the separators are
    # matched against the cleaned text ("70 and 80" -> "70and80").
    initial_perc = ''.join(
        c for c in str(row['PERCENTAGE_AFFECTED']) if c not in chars
    )

    for a in split_list:
        split_range = initial_perc.split(a)
        if len(split_range) < 2:
            continue
        try:
            final_perc = int(sum(map(int, split_range)) / len(split_range))
        except ValueError:
            # A piece is not a plain integer (e.g. "12.2+" from "12.2+-5.2"):
            # keep what precedes the first "+" of the first piece.
            final_perc = split_range[0].split('+')[0]
        if str(final_perc).isalpha():
            final_perc = 0
        # Stop at the first separator that matches, so a later separator that
        # is absent from the value cannot overwrite the result.
        break
    else:
        # No range separator is present in the value.
        if '±' in initial_perc:
            final_perc = initial_perc.split('±')[0]
        elif initial_perc.startswith('over'):
            final_perc = initial_perc.split('over')[1]
        elif 'around' in initial_perc:
            final_perc = initial_perc.split('around')[1]
        elif initial_perc.isalpha():
            final_perc = 0
        else:
            final_perc = initial_perc
    return final_perc
任何帮助我都将不胜感激 :)
请提供“initial_perc”的所有输入以及对应的预期输出(你只提到了其中一部分) – DexJ
不知道该如何把数据附给你,但它包含整数和范围,例如:“70-80”、“70 and 80”、“65 to 70”,还有诸如“<1”、“12.2 + -5.2”、“over 95”、“around 50”之类的值。预期输出只是一个合适的整数估计值。“12.2±5.2”可以取 12.2;“over 95”可以直接取 95 –
那么我可以建议一种不同于你的解决方案吗?因为你的方案有点复杂,而且存在缺陷 – DexJ