- 这是我对这个问题的解决方案
import pandas as pd
import datetime
import os
class Update(object):
    """Split one category's update feed into in-stock / out-of-stock CSVs.

    Creating an instance immediately compares ``lys_masterfile.txt`` with the
    category's ``*_Update.csv`` file and writes both result files. All paths
    are resolved relative to the current working directory.
    """

    # Category name -> token that marks that category inside a master SKU.
    SKU_CODES = {
        'Electronics': 'ELECTRNCS',
        'Sports Equipment': 'SPRTSEQIP',
        'Health and Beauty': 'HLTHBTY',
        "Women's Fashion Accessories": 'WMNFSHACCSS',
        'Toys and Games': 'TOYS',
        "Men's Fashion Shoes": 'MNFSHSHOE',
        "Other Sports Shoes": 'OTHSPRTSSHOE',
        "Women's Sports Shoes": 'WMNSPORTSHOE',
        "Men's Running Shoes": 'MNSRUNSHOE',
        "Amazon Global-Toys": 'GLBTOYS',
        "Women's Running Shoes": 'WMNRUNSHOE',
        "Women's Fashion Shoes": 'WMNFSHSHOE',
        "Computer & Accessories": 'CMPTRACCS',
        "Office Supplies": "OFFSUPPLIES",
        "Clothing Accessories": "CLTHACCSS",
        "TigerDirect": "TDRCT",
    }

    def __init__(self, category):
        """Resolve the input paths for ``category`` and run the comparison.

        :param category: category name; must be a key of ``SKU_CODES``.
        :raises ValueError: if ``category`` is not a known category.
        """
        masterfile = os.path.realpath('lys_masterfile.txt')
        update_file = os.path.realpath(
            'Outputs/liveyoursport/Update_Spider/{}_Update.csv'.format(category))
        self.comparision(masterfile, update_file, category)

    def comparision(self, output_file, update_file, category):
        """Write the in-stock and out-of-stock reports for ``category``.

        :param output_file: path of the tab-separated master file; only its
            ``Product Code/SKU`` column is read.
        :param update_file: path of the update CSV, read without a header
            row; its third and fourth columns are used as ``sku``/``price``.
        :param category: category name; must be a key of ``SKU_CODES``.
        :raises ValueError: if ``category`` is not a known category.
        """
        sku = self.SKU_CODES.get(category)
        if sku is None:
            # Fail early: otherwise ``None in value`` raises an opaque
            # TypeError from inside ``Series.apply`` below.
            raise ValueError('Unknown category: {!r}'.format(category))

        def extraction(value):
            """Return the id before the first '-' (without 'LYS'), else None."""
            if isinstance(value, str) and sku in value:
                return value.split('-')[0].replace('LYS', '')
            return None

        # Extract only the necessary field from the master file.
        masterfile_sku = pd.read_csv(output_file, usecols=['Product Code/SKU'],
                                     delimiter='\t', skip_blank_lines=True)

        # Pair every master SKU with its extracted id and keep only the rows
        # that belong to the requested category.
        products_df = pd.DataFrame({
            'sku': masterfile_sku['Product Code/SKU'].apply(extraction),
            'Product Code/SKU': masterfile_sku['Product Code/SKU'],
        })
        products_df = products_df[products_df['sku'].notnull()]

        # Read ids as text: type inference would turn all-digit ids into
        # integers, dropping leading zeros and breaking the join on 'sku'.
        update_df = pd.read_csv(update_file, usecols=[2, 3],
                                names=['sku', 'price'], dtype={'sku': str})

        # A missing price marks an out-of-stock product. read_csv stores
        # missing cells as NaN, which never equals the string 'nan', so the
        # split has to use a null test rather than a string comparison.
        has_price = update_df['price'].notnull()
        update_in_stock_df = update_df[has_price]
        update_out_stock_df = update_df[~has_price]

        # In stock: category products that have a priced update row.
        in_stock = pd.merge(products_df, update_in_stock_df, on='sku', how='inner')

        # Out of stock: category products with no in-stock match, combined
        # with the update rows that carry no price.
        out_of_stock = pd.merge(in_stock, products_df, on='sku', how='right',
                                indicator=True).query("_merge == 'right_only'")
        out_of_stock = pd.merge(out_of_stock, update_out_stock_df, on='sku', how='outer')
        out_of_stock = out_of_stock.drop_duplicates(subset='sku')

        # Write both reports, creating the target folders when missing so the
        # run does not fail only at the very last step.
        in_stock_path = os.path.realpath(
            'Outputs/liveyoursport/in_stock/Lys_{}_in_stock.csv'.format(category))
        out_of_stock_path = os.path.realpath(
            'Outputs/liveyoursport/out_of_stock/Lys_{}_out_of_stock.csv'.format(category))
        for frame, path in ((in_stock, in_stock_path), (out_of_stock, out_of_stock_path)):
            folder = os.path.dirname(path)
            if not os.path.isdir(folder):
                os.makedirs(folder)
            frame.to_csv(path)
if __name__ == '__main__':
    # Time one complete run for a single category.
    started = datetime.datetime.now()
    Update("Women's Running Shoes")
    # Parenthesised single-argument print behaves the same on Python 2 and 3;
    # the bare print statement is a syntax error on Python 3.
    print('Done')
    print('Completed in {}'.format(datetime.datetime.now() - started))
所以你了解 pandas,但你有已经写好的代码吗? –
@cricket_007 **不多** –
顺便说一句,对于查询/过滤来说,SQLite 可能比 CSV 文件更合适 –