
I'm writing a web spider to fetch Stack Overflow user information, and I'm trying to store that information in MongoDB. Here is my spider code, which works fine:

from scrapy import Spider, Request
from lxml import etree


class webSpider(Spider):
    name = "user_spider1"
    allowed_domains = ["stackoverflow.com"]
    start_urls = []

    def start_requests(self):
        # only page 1: range(1, 2) yields a single value
        for i in range(1, 2):
            self.start_urls.append("http://stackoverflow.com/users?page=" + str(i) + "&tab=reputation&filter=week")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        htmlTxt = response.body
        baseDomain = etree.HTML(htmlTxt)

        # relative links to the individual user pages
        userSubUrl = baseDomain.xpath('//div[@class="user-details"]/a/@href')
        baseUrl = 'http://stackoverflow.com'
        for subUrl in userSubUrl:
            yield Request(baseUrl + subUrl, callback=self.parse_userinfo)

    def parse_userinfo(self, response):
        htmlTxt = response.body
        infoDomain = etree.HTML(htmlTxt)

        item['user_name'] = stringprocessor(str(infoDomain.xpath('//h2[@class="user-card-name"]/text()[1]')))
        item['user_location'] = stringprocessor(str(infoDomain.xpath('//ul[@class="list-unstyled"]/li[1]/text()[2]')))
        item['user_reputation'] = stringprocessor(str(infoDomain.xpath('//div[@class="reputation"]/text()[1]')))
        tags = infoDomain.xpath('//div[@class="tag-container row"]/div/a[@class="post-tag"]/text()')

        item['user_tags'] = tags
        yield item
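
Note that `item` and `stringprocessor` are not defined in the snippet above; presumably they come from the project's items.py and a helper module, neither of which is shown. A hypothetical items.py consistent with the fields being filled in would look like:

import scrapy

# hypothetical item class, inferred from the fields the spider assigns
class UserItem(scrapy.Item):
    user_name = scrapy.Field()
    user_location = scrapy.Field()
    user_reputation = scrapy.Field()
    user_tags = scrapy.Field()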

Here are my pipeline file and settings, which may be where the mistake is:

import pymongo

from scrapy import log
from scrapy.conf import settings


class Spider1Pipeline(object):
    def __init__(self):
        connection = pymongo.Connection(
            settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database', level=log.DEBUG, spider=spider)
        return item
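
As an aside, `pymongo.Connection` and `Collection.insert` only exist in old PyMongo releases: PyMongo 3.x removed `Connection` in favour of `MongoClient` and deprecated `insert` in favour of `insert_one`. A minimal sketch of the same pipeline against PyMongo 3.x, assuming the same settings keys as above, would be:

import pymongo

from scrapy.conf import settings


class Spider1Pipeline(object):
    def __init__(self):
        # MongoClient replaces the removed pymongo.Connection
        client = pymongo.MongoClient(
            settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = client[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.collection.insert_one(dict(item))
        return item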

Settings:

BOT_NAME = 'test1' 

SPIDER_MODULES = ['test1.spiders'] 
NEWSPIDER_MODULE = 'test1.spiders' 


ROBOTSTXT_OBEY = True 

ITEM_PIPELINES = ['test1.pipelines.Spider1Pipeline',] 

MONGODB_SERVER='localhost' 
MONGODB_PORT=27017 
MONGODB_DB='test1' 
MONGODB_COLLECTION='user_info' 

The error I get is this:

AttributeError: 'list' object has no attribute 'iteritems' 

I'm really confused. Please help me out here.


You shouldn't be doing this, it goes against Stack Overflow's [TOS](http://meta.stackexchange.com/questions/277369/a-terms-of-service-update-restricting-companies-that-scrape-your-posted-informa?rq=1)

Answer


Your pipeline looks okay. Your spider is a bit strange, though. Here is a slightly better version that works:

import scrapy
from scrapy import Request


class WebSpider(scrapy.Spider):
    name = "user_spider1"
    allowed_domains = ["stackoverflow.com"]
    start_urls = []

    def start_requests(self):
        for i in range(1, 2):
            self.start_urls.append("http://stackoverflow.com/users?page=" + str(i) + "&tab=reputation&filter=week")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse(self, response):
        # use Scrapy's built-in selectors instead of lxml.etree
        userSubUrl = response.xpath('//div[@class="user-details"]/a/@href').extract()
        baseUrl = 'http://stackoverflow.com'
        for subUrl in userSubUrl:
            yield Request(baseUrl + subUrl, callback=self.parse_userinfo)

    def parse_userinfo(self, response):
        item = {}

        # placeholder for the missing stringprocessor helper
        stringprocessor = lambda x: x
        item['user_name'] = stringprocessor(str(response.xpath('//h2[@class="user-card-name"]/text()[1]').extract_first()))
        item['user_location'] = stringprocessor(str(response.xpath('//ul[@class="list-unstyled"]/li[1]/text()[2]').extract_first()))
        item['user_reputation'] = stringprocessor(str(response.xpath('//div[@class="reputation"]/text()[1]').extract_first()))
        # extract() rather than extract_first(): user_tags should hold the full list
        tags = response.xpath('//div[@class="tag-container row"]/div/a[@class="post-tag"]/text()').extract()

        item['user_tags'] = tags
        yield item
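
That said, the traceback quoted in the question typically points at the settings rather than the spider: Scrapy 1.x expects ITEM_PIPELINES to be a dict mapping each pipeline path to an order number, and passing a list is what triggers the 'list' object has no attribute 'iteritems' error. Assuming a Scrapy 1.x install, the settings entry would become:

# a dict, not a list; the value (0-1000) controls pipeline ordering
ITEM_PIPELINES = {'test1.pipelines.Spider1Pipeline': 300}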