1
目前,我有以下结构的 Scrapy 项目(标题:Scrapy 无法在当前项目中找到我的蜘蛛):
.
├── articlescraper
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── items.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── scheduler.py
│   ├── scheduler.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── nujijspider.py
│       └── nujijspider.pyc
└── scrapy.cfg
现在,我在 scheduler.py 中调用如下代码:
from Queue import Queue
import threading
import time
import sys
import imp
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class Scheduler(object):
    """Queue-driven scheduler that launches crawls for articles.

    Keeps the work items it has been handed in ``articleInformation`` and
    starts the comment spider through a Scrapy ``CrawlerProcess``.

    NOTE(review): the original description also mentions rescheduling
    articles to be crawled again; nothing in this class does that yet.
    """

    def __init__(self):
        # Accepted work items, keyed by their 'Id' field.
        self.articleInformation = {}
        # Queue that schedule() takes its next item from.
        self.taskQueue = Queue()

    def append_work(self, work):
        """Register a work item unless one with the same 'Id' is already known.

        Args:
            work: mapping that contains at least an 'Id' key.
        """
        # Entries are stored under work['Id'], so the duplicate check has to
        # use the same key. Testing work['Url'] against a dict keyed by Id
        # never matched, which silently overwrote existing entries.
        if work['Id'] not in self.articleInformation:
            self.articleInformation[work['Id']] = work
        print(self.articleInformation)

    def schedule(self):
        """Take one item off the task queue and run the comment spider.

        Blocks until an item is available on ``taskQueue``; ``process.start()``
        then blocks until the crawl has finished.
        """
        # NOTE(review): the dequeued article is not used yet and the url
        # handed to the spider below is a placeholder.
        article = self.taskQueue.get()
        process = CrawlerProcess(get_project_settings())
        # The spider is referenced by name, so Scrapy has to resolve it
        # through the project's spider loader; a "Spider not found" KeyError
        # means the project settings were not picked up -- presumably because
        # scrapy.cfg is not visible from the working directory (TODO confirm).
        process.crawl("articlecommentspider", url="///")
        process.start()
但是这会导致 Scrapy 报出如下错误:
File "/usr/local/lib/python2.7/site-packages/scrapy/spiderloader.py", line 43, in load
raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: articlecommentspider'
蜘蛛:
class ArticleCommentSpider(scrapy.Spider):
    """Spider that collects the comments found on an article page.

    The article pages are specific to www.nujij.nl and nu.nl related
    websites.
    """

    name = 'articlecommentspider'
    allowed_domains = ['nujij.nl']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and take the start URL(s) from ``url``.

        Args:
            *args: forwarded unchanged to ``scrapy.Spider.__init__``.
            **kwargs: forwarded as well; ``url`` may be a single URL or a
                list/tuple of URLs.
        """
        super(ArticleCommentSpider, self).__init__(*args, **kwargs)
        # Spider arguments arrive as keyword arguments; ``args`` is a tuple
        # and has no ``get`` method, so the lookup must go through kwargs.
        arg = kwargs.get('url')
        if not arg:
            # Kept from the original: echo the missing/empty value.
            print(arg)
            self.start_urls = []
        elif isinstance(arg, (list, tuple)):
            self.start_urls = list(arg)
        else:
            # start_urls has to be a list; a bare string would be iterated
            # character by character.
            self.start_urls = [arg]

    def parse(self, response):
        """Extract every comment from an article page and print it.

        Args:
            response: the downloaded article page.
        """
        # The title is defined oddly on nujij.nl (<h1 class="title">): the
        # second text node is the one that is used.
        title = response.xpath(
            "//h1" + matchClass('title') + "//text()").extract()[1]
        # The regex matches things like "873238.lynkx" in the URL; the part
        # before the first dot is kept.
        article_id = prog.search(response.url).group().split('.')[0]
        # Comments contain a lot of <br> tags. replace() returns a NEW
        # response, so it has to be rebound -- discarding the result left the
        # original body untouched.
        response = response.replace(
            body=response.body.replace('<br>', '\n'))

        # Every list item underneath the reactions.
        comment_nodes = response.xpath(
            '//ol[@class="reacties"]//li' + matchClass('hidenum'))
        for item in comment_nodes:
            # Id of the list item (unique within an article).
            comment_id = item.xpath('@id').extract_first()
            fragments = item.xpath(
                './/div[@class="reactie-body "]/text()').extract()
            content = ''.join(fragment.strip() for fragment in fragments)
            date = item.xpath(
                'normalize-space(.//span[@class="tijdsverschil"])').extract()
            date = dateparser.parse("".join(date))

            articleComment = Comment()
            articleComment['Id'] = article_id + "+" + str(comment_id)
            # Scraped text is kept as unicode: on Python 2, str() raises
            # UnicodeEncodeError as soon as it meets a non-ASCII character.
            articleComment['Source'] = title
            articleComment['IndexedAt'] = date
            articleComment['Url'] = response.url
            articleComment['Parser'] = "nujij.nl"
            articleComment['Content'] = content
            articleComment['Subject'] = {
                "url": response.url,
                "title": title,
            }
            # NOTE(review): items are only printed, never yielded, so they do
            # not reach the item pipelines -- confirm whether that is intended.
            print(articleComment)
当我用 `scrapy list` 列出蜘蛛时,两个蜘蛛都能列出来。scheduler 文件也位于 articlescraper 项目中。为什么我无法在这个进程中调用该蜘蛛?
文档中也是这样写的:
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    process = CrawlerProcess(get_project_settings())
    # 'followall' 是项目中某个蜘蛛的名称。
    process.crawl('followall', domain='scrapinghub.com')
    process.start()  # 脚本将在此处阻塞,直到抓取完成为止
–
嗯,我从来没有真正这样用过,但我刚刚在 `testspiders` 仓库中按文档推荐的方式试了一下,确实可以正常工作。你的项目结构也几乎相同。`scrapy list` 是否返回了正确的蜘蛛名字? – Granitosaurus
是的,所以我才觉得奇怪。我已经改用你的方法,现在可以正常工作了! –