
Scrapy MySQL pipeline error

I'm using Scrapy and I'm trying to save the scraped data from a spider to a MySQL database. I'm using a pipeline to do this, but with no luck so far. Here is my code for the pipeline:

from scrapy import log
from scrapy.core.exceptions import DropItem
from twisted.enterprise import adbapi

import time
import MySQLdb.cursors

class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""

    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, spider, item):
        print spider
        for word in self.words_to_filter:
            if word in unicode(item['description']).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        else:
            return item

class MySQLStorePipeline(object):

    def __init__(self):
        # @@@ hardcoded db settings
        # TODO: make settings configurable through settings
        self.dbpool = adbapi.ConnectionPool('adress_to_db',
            db='my_db',
            user='my_user',
            passwd='my_pw',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)

        return item

    def _conditional_insert(self, tx, item):
        # create record if it doesn't exist;
        # this whole block runs in its own thread
        tx.execute("select * from scrapytest where link = %s", (item['link'][0],))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(\
                "insert into scrapytest (title, link, desc) "
                "values (%s, %s, %s)",
                (item['title'][0],
                 item['link'][0],
                 item['desc'][0]
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)

And here is the error message I get:

PS C:\Python27\testscrapy\tutorial> scrapy crawl dmoz
2012-05-03 16:03:11+0200 [scrapy] INFO: Scrapy 0.14.3 started (bot: tutorial)
2012-05-03 16:03:12+0200 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2012-05-03 16:03:12+0200 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2012-05-03 16:03:12+0200 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
Traceback (most recent call last):
  File "C:\Python27\Scripts\scrapy", line 5, in <module>
    pkg_resources.run_script('Scrapy==0.14.3', 'scrapy')
  File "C:\Python27\lib\site-packages\pkg_resources.py", line 489, in run_script
    self.require(requires)[0].run_script(script_name, ns)
  File "C:\Python27\lib\site-packages\pkg_resources.py", line 1207, in run_script
    execfile(script_filename, namespace, namespace)
  File "c:\python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\EGG-INFO\scripts\scrapy", line 4, in <module>
    execute()
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\cmdline.py", line 132, in execute
    _run_print_help(parser, _run_command, cmd, args, opts)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\cmdline.py", line 97, in _run_print_help
    func(*a, **kw)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\cmdline.py", line 139, in _run_command
    cmd.run(args, opts)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\commands\crawl.py", line 43, in run
    spider = self.crawler.spiders.create(spname, **opts.spargs)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\command.py", line 34, in crawler
    self._crawler.configure()
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\crawler.py", line 37, in configure
    self.engine = ExecutionEngine(self, self._spider_closed)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\core\engine.py", line 62, in __init__
    self.scraper = Scraper(crawler)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\core\scraper.py", line 68, in __init__
    self.itemproc = itemproc_cls.from_crawler(crawler)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\middleware.py", line 48, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\middleware.py", line 29, in from_settings
    mwcls = load_object(clspath)
  File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\utils\misc.py", line 37, in load_object
    mod = __import__(module, {}, {}, [''])
  File "C:\Python27\testscrapy\tutorial\tutorial\pipelines.py", line 64
    log.msg("Item stored in db: %s" % item, level=log.DEBUG)
    ^
SyntaxError: invalid syntax

I don't know where to go from here, so any help is much appreciated!

Answers

tx.execute(
    "insert into scrapytest (title, link, desc) "
    "values (%s, %s, %s)",
    (item['title'][0],
     item['link'][0],
     item['desc'][0])
)

A closing parenthesis is needed there ^^

A good place to start is usually the line the error points at, or the line just before it.
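For reference, here is a sketch of what the whole corrected method could look like once the parenthesis is balanced. Two caveats beyond the fix itself, both assumptions worth verifying rather than something stated above: desc is a reserved word in MySQL, so the column is backtick-quoted below, and the first argument to twisted's adbapi.ConnectionPool is the DB-API module name ('MySQLdb'), not the address of the database, with the host passed as a keyword ('localhost' below is a placeholder):

def _conditional_insert(self, tx, item):
    # runs in a worker thread from the adbapi pool; tx behaves like a DB-API cursor
    tx.execute("select * from scrapytest where link = %s", (item['link'][0],))
    result = tx.fetchone()
    if result:
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
    else:
        # `desc` is a reserved word in MySQL, hence the backticks
        tx.execute(
            "insert into scrapytest (title, link, `desc`) "
            "values (%s, %s, %s)",
            (item['title'][0],
             item['link'][0],
             item['desc'][0])
        )
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)

and, in __init__:

# the first argument is the DB-API module name, not the database address;
# host='localhost' is a placeholder for your own server
self.dbpool = adbapi.ConnectionPool('MySQLdb',
    host='localhost',
    db='my_db',
    user='my_user',
    passwd='my_pw',
    cursorclass=MySQLdb.cursors.DictCursor,
    charset='utf8',
    use_unicode=True
)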


Thanks for the tip! Unfortunately it didn't solve the problem. Now I get this error: File "C:\Python27\testscrapy\tutorial\tutorial\pipelines.py", line 64 message = "Item stored in db: %s" % (item) ^ SyntaxError: invalid syntax – user1009453


@user1009453 I was looking at the wrong log.msg, there were no line numbers, sorry! You are missing a closing parenthesis – dm03514


You're absolutely right, I was missing a parenthesis. Now I get the following error: "ImportError: Error loading object 'tutorial.pipelines.MySQLStorePipeline': No module named exceptions". I've checked, and the pipeline is listed under the same name in settings.py. Does this mean I need to import a module called exceptions? Thanks for helping me out here! – user1009453
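For anyone who hits the same ImportError: in Scrapy 0.14 the exceptions live in scrapy.exceptions, not scrapy.core.exceptions (the old location was removed in earlier releases), so the failing import is most likely the DropItem line at the top of pipelines.py. A one-line change should clear it:

# scrapy.core.exceptions no longer exists in Scrapy 0.14; use scrapy.exceptions instead
from scrapy.exceptions import DropItem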