Fix attribute error, MongoDB duplicate record error, and character encoding error bugs #5

Open · wants to merge 4 commits into base: master
5,595 changes: 156 additions & 5,439 deletions woaidu_crawler/logs/scrapy.log

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions woaidu_crawler/woaidu_crawler/pipelines/bookfile.py
@@ -57,7 +57,7 @@ def __init__(self,store_uri,download_func=None):
self.bookfile_store = store_uri
self.store = self._get_store(store_uri)
self.item_download = {}

Owner commented: Please change this here.

@classmethod
def from_settings(cls, settings):
cls.EXPIRES = settings.getint('BOOK_FILE_EXPIRES', 90)
@@ -72,8 +72,8 @@ def process_item(self, item, spider):
"""
custom process_item func,so it will manage the Request result.
"""

info = self.spiderinfo[spider]
log.msg("***[pipeline] media_name : %s || spider: %r || self.spiderinfo: %r" % (self.MEDIA_NAME, spider, self.spiderinfo), level=log.DEBUG, spider=spider)
info = self.spiderinfo
requests = arg_to_iter(self.get_media_requests(item, info))
dlist = [self._process_request(r, info) for r in requests]
dfd = DeferredList(dlist, consumeErrors=1)
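For reference, a minimal sketch (not part of this PR; fake_download and finish_item are illustrative stand-ins) of the Deferred fan-out pattern that process_item relies on: each media request becomes a Deferred, DeferredList waits for all of them, and a final callback re-attaches the results to the item.

# Illustrative only; not code from this repository or from Scrapy internals.
from twisted.internet.defer import DeferredList, succeed
from scrapy.utils.misc import arg_to_iter

def fake_download(request):
    # Stand-in for MediaPipeline._process_request: returns an already-fired Deferred.
    return succeed({'request': request, 'status': 'ok'})

def finish_item(results, item):
    # results is a list of (success, value) tuples, one per request.
    item['downloads'] = [value for success, value in results if success]
    return item

def process_item_sketch(item, requests):
    dlist = [fake_download(r) for r in arg_to_iter(requests)]
    return DeferredList(dlist, consumeErrors=1).addCallback(finish_item, item)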
8 changes: 5 additions & 3 deletions woaidu_crawler/woaidu_crawler/pipelines/file.py
@@ -19,6 +19,8 @@
from woaidu_crawler.utils.select_result import list_first_item
from scrapy.exceptions import NotConfigured, IgnoreRequest

from woaidu_crawler.utils.coding import make_unicode, make_unicode_obj

class FileException(Exception):
"""General file error exception"""
def __init__(self, file_url=None, *args):
@@ -264,17 +266,17 @@ def get_file_name(self,request,response):
#MPLS TE Switching%E6%96%B9%E6%A1%88%E7%99%BD%E7%9A%AE%E4%B9%A6.pdf
#use urllib.unquote(filename) instead
if urlparse(request.url).netloc in self.ATTACHMENT_FILENAME_UTF8_DOMAIN:
filename = filename.decode("utf-8")
filename = make_unicode(filename)
else:
filename = filename.decode("gbk")
filename = make_unicode(filename)
#print "Content-Disposition:","*"*30,filename
else:
guessname = request.url.split('/')[-1]
#os.path.splitext:
#Split the pathname path into a pair (root, ext) such that root + ext == path
if os.path.splitext(guessname)[1].lower() in self.FILE_EXTENTION:
if urlparse(request.url).netloc in self.URL_GBK_DOMAIN:
filename = urllib.unquote(guessname).decode("gbk").encode("utf-8")
filename = make_unicode(urllib.unquote(guessname)).encode("utf-8")
else:
filename = urllib.unquote(guessname)
#print "url:","*"*30,filename
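A small usage sketch (assumed, not part of this PR; the sample filename is illustrative) of the decoding path the patch moves to: unquote the URL-escaped filename, then let make_unicode try utf8 first and fall back to gbk, instead of hard-coding one codec per domain.

# Assumed usage of the new helper, Python 2.
import urllib
from woaidu_crawler.utils.coding import make_unicode

def decode_filename(raw_name):
    # raw_name comes from Content-Disposition or from the URL tail.
    unquoted = urllib.unquote(raw_name)
    return make_unicode(unquoted)  # tries utf8, then gbk, then gbk with replacement

print repr(decode_filename("MPLS%20TE%20Switching%E6%96%B9%E6%A1%88.pdf"))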
20 changes: 17 additions & 3 deletions woaidu_crawler/woaidu_crawler/pipelines/mongodb.py
@@ -1,13 +1,15 @@
#!/usr/bin/python
#-*-coding:utf-8-*-

import sys
import datetime
import traceback
from pprint import pprint
from woaidu_crawler.utils import color
from scrapy import log
from woaidu_crawler.utils import color
from pymongo.connection import MongoClient
import pymongo

class SingleMongodbPipeline(object):
"""
@@ -56,9 +58,21 @@ def process_item(self, item, spider):
'original_url':item.get('original_url',''),
'update_time':datetime.datetime.utcnow(),
}

result = self.db['book_detail'].insert(book_detail)
item["mongodb_id"] = str(result)

log.msg("*** [pipeline] mogondb process_item: %r" % book_detail, level=log.DEBUG, spider=spider)

try:
bk_n = book_detail.pop('book_name')
author = book_detail.pop('author')
self.db['book_detail'].update({'book_name' : bk_n, 'author' : author},
{'$set' : book_detail},
upsert=True)
result = self.db['book_detail'].find_one({'book_name' : bk_n, 'author' : author})['_id']
item["mongodb_id"] = str(result)
except Exception, e:
traceback.print_exc()
sys.stderr.write('*** [pipeline] mongodb error: %s || %s || %s' % (type(e).__module__, type(e).__name__, e.args))
raise

log.msg("Item %s wrote to MongoDB database %s/book_detail" %
(result, self.MONGODB_DB),
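A minimal sketch (assumes pymongo 2.x and a reachable local mongod; the database name here is hypothetical) of the dedup strategy this hunk adopts: (book_name, author) acts as the natural key, and an upsert updates the existing document instead of inserting a second copy when the same book is crawled again.

# Illustrative sketch, not code from this repository.
from pymongo import MongoClient

books = MongoClient('localhost', 27017)['woaidu']['book_detail']

def save_book(book_detail):
    key = {'book_name': book_detail.pop('book_name'),
           'author': book_detail.pop('author')}
    books.update(key, {'$set': book_detail}, upsert=True)  # pymongo 2.x API
    return str(books.find_one(key)['_id'])

# Optionally, a unique compound index enforces the same constraint server-side:
# books.ensure_index([('book_name', 1), ('author', 1)], unique=True)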
6 changes: 3 additions & 3 deletions woaidu_crawler/woaidu_crawler/pipelines/mongodb_book_file.py
@@ -96,7 +96,7 @@ class MongodbWoaiduBookFile(FilePipeline):
def __init__(self,shard_server,shard_port,shard_db,shard_gridfs_collection,download_func=None):
self.style = color.color_style()
##########from MediaPipeline###########
self.spiderinfo = {}
# self.spiderinfo = {}
self.download_func = download_func
##########from MediaPipeline###########

@@ -126,8 +126,8 @@ def process_item(self, item, spider):
"""
custom process_item func,so it will manage the Request result.
"""

info = self.spiderinfo[spider]
log.msg("FUCK*** come here meida_naem=%s" % self.MEDIA_NAME, level=log.DEBUG, spider=spider)
Owner commented: FUCK... let's just drop this one.

Author commented (replying by email, Tue, Aug 27, 2013 at 12:54 PM): Hasn't this already been removed?

info = self.spiderinfo.spider
requests = arg_to_iter(self.get_media_requests(item, info))
dlist = [self._process_request(r, info) for r in requests]
dfd = DeferredList(dlist, consumeErrors=1)
10 changes: 5 additions & 5 deletions woaidu_crawler/woaidu_crawler/settings.py
@@ -34,11 +34,11 @@
#if you want to use shard mongodb,you need MongodbWoaiduBookFile and ShardMongodbPipeline
#if you want to use single mongodb,you need WoaiduBookFile and SingleMongodbPipeline
ITEM_PIPELINES = ['woaidu_crawler.pipelines.cover_image.WoaiduCoverImage',
# 'woaidu_crawler.pipelines.bookfile.WoaiduBookFile',
'woaidu_crawler.pipelines.mongodb_book_file.MongodbWoaiduBookFile',
'woaidu_crawler.pipelines.bookfile.WoaiduBookFile',
# 'woaidu_crawler.pipelines.mongodb_book_file.MongodbWoaiduBookFile',
'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile',
# 'woaidu_crawler.pipelines.mongodb.SingleMongodbPipeline',
'woaidu_crawler.pipelines.mongodb.ShardMongodbPipeline',
'woaidu_crawler.pipelines.mongodb.SingleMongodbPipeline',
# 'woaidu_crawler.pipelines.mongodb.ShardMongodbPipeline',
'woaidu_crawler.pipelines.final_test.FinalTestPipeline',]
#ITEM_PIPELINES = ['woaidu_crawler.pipelines.WoaiduBookFile',]

@@ -121,7 +121,7 @@
STATS_CLASS = 'woaidu_crawler.statscol.graphite.RedisGraphiteStatsCollector'

GRAPHITE_HOST = '127.0.0.1'
GRAPHITE_PORT = 2003
GRAPHITE_PORT = 2003 ## carbon-cache listens on this port
GRAPHITE_IGNOREKEYS = []

SingleMONGODB_SERVER = "localhost"
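As a quick sanity check for the GRAPHITE_PORT comment above: carbon-cache accepts Graphite's plaintext protocol, one "metric value timestamp" line per datapoint, on TCP 2003. A tiny sketch (assumes a carbon-cache instance actually listening locally; the metric name is made up):

# Illustrative only; sends a single datapoint to a local carbon-cache.
import socket
import time

sock = socket.create_connection(('127.0.0.1', 2003))
sock.sendall('woaidu_crawler.test.item_scraped_count 1 %d\n' % int(time.time()))
sock.close()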
102 changes: 102 additions & 0 deletions woaidu_crawler/woaidu_crawler/utils/coding.py
@@ -0,0 +1,102 @@
from __future__ import with_statement

import os
import os.path
import datetime
import logging
import shutil
import urllib
import IPy
import hashlib

def make_dir(dir):
if not os.path.exists(dir):
try:
os.makedirs(dir)
except (IOError, OSError):
if os.path.exists(dir) and os.path.isdir(dir):
return
else:
raise

def make_unicode(value, prefer_encodings=None):
if prefer_encodings is None:
prefer_encodings = ['utf8', 'gbk', 'gbk?']

if isinstance(value, unicode) or value is None:
return value

if not isinstance(value, str):
return value

for enc in prefer_encodings:
try:
if enc.endswith('!'):
return value.decode(enc[:-1], 'ignore')
elif enc.endswith('?'):
return value.decode(enc[:-1], 'replace')
elif enc.endswith('&'):
return value.decode(enc[:-1], 'xmlcharrefreplace')
elif enc.endswith('\\'):
return value.decode(enc[:-1], 'backslashreplace')
else:
return value.decode(enc)
except UnicodeError:
pass
else:
raise

def _make_unicode_elem(obj, **options):
if isinstance(obj, list):
obj = [_make_unicode_elem(elem, **options) for elem in obj]
elif isinstance(obj, dict):
obj = dict((make_unicode(k, **options), _make_unicode_elem(v, **options)) for k,v in obj.items())
elif isinstance(obj, str):
obj = make_unicode(obj, **options)
return obj

def make_unicode_obj(obj, **options):
return _make_unicode_elem(obj, **options)

def make_utf8(value, prefer_encodings=None):
uv = make_unicode(value, prefer_encodings)
if uv is None:
return None

if not isinstance(uv, unicode):
return uv

return uv.encode('utf8', 'xmlcharrefreplace')

def _make_utf8_elem(obj, **options):
if isinstance(obj, list):
obj = [_make_utf8_elem(elem, **options) for elem in obj]
elif isinstance(obj, dict):
obj = dict((make_utf8(k, **options), _make_utf8_elem(v, **options)) for k,v in obj.items())
elif isinstance(obj, unicode):
obj = make_utf8(obj, **options)

return obj

def make_utf8_obj(obj, prefer_encodings=None):
return _make_utf8_elem(obj, prefer_encodings=prefer_encodings)

def is_ip_address(s):
try:
IPy.IP(s)
except Exception, e:
return False
else:
return True

def get_file_md5(path):
"""Calc file MD5 with a save-memory method."""

md5 = hashlib.md5()
with open(path, 'rb') as f:
data = f.read(512*1024)
while data:
md5.update(data)
data = f.read(512*1024)

return md5.hexdigest()
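A short usage sketch (not part of this PR; the sample byte strings are made up) for the helpers above: make_unicode tries utf8, then gbk, then gbk with replacement characters, so mixed-encoding scraped values stop raising UnicodeError; make_unicode_obj applies that recursively to lists and dicts, and make_utf8 converts back to utf-8 bytes for storage or logging.

# Illustrative values: the first string is utf-8 bytes, the second is gbk bytes.
from woaidu_crawler.utils.coding import make_unicode_obj, make_utf8

item = {'book_name': '\xe6\x88\x91\xe7\x88\xb1\xe8\xaf\xbb',  # utf-8 bytes
        'author': '\xd5\xc5\xc8\xfd'}                          # gbk bytes

clean = make_unicode_obj(item)   # every str value comes back as unicode
assert all(isinstance(v, unicode) for v in clean.values())

print repr(make_utf8(clean['book_name']))  # back to utf-8 bytes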