-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix bugs: attribute error, MongoDB duplicate record error, character encoding error #5
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,7 +96,7 @@ class MongodbWoaiduBookFile(FilePipeline): | |
def __init__(self,shard_server,shard_port,shard_db,shard_gridfs_collection,download_func=None): | ||
self.style = color.color_style() | ||
##########from MediaPipeline########### | ||
self.spiderinfo = {} | ||
# self.spiderinfo = {} | ||
self.download_func = download_func | ||
##########from MediaPipeline########### | ||
|
||
|
@@ -126,8 +126,8 @@ def process_item(self, item, spider): | |
""" | ||
custom process_item func,so it will manage the Request result. | ||
""" | ||
|
||
info = self.spiderinfo[spider] | ||
log.msg("FUCK*** come here meida_naem=%s" % self.MEDIA_NAME, level=log.DEBUG, spider=spider) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FUCK 。。。这就不要了吧 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. On Tue, Aug 27, 2013 at 12:54 PM, gnemoug [email protected] wrote:
|
||
info = self.spiderinfo.spider | ||
requests = arg_to_iter(self.get_media_requests(item, info)) | ||
dlist = [self._process_request(r, info) for r in requests] | ||
dfd = DeferredList(dlist, consumeErrors=1) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
from __future__ import with_statement | ||
|
||
import os | ||
import os.path | ||
import datetime | ||
import logging | ||
import shutil | ||
import urllib | ||
import IPy | ||
import hashlib | ||
|
||
def make_dir(dir):
    """Ensure that the directory ``dir`` exists, creating parents as needed.

    The directory is created with ``os.makedirs``. If creation fails but the
    path is an existing directory afterwards (it was already there, or was
    created by someone else in the meantime), the failure is ignored.

    Raises:
        OSError / IOError: if the directory could not be created and ``dir``
            is not an existing directory. This includes the case where
            ``dir`` already exists as a regular file, which previously
            returned silently.
    """
    try:
        os.makedirs(dir)
    except (IOError, OSError):
        # Creation failed: only swallow the error when the goal is already
        # met, i.e. the path really is a directory.
        if not os.path.isdir(dir):
            raise
|
||
def make_unicode(value, prefer_encodings=None):
    """Decode a byte string to unicode, trying several encodings in order.

    Args:
        value: the object to convert. ``None``, unicode strings and any
            object that is not a byte string are returned unchanged.
        prefer_encodings: list of encoding names tried in order; defaults to
            ``['utf8', 'gbk', 'gbk?']``. A trailing marker character selects
            the decode error handler for that entry:
            ``!`` -> 'ignore', ``?`` -> 'replace',
            ``&`` -> 'xmlcharrefreplace', ``\\`` -> 'backslashreplace'.
            Without a marker the decode is strict.

    Returns:
        The decoded unicode string, or ``value`` itself when no decoding
        applies.

    Raises:
        UnicodeError: the error from the last encoding tried, when every
            encoding fails.
        ValueError: when ``prefer_encodings`` is empty.
    """
    # Resolve the text/bytes types so the function works on Python 2 and 3.
    try:
        text_type, bytes_type = unicode, str
    except NameError:
        text_type, bytes_type = str, bytes

    if prefer_encodings is None:
        prefer_encodings = ['utf8', 'gbk', 'gbk?']

    if value is None or isinstance(value, text_type):
        return value
    if not isinstance(value, bytes_type):
        return value

    # NOTE(review): the codecs documentation lists 'xmlcharrefreplace' as an
    # encode-only handler, so '&' probably cannot recover from undecodable
    # bytes -- confirm before relying on it.
    suffix_handlers = {
        '!': 'ignore',
        '?': 'replace',
        '&': 'xmlcharrefreplace',
        '\\': 'backslashreplace',
    }

    last_error = None
    for enc in prefer_encodings:
        errors = suffix_handlers.get(enc[-1:])
        try:
            if errors is None:
                return value.decode(enc)
            return value.decode(enc[:-1], errors)
        except UnicodeError as err:
            # Remember the failure and fall through to the next encoding.
            last_error = err

    # Raise explicitly instead of a bare ``raise`` outside an except block,
    # which depended on interpreter-specific leftover exception state.
    if last_error is None:
        raise ValueError('prefer_encodings must contain at least one encoding')
    raise last_error
|
||
def _make_unicode_elem(obj, **options):
    """Recursively apply ``make_unicode`` to the strings inside ``obj``.

    Lists are rebuilt element by element. Dicts are rebuilt as plain dicts
    whose keys go through ``make_unicode`` and whose values are converted
    recursively. ``str`` instances are converted directly. Any other object
    is returned unchanged. ``options`` are forwarded as keyword arguments to
    ``make_unicode``.
    """
    if isinstance(obj, list):
        return [_make_unicode_elem(item, **options) for item in obj]

    if isinstance(obj, dict):
        converted = {}
        for key, val in obj.items():
            # Convert the key before the value, as the original did.
            new_key = make_unicode(key, **options)
            converted[new_key] = _make_unicode_elem(val, **options)
        return converted

    if isinstance(obj, str):
        return make_unicode(obj, **options)

    return obj
|
||
def make_unicode_obj(obj, **options):
    """Return ``obj`` with nested strings converted by ``_make_unicode_elem``.

    ``options`` are passed through unchanged as keyword arguments.
    """
    converted = _make_unicode_elem(obj, **options)
    return converted
|
||
def make_utf8(value, prefer_encodings=None):
    """Return ``value`` as a UTF-8 encoded byte string.

    ``value`` is first passed through ``make_unicode`` with
    ``prefer_encodings``. A unicode result is encoded as 'utf8' with the
    'xmlcharrefreplace' error handler. Any other result, including ``None``,
    is returned as is.
    """
    decoded = make_unicode(value, prefer_encodings)
    if isinstance(decoded, unicode):
        return decoded.encode('utf8', 'xmlcharrefreplace')
    # None and non-string objects pass straight through.
    return decoded
|
||
def _make_utf8_elem(obj, **options):
    """Recursively apply ``make_utf8`` to the unicode strings inside ``obj``.

    Lists are rebuilt element by element. Dicts are rebuilt as plain dicts
    whose keys go through ``make_utf8`` and whose values are converted
    recursively. ``unicode`` instances are converted directly. Any other
    object is returned unchanged. ``options`` are forwarded as keyword
    arguments to ``make_utf8``.
    """
    if isinstance(obj, list):
        return [_make_utf8_elem(item, **options) for item in obj]

    if isinstance(obj, dict):
        converted = {}
        for key, val in obj.items():
            # Convert the key before the value, as the original did.
            new_key = make_utf8(key, **options)
            converted[new_key] = _make_utf8_elem(val, **options)
        return converted

    if isinstance(obj, unicode):
        return make_utf8(obj, **options)

    return obj
|
||
def make_utf8_obj(obj, prefer_encodings=None):
    """Return ``obj`` with nested unicode strings converted to UTF-8.

    Delegates to ``_make_utf8_elem``, forwarding ``prefer_encodings`` as a
    keyword argument.
    """
    converted = _make_utf8_elem(obj, prefer_encodings=prefer_encodings)
    return converted
|
||
def is_ip_address(s):
    """Return True if ``IPy.IP`` accepts ``s``, False otherwise.

    Any exception raised while constructing ``IPy.IP(s)`` is treated as
    "not an IP address".
    """
    try:
        IPy.IP(s)
    except Exception:
        # The exception object itself is not needed; the old
        # ``except Exception, e`` form was Python-2-only syntax.
        return False
    return True
|
||
def get_file_md5(path):
    """Return the hex MD5 digest of the file at ``path``.

    The file is read in 512 KiB chunks so that large files are never loaded
    into memory at once.
    """
    md5 = hashlib.md5()
    # Binary mode is required: text mode can translate newlines (altering
    # the bytes being hashed) and yields text instead of bytes on Python 3.
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(512 * 1024), b''):
            md5.update(chunk)
    return md5.hexdigest()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这里改一下