Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read proxy list from a URL #62

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions rotating_proxies/middlewares.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
import logging
import codecs
from functools import partial
from six.moves.urllib.parse import urlsplit
from six.moves.urllib.request import urlopen

from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy import signals
Expand Down Expand Up @@ -47,7 +49,8 @@ class RotatingProxyMiddleware(object):
Settings:

* ``ROTATING_PROXY_LIST`` - a list of proxies to choose from;
* ``ROTATING_PROXY_LIST_PATH`` - path to a file with a list of proxies;
* ``ROTATING_PROXY_LIST_PATH`` - path to a file with a list of proxies, or
a URL that returns a list of proxies;
* ``ROTATING_PROXY_LOGSTATS_INTERVAL`` - stats logging interval in seconds,
30 by default;
* ``ROTATING_PROXY_CLOSE_SPIDER`` - When True, spider is stopped if
Expand Down Expand Up @@ -83,8 +86,12 @@ def from_crawler(cls, crawler):
s = crawler.settings
proxy_path = s.get('ROTATING_PROXY_LIST_PATH', None)
if proxy_path is not None:
with codecs.open(proxy_path, 'r', encoding='utf8') as f:
proxy_list = [line.strip() for line in f if line.strip()]
if re.match("^http", proxy_path, re.IGNORECASE):
with urlopen(proxy_path) as f:
proxy_list = [line.decode("utf-8").strip() for line in f if line.decode("utf-8").strip()]
else:
with codecs.open(proxy_path, 'r', encoding='utf8') as f:
proxy_list = [line.strip() for line in f if line.strip()]
else:
proxy_list = s.getlist('ROTATING_PROXY_LIST')
if not proxy_list:
Expand Down