Skip to content

Commit

Permalink
Always mount HTTPS adapter (#249)
Browse files Browse the repository at this point in the history
  • Loading branch information
Dwayne authored and Anorov committed May 17, 2019
1 parent 7bb1da9 commit f6941b4
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 48 deletions.
65 changes: 18 additions & 47 deletions cfscrape/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

import json
import logging
import random
Expand Down Expand Up @@ -58,23 +60,25 @@
https://github.com/Anorov/cloudflare-scrape/issues\
"""

class ExcludeCaptchaProvokingCiphersAdapter(HTTPAdapter):
url = "https://"
# Remove a few problematic TLSv1.0 ciphers from the defaults
DEFAULT_CIPHERS += ":!ECDHE+SHA:!AES128-SHA"


def enable(self):
setattr(self, "is_enabled", True)
class CloudflareAdapter(HTTPAdapter):
""" HTTPS adapter that creates a SSL context with custom ciphers """

def disable(self):
setattr(self, "is_enabled", False)
def get_connection(self, *args, **kwargs):
conn = super(CloudflareAdapter, self).get_connection(*args, **kwargs)

def get_connection(self, url, proxies=None):
conn = super(ExcludeCaptchaProvokingCiphersAdapter, self).get_connection(url, proxies)
# Only if enable() has been explicitly invoked, the ciphers are updated.
if getattr(self, "is_enabled", False):
conn.conn_kw['ssl_context'] = create_urllib3_context(ciphers=DEFAULT_CIPHERS + ":!ECDHE+SHA:!AES128-SHA")
if conn.conn_kw.get("ssl_context"):
conn.conn_kw["ssl_context"].set_ciphers(DEFAULT_CIPHERS)
else:
context = create_urllib3_context(ciphers=DEFAULT_CIPHERS)
conn.conn_kw["ssl_context"] = context

return conn


class CloudflareError(RequestException):
pass

Expand All @@ -97,9 +101,7 @@ def __init__(self, *args, **kwargs):
# Define headers to force using an OrderedDict and preserve header order
self.headers = headers

# Install the captcha adapter. Only if explicitly enabled it modifies
# the ssl context.
self.mount(ExcludeCaptchaProvokingCiphersAdapter.url, ExcludeCaptchaProvokingCiphersAdapter())
self.mount("https://", CloudflareAdapter())

@staticmethod
def is_cloudflare_iuam_challenge(resp):
Expand All @@ -119,11 +121,6 @@ def is_cloudflare_captcha_challenge(resp):
)

def request(self, method, url, *args, **kwargs):
# Mount the adapter responsible for removing ciphers which might provoke
# a captcha, but only if it's a https request.
if self.should_enable_adapter(url):
self.enable_adapter()

resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

# Check if Cloudflare captcha challenge is presented
Expand All @@ -134,18 +131,13 @@ def request(self, method, url, *args, **kwargs):
if self.is_cloudflare_iuam_challenge(resp):
resp = self.solve_cf_challenge(resp, **kwargs)

# Unmount the adapter if Cloudflare has accepted the challenge solution
# and has sent the cf_clearance cookie.
if self.cloudflare_is_bypassed(url, resp):
self.disable_adapter()

return resp

def cloudflare_is_bypassed(self, url, resp=None):
cookie_domain = ".{}".format(urlparse(url).netloc)
return (
self.cookies.get("cf_clearance", None, domain=cookie_domain) or
(resp and resp.cookies.get("cf_clearance", None, domain=cookie_domain))
(resp and resp.cookies.get("cf_clearance", None, domain=cookie_domain))
)

def handle_captcha_challenge(self, resp, url):
Expand Down Expand Up @@ -311,27 +303,6 @@ def solve_challenge(self, body, domain):

return result, delay

def should_enable_adapter(self, url=None):
if self.cloudflare_is_bypassed(url):
return False

if url and urlparse(url).scheme != "https":
return False

return True

def enable_adapter(self):
adapter = self.get_adapter(ExcludeCaptchaProvokingCiphersAdapter.url)

if isinstance(adapter, ExcludeCaptchaProvokingCiphersAdapter):
adapter.enable()

def disable_adapter(self):
adapter = self.get_adapter(ExcludeCaptchaProvokingCiphersAdapter.url)

if isinstance(adapter, ExcludeCaptchaProvokingCiphersAdapter):
adapter.disable()

@classmethod
def create_scraper(cls, sess=None, **kwargs):
"""
Expand All @@ -357,7 +328,7 @@ def create_scraper(cls, sess=None, **kwargs):

return scraper

## Functions for integrating cloudflare-scrape with other applications and scripts
# Functions for integrating cloudflare-scrape with other applications and scripts

@classmethod
def get_tokens(cls, url, user_agent=None, **kwargs):
Expand Down
1 change: 0 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-

import responses
import pytest
import re

from requests.compat import urlencode
Expand Down
46 changes: 46 additions & 0 deletions tests/test_adapters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

import ssl
import sure # noqa
import urllib3

from cfscrape import CloudflareAdapter


class TestCloudflareAdapter:
    """Tests for the HTTPS adapter that installs a custom-cipher SSL context."""

    def test_create_adapter(self):
        """The adapter must be a requests ``HTTPAdapter``."""
        adapter = CloudflareAdapter()
        try:
            adapter.should.be.a("requests.adapters.HTTPAdapter")
        finally:
            # Release the pool manager even when the assertion fails.
            adapter.close()

    def test_get_connection(self):
        """``get_connection`` must attach an SSL context to the pool kwargs."""
        adapter = CloudflareAdapter()
        try:
            conn = adapter.get_connection("https://127.0.0.1", None)

            conn.conn_kw.should.be.a("dict")
            conn.conn_kw.should.have.key("ssl_context")
            ssl_context = conn.conn_kw["ssl_context"]

            # This should be ssl.SSLContext unless pyOpenSSL is installed.
            # If pyOpenSSL is injected into urllib3, this should still work.
            # The pyopenssl submodule is only reachable as an attribute when
            # something has already imported it, so only the lookup is
            # guarded; the assertion itself is never swallowed.
            try:
                pyopenssl_context = urllib3.contrib.pyopenssl.PyOpenSSLContext
            except AttributeError:
                accepted_types = (ssl.SSLContext,)
            else:
                accepted_types = (pyopenssl_context, ssl.SSLContext)

            assert isinstance(ssl_context, accepted_types)
        finally:
            adapter.close()

    def test_set_ciphers(self):
        """A caller-supplied SSL context must be reused, not replaced."""
        adapter = CloudflareAdapter()
        try:
            # Reinitialize the pool manager with a different context
            ctx = ssl.create_default_context()
            adapter.init_poolmanager(1, 1, ssl_context=ctx)
            # Check to see if the context remains the same without error
            conn = adapter.get_connection('https://127.0.0.1', None)
            conn.conn_kw.should.be.a("dict")
            assert conn.conn_kw["ssl_context"] is ctx
        finally:
            adapter.close()

0 comments on commit f6941b4

Please sign in to comment.