Skip to content

Commit

Permalink
update to new javacheck (#315)
Browse files Browse the repository at this point in the history
* update to new javacheck

* Update __init__.py

replace re with lxml.etree
for more dynamic solving

* updated cloudflare to use only regex

* updated cloudflare to use only regex_

* Create requirements.txt

* Update requirements.txt

* Revert "Create requirements.txt"

This reverts commit 040bf20.

# Conflicts:
#	requirements.txt

* fix return page from create_scraper object

The code could get the CF tokens but had a problem returning the page
that has CF.
I updated the code with a fix.

* added test to check if "Set-Cookie" in redirect.headers

Added a test to check if "Set-Cookie" is in redirect.headers,
as @grmnz suggested.
I felt that node.js runs fewer times than before.
  • Loading branch information
alzamer2 authored Feb 22, 2020
1 parent e4f31ed commit 3241f7d
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 38 deletions.
108 changes: 70 additions & 38 deletions cfscrape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def __init__(self, *args, **kwargs):

# Define headers to force using an OrderedDict and preserve header order
self.headers = headers
self.org_method = None

self.mount("https://", CloudflareAdapter())

Expand Down Expand Up @@ -152,21 +153,43 @@ def solve_cf_challenge(self, resp, **original_kwargs):
body = resp.text
parsed_url = urlparse(resp.url)
domain = parsed_url.netloc
submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)
challenge_form = re.search(r'\<form.*?id=\"challenge-form\".*?\/form\>',body, flags=re.S).group(0) # find challenge form
method = re.search(r'method=\"(.*?)\"', challenge_form, flags=re.S).group(1)
if self.org_method is None:
self.org_method = resp.request.method
submit_url = "%s://%s%s" % (parsed_url.scheme,
domain,
re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[0])

cloudflare_kwargs = copy.deepcopy(original_kwargs)

headers = cloudflare_kwargs.setdefault("headers", {})
headers["Referer"] = resp.url

try:
params = cloudflare_kwargs["params"] = OrderedDict(
re.findall(r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body)
)
cloudflare_kwargs["params"] = dict()
cloudflare_kwargs["data"] = dict()
if len(re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')) != 1:
for param in re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[1].split('&'):
cloudflare_kwargs["params"].update({param.split('=')[0]:param.split('=')[1]})

for input_ in re.findall(r'\<input.*?(?:\/>|\<\/input\>)', challenge_form, flags=re.S):
if re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1) != 'jschl_answer':
if method == 'POST':
cloudflare_kwargs["data"].update({re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1):
re.search(r'value=\"(.*?)\"',input_, flags=re.S).group(1)})
elif method == 'GET':
cloudflare_kwargs["params"].update({re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1):
re.search(r'value=\"(.*?)\"',input_, flags=re.S).group(1)})
if method == 'POST':
for k in ("jschl_vc", "pass"):
if k not in cloudflare_kwargs["data"]:
raise ValueError("%s is missing from challenge form" % k)
elif method == 'GET':
for k in ("jschl_vc", "pass"):
if k not in cloudflare_kwargs["params"]:
raise ValueError("%s is missing from challenge form" % k)

for k in ("jschl_vc", "pass"):
if k not in params:
raise ValueError("%s is missing from challenge form" % k)
except Exception as e:
# Something is wrong with the page.
# This may indicate Cloudflare has changed their anti-bot
Expand All @@ -179,55 +202,71 @@ def solve_cf_challenge(self, resp, **original_kwargs):

# Solve the Javascript challenge
answer, delay = self.solve_challenge(body, domain)
params["jschl_answer"] = answer
if method == 'POST':
cloudflare_kwargs["data"]["jschl_answer"] = answer
elif method == 'GET':
cloudflare_kwargs["params"]["jschl_answer"] = answer

# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
method = resp.request.method
cloudflare_kwargs["allow_redirects"] = False

# Cloudflare requires a delay before solving the challenge
time.sleep(max(delay - (time.time() - start_time), 0))

# Send the challenge response and handle the redirect manually
redirect = self.request(method, submit_url, **cloudflare_kwargs)
redirect_location = urlparse(redirect.headers["Location"])

if not redirect_location.netloc:
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment,
if "Location" in redirect.headers:
redirect_location = urlparse(redirect.headers["Location"])

if not redirect_location.netloc:
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment,
)
)
)
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)
elif "Set-Cookie" in redirect.headers:
if 'cf_clearance' in redirect.headers['Set-Cookie']:
resp = self.request(self.org_method, submit_url, cookies = redirect.cookies)
return resp
else:
return self.request(method, redirect_url, **original_kwargs)
else:
resp = self.request(self.org_method, submit_url, **cloudflare_kwargs)
return resp


def solve_challenge(self, body, domain):
try:
javascript = re.search(r'\<script type\=\"text\/javascript\"\>\n(.*?)\<\/script\>',body, flags=re.S).group(1) # find javascript

challenge, ms = re.search(
r"setTimeout\(function\(\){\s*(var "
r"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value\s*=.+?)\r?\n"
r"(?:[^{<>]*},\s*(\d{4,}))?",
body,
javascript, flags=re.S
).groups()

# The challenge requires `document.getElementById` to get this content.
# Future proofing would require escaping newlines and double quotes
innerHTML = re.search(r"<div(?: [^<>]*)? id=\"cf-dn.*?\">([^<>]*)", body)
innerHTML = innerHTML.group(1) if innerHTML else ""
innerHTML = ''
for i in javascript.split(';'):
if i.strip().split('=')[0].strip() == 'k': # from what i found out from pld example K var in
k = i.strip().split('=')[1].strip(' \'') # javafunction is for innerHTML this code to find it
innerHTML = re.search(r'\<div.*?id\=\"'+k+r'\".*?\>(.*?)\<\/div\>',body).group(1) #find innerHTML

# Prefix the challenge with a fake document object.
# Interpolate the domain, div contents, and JS challenge.
# The `a.value` to be returned is tacked onto the end.
challenge = """
"use strict";
var document = {
createElement: function () {
return { firstChild: { href: "http://%s/" } }
Expand Down Expand Up @@ -257,11 +296,6 @@ def solve_challenge(self, body, domain):
# The sandboxed code cannot use the Node.js standard library
js = (
"""\
try { Buffer.from("", "base64"); }\
catch (e) {\
throw new Error("Outdated Node.js detected: " +\
process.version + ", minimum supported version is 4.5");\
}\
var atob = Object.setPrototypeOf(function (str) {\
try {\
return Buffer.from("" + str, "base64").toString("binary");\
Expand All @@ -281,13 +315,13 @@ def solve_challenge(self, body, domain):
"""
% challenge
)

stderr = ''

try:
node = subprocess.Popen(
["node", "-e", js], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
universal_newlines=True
)
)
result, stderr = node.communicate()
if node.returncode != 0:
stderr = "Node.js Exception:\n%s" % (stderr or None)
Expand All @@ -300,9 +334,7 @@ def solve_challenge(self, body, domain):
)
raise
except Exception:
logging.error(stderr)
if not re.search(r"[^\"]Outdated Node.js detected", stderr):
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
raise

try:
Expand Down
101 changes: 101 additions & 0 deletions tests/fixtures/js_challenge_30_11_2019.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<!DOCTYPE HTML>
<html lang="en-US">
<head>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<title>Just a moment...</title>
<style type="text/css">
html, body {width: 100%; height: 100%; margin: 0; padding: 0;}
body {background-color: #ffffff; font-family: Helvetica, Arial, sans-serif; font-size: 100%;}
h1 {font-size: 1.5em; color: #404040; text-align: center;}
p {font-size: 1em; color: #404040; text-align: center; margin: 10px 0 0 0;}
#spinner {margin: 0 auto 30px auto; display: block;}
.attribution {margin-top: 20px;}
@-webkit-keyframes bubbles { 33%: { -webkit-transform: translateY(10px); transform: translateY(10px); } 66% { -webkit-transform: translateY(-10px); transform: translateY(-10px); } 100% { -webkit-transform: translateY(0); transform: translateY(0); } }
@keyframes bubbles { 33%: { -webkit-transform: translateY(10px); transform: translateY(10px); } 66% { -webkit-transform: translateY(-10px); transform: translateY(-10px); } 100% { -webkit-transform: translateY(0); transform: translateY(0); } }
.bubbles { background-color: #404040; width:15px; height: 15px; margin:2px; border-radius:100%; -webkit-animation:bubbles 0.6s 0.07s infinite ease-in-out; animation:bubbles 0.6s 0.07s infinite ease-in-out; -webkit-animation-fill-mode:both; animation-fill-mode:both; display:inline-block; }
</style>

<script type="text/javascript">
//<![CDATA[
(function(){
var a = function() {try{return !!window.addEventListener} catch(e) {return !1} },
b = function(b, c) {a() ? document.addEventListener("DOMContentLoaded", b, c) : document.attachEvent("onreadystatechange", b)};
b(function(){
var a = document.getElementById('cf-content');a.style.display = 'block';
setTimeout(function(){
var s,t,o,p,b,r,e,a,k,i,n,g,f, hWuAPfm={"xVxyYj":+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
g = String.fromCharCode;
o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
e = function(s) {
s += "==".slice(2 - (s.length & 3));
var bm, r = "", r1, r2, i = 0;
for (; i < s.length;) {
bm = o.indexOf(s.charAt(i++)) << 18 | o.indexOf(s.charAt(i++)) << 12
| (r1 = o.indexOf(s.charAt(i++))) << 6 | (r2 = o.indexOf(s.charAt(i++)));
r += r1 === 64 ? g(bm >> 16 & 255)
: r2 === 64 ? g(bm >> 16 & 255, bm >> 8 & 255)
: g(bm >> 16 & 255, bm >> 8 & 255, bm & 255);
}
return r;
};
t = document.createElement('div');
t.innerHTML="<a href='/'>x</a>";
t = t.firstChild.href;r = t.match(/https?:\/\//)[0];
t = t.substr(r.length); t = t.substr(0,t.length-1);
a = document.getElementById('jschl-answer');
f = document.getElementById('challenge-form');
;hWuAPfm.xVxyYj*=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]));hWuAPfm.xVxyYj*=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]))/+((+!![]+[])+(+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]));hWuAPfm.xVxyYj+=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]))/+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[]));a.value = (+hWuAPfm.xVxyYj + t.length).toFixed(10); '; 121'
f.action += location.hash;
f.submit();
}, 4000);
}, false);
})();
//]]>
</script>


</head>
<body>
<table width="100%" height="100%" cellpadding="20">
<tr>
<td align="center" valign="middle">
<div class="cf-browser-verification cf-im-under-attack">
<noscript><h1 data-translate="turn_on_js" style="color:#bd2426;">Please turn JavaScript on and reload the page.</h1></noscript>
<div id="cf-content" style="display:none">

<div>
<div class="bubbles"></div>
<div class="bubbles"></div>
<div class="bubbles"></div>
</div>
<h1><span data-translate="checking_browser">Checking your browser before accessing</span> kissmanga.com.</h1>

<p data-translate="process_is_automatic">This process is automatic. Your browser will redirect to your requested content shortly.</p>
<p data-translate="allow_5_secs">Please allow up to 5 seconds&hellip;</p>
</div>

<form id="challenge-form" action="/Manga/The-Fairy-Captivity?__cf_chl_jschl_tk__=072b4332f1c34ea3a99d456a575ab0294c4868ec-1575019984-0-ASKQNetI_COO72dgnF_r2PC4LVc8vKexMyimbcpQjZIQqOwdo1Fh7PKubmgVlLUS58qVJBoAlUMzb0sWrn8vwtDRHKxVrFuTMcEYBinjd6tbp9aBJEZiQctvCPBqAJJKqKaMXOf-aldIFWEt5EegylBjixChwZxA4ihaAqyC59gBg36Wjnvs1lkUEou573kvOyDGcJTiVyzf3O9a86tyO2N-uF8xRuqfWhMYNPwoXTRhXxNRF6rinqmPuURCYej0fxLu0lf21UQUV6JQlfrWdA_UopDhB_KSHgYT1NAzrUcm" method="POST" enctype="application/x-www-form-urlencoded">
<input type="hidden" name="r" value="901e5fc6103829c66a465ebaf9755c835e66f46c-1575019984-0-AUzvQW0hz4d1zGbwqui49PImeTFf+c4K/C6QEzvszNkWYIDBcEURSr4o/H5IdAWEHt8QBvt1k2LqiCygPXAg5VvhyPDUdk7+ngeTHRhJa/wqFRLtQDCJO8F9E5cpsDG4PjR83Paez4fsN7LfeVOg56jt4KZgSqgjomBNcBlsHuo/9c8FAQJtLCx/npoW2Pjsl8+FkojoziW4N1LWxlYd1qnjBgjmnxJvu8ZjdO9DEvXNCT7GZkNKhelSSO6PN6LWzJjyA+oKUN9TxeNjIVWPIL5dTFRA4vhRF5sfXNaJbpExqE9b37/vUyaaWZTGCL4U/dO+dnchaKwZiE+liztsPVpS1gxj+nUh8FjKN/QYpbcAJLOFAYsNQpM5THNGz9WANuF31jkhJT+w++EgL42B6uRFAYdutnqNnAwaZyR4Acph17v0pk4x/JEUtg4TvCG8MyW4g2S4u7Ak+drN5A/y6Li/rgiJKUJorGAE5BOhTR3IzkIsFhHJ+jZ2IMXLltPciTrbK3oN4tY33dfgwzEcTonCi9qB88Yuex/Sep9Mocmk/CSFo1PS44p+9kOFWkx9G1cCaGrifpbbOBMuTkVye5wPgDaXJRePSAc1FClCuxhskbadMST6DDvcRPwdziRlbalrCxe6f00HM131oP7aUkDropkfVlTvImfSh0nQLfjxayTZzOzb43SBe6/FKUBQig=="></input>
<input type="hidden" name="jschl_vc" value="c22565a73c49afb92b427eff230eb330"/>
<input type="hidden" name="pass" value="1575019988.841-6b8Fv8fEyM"/>
<input type="hidden" id="jschl-answer" name="jschl_answer"/>
</form>

</div>


<div class="attribution">
<a href="https://www.cloudflare.com/5xx-error-landing?utm_source=iuam" target="_blank" style="font-size: 12px;">DDoS protection by Cloudflare</a>
<br>
Ray ID: 53d393f93ae1c82f
</div>
</td>

</tr>
</table>
</body>
</html>

0 comments on commit 3241f7d

Please sign in to comment.