-
Notifications
You must be signed in to change notification settings - Fork 31
/
proxy-scraper.py
119 lines (105 loc) · 3.61 KB
/
proxy-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python3
import requests
from tqdm import tqdm
import threading
import json
import time
import sys
import re
# Shared accumulator: fetchAndParseProxies() appends "ip:port" strings here,
# and the dispatch loop at the bottom of the script consumes them.
proxies = []
def config():
    """Load settings from ``config.json`` in the current directory.

    Returns:
        The parsed configuration dict, which must contain the keys
        "proxies", "debug", "output" and "timeout" — or None (implicitly)
        after printing an error, if the file is missing, unreadable, not
        valid JSON, or lacks a required key.
    """
    try:
        with open('config.json', 'r', encoding='utf-8') as setting:
            cfg = json.load(setting)
        # The original accessed these keys (and discarded the values) so a
        # missing key fell into the handler; make that validation explicit.
        for key in ('proxies', 'debug', 'output', 'timeout'):
            if key not in cfg:
                raise KeyError(key)
        return cfg
    except (OSError, json.JSONDecodeError, KeyError):
        # Narrowed from a bare except so genuine bugs are not swallowed.
        print('Failed loading "config.json"')
def fetchAndParseProxies(url, custom_regex):
    """Download a proxy list from *url* and append "ip:port" strings to the
    module-level ``proxies`` list.

    The ``%ip%`` / ``%port%`` placeholders in *custom_regex* are expanded to
    capturing groups before matching.  A per-source count is printed; on a
    fetch or regex failure the count is simply 0 instead of raising.
    """
    n = 0
    try:
        proxylist = requests.get(url, timeout=15).text
        # Some JSON APIs emit literal nulls; normalise them so the
        # surrounding fields still match.
        proxylist = proxylist.replace('null', '"N/A"')
        custom_regex = custom_regex.replace(
            '%ip%', r'([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})')
        custom_regex = custom_regex.replace('%port%', '([0-9]{1,5})')
        # findall accepts a pattern string directly; the explicit
        # re.compile wrapper was redundant.
        for proxy in re.findall(custom_regex, proxylist):
            proxies.append(proxy[0] + ":" + proxy[1])
            n += 1
    except (requests.RequestException, re.error):
        # Narrowed from a bare except: only network / pattern failures
        # mean "this source yielded nothing".
        pass
    # Bug fix: the original computed n but only ever printed a hard-coded
    # '0' on failure — successful fetches were never reported.
    sys.stdout.write("{0: >5} proxies fetched from {1}\n".format(n, url))
# Load settings once; later parts of the script read this module-level dict
# (note: this deliberately rebinds the name `config` from function to dict).
config = config()

# (url, regex-template) pairs; %ip% / %port% placeholders are expanded
# into capturing groups by fetchAndParseProxies().
proxysources = [
    ["http://spys.me/proxy.txt","%ip%:%port% "],
    ["http://www.httptunnel.ge/ProxyListForFree.aspx"," target=\"_new\">%ip%:%port%</a>"],
    ["https://www.us-proxy.org/", "<tr><td>%ip%<\\/td><td>%port%<\\/td><td>(.*?){2}<\\/td><td class='hm'>.*?<\\/td><td>.*?<\\/td><td class='hm'>.*?<\\/td><td class='hx'>(.*?)<\\/td><td class='hm'>.*?<\\/td><\\/tr>"],
    ["https://free-proxy-list.net/", "<tr><td>%ip%<\\/td><td>%port%<\\/td><td>(.*?){2}<\\/td><td class='hm'>.*?<\\/td><td>.*?<\\/td><td class='hm'>.*?<\\/td><td class='hx'>(.*?)<\\/td><td class='hm'>.*?<\\/td><\\/tr>"],
    ["https://www.sslproxies.org/", "<tr><td>%ip%<\\/td><td>%port%<\\/td><td>(.*?){2}<\\/td><td class='hm'>.*?<\\/td><td>.*?<\\/td><td class='hm'>.*?<\\/td><td class='hx'>(.*?)<\\/td><td class='hm'>.*?<\\/td><\\/tr>"],
    ["https://www.proxy-list.download/api/v0/get?l=en&t=https", '"IP": "%ip%", "PORT": "%port%",'],
    ["https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=5000&country=all&anonymity=elite&ssl=all", "%ip%:%port%"],
]

# Progress bar over the sources; each fetch appends into `proxies`.
loop = tqdm(total=len(proxysources), position=0, leave=False)
for source_url, source_regex in proxysources:
    loop.set_description('fetching...')
    fetchAndParseProxies(source_url, source_regex)
    loop.update(1)
loop.close()
def save(i, path=None):
    """Append proxy string *i* plus a newline to the output file.

    Args:
        i: the "ip:port" string to persist.
        path: destination file; defaults to ``config['proxies']`` so the
            original single-argument call sites are unchanged.
    """
    if path is None:
        path = config['proxies']
    # Context manager replaces open/write/close: the handle is now closed
    # even if the write raises.
    with open(path, "a", encoding="utf-8") as fh:
        fh.write(i + "\n")
def checker(i):
    """Probe proxy *i* ("ip:port") against the global test ``url``.

    Working proxies are printed, persisted via save() and counted in the
    global ``good``; failures are counted in ``bad``.  ``running`` tracks
    the number of in-flight checker threads for the dispatch loop.

    NOTE(review): the counter updates are unsynchronised across threads;
    under CPython this has been tolerable in practice, but counts may
    drift slightly — confirm whether exact totals matter.
    """
    # The original wrapped these in try/except NameError to lazily
    # initialise the counters, but the module initialises all three to 0
    # before any thread is started, so the fallback was dead code.
    global running, good, bad
    running += 1
    try:
        session = requests.session()
        session.proxies = {
            'http': 'http://' + i,
            'https': 'https://' + i,
        }
        try:
            session.get(url, timeout=config['timeout'])
            print('Valid => ', i)
            save(i)
            good += 1
        except requests.exceptions.RequestException as exc:
            # One handler replaces five near-identical except blocks; the
            # exception class name reproduces the original per-type
            # prefixes (ReadTimeout, ConnectTimeout, ProxyError, SSLError,
            # ConnectionError) and also covers request errors the original
            # missed entirely.
            print(type(exc).__name__ + ' => ', i)
            bad += 1
    finally:
        # Bug fix: the original only decremented on the paths it handled,
        # so an unexpected exception permanently consumed a thread slot.
        running -= 1
print(len(proxies), " Proxies Fetched.")

# Globals read by checker(): the validation endpoint and shared counters.
url = "https://httpbin.org/ip"
good = 0
bad = 0
running = 0
ch = 0
MAX_THREADS = 25  # was `max`, which shadowed the builtin

threads = []
for proxy in proxies:
    # Bug fix: the original `if running < max: start else: sleep` advanced
    # the for-loop either way, silently SKIPPING the current proxy whenever
    # the pool was saturated.  Wait for a free slot instead.
    while running >= MAX_THREADS:
        time.sleep(.1)
    t = threading.Thread(target=checker, args=(proxy,))
    t.start()
    threads.append(t)
    ch += 1

# Join every worker instead of a fixed 3-second sleep, so the final totals
# include all checks no matter how slow the stragglers are.
for t in threads:
    t.join()

print("good : ", good)
print("bad : ", bad)