#!/usr/bin/env python
# coding: utf-8
# In[1]:
import gzip
import json
import requests
from io import BytesIO, StringIO
import random
import re
import multiprocessing as mp
import http.client
# In[2]:
def is_mime_html(page):
    # Keep only index records whose MIME type is text/html.
    return page['mime'] == 'text/html'
# In[3]:
def get_pages(domain_key, cc_index, url_filter=is_mime_html, params=None):
    # Query the Common Crawl CDX index for records matching domain_key,
    # keeping only records that pass url_filter.
    params = dict(params or {'output': 'json'})
    params['url'] = domain_key
    resp = requests.get('http://index.commoncrawl.org/CC-MAIN-%s-index' % cc_index, params=params)
    if resp.status_code != 200:
        raise Exception("Searching for this key failed with error code %s" % resp.status_code)
    pages = [json.loads(x) for x in resp.text.strip().split('\n')]
    return [x for x in pages if url_filter(x)]
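# A minimal sketch of an alternative url_filter (an assumption for illustration,
# not part of the original module): Common Crawl CDX records also carry a
# 'status' field, so a filter can require a 200 response in addition to the
# HTML MIME check before the record is fetched.
def is_ok_html(page):
    return page.get('mime') == 'text/html' and page.get('status') == '200'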
# In[4]:
def get_page_from_cc(filename, offset, offset_end):
    # Fetch a single gzipped WARC record from the Common Crawl S3 bucket via an
    # HTTP Range request; return None if the request fails.
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    try:
        resp = requests.get(prefix + filename, headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
        resp.raise_for_status()
    except requests.RequestException:
        return None
    return resp
# In[36]:
def unzip_page(response):
    # Decompress the gzipped WARC record and split the WARC header block from
    # the HTTP response (headers plus HTML body).
    raw_data = BytesIO(response.content)
    f = gzip.GzipFile(fileobj=raw_data)
    warc, http_response = f.read().decode('utf-8', errors='ignore').strip().split('\r\n\r\n', 1)
    return {'warc': warc, 'response': http_response}
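# A minimal sketch (hypothetical helper, not part of the original module) of how
# the 'response' field returned by unzip_page can be split further into the HTTP
# headers and the HTML body, which unzip_page keeps joined.
def split_http_response(response_text):
    # The HTTP message uses a blank line to separate headers from the body.
    headers, _, body = response_text.partition('\r\n\r\n')
    return headers, body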
# In[6]:
def process_page_links(page):
    offset, length = int(page['offset']), int(page['length'])
    offset_end = offset + length - 1
    response = get_page_from_cc(page['filename'], offset, offset_end)
    return {
        "url": page['url'],
        "response": response
    }
# In[7]:
def get_documents(pages, num_jobs):
    with mp.Pool(num_jobs) as pool:
        responses = pool.map(process_page_links, pages)
    return responses
# In[1]:
def unzip_pages(responses):
    output = {}
    for each in responses:
        try:
            output[each['url']] = unzip_page(each['response'])
        except Exception:
            # Failed downloads (response is None) or malformed records map to None.
            output[each['url']] = None
    return output
# In[9]:
def download_pages(domain_key, cc_index, unzip, num_jobs=2, url_filter=is_mime_html, params=None):
    pages = get_pages(domain_key, cc_index, url_filter=url_filter, params=params)
    responses = get_documents(pages, num_jobs)
    if not unzip:
        return responses
    return unzip_pages(responses)
# In[ ]:
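# A minimal usage sketch; the domain key 'example.com/*' and crawl label
# '2019-35' are placeholder assumptions, not values taken from this module.
if __name__ == '__main__':
    docs = download_pages('example.com/*', '2019-35', unzip=True, num_jobs=2)
    for url, record in docs.items():
        if record is not None:
            print(url, len(record['response']))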