nbc_crawl.py
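"""Crawl the NBC News article archive, starting from the index page at
/id/3719710 and following each per-archive page recursively, and write
every article link found to all_links.csv."""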
import scrapelib
from bs4 import BeautifulSoup
BASE_URL = 'http://www.nbcnews.com'

all_links = set()   # article links collected from every page
visited = set()     # archive pages already crawled, to avoid revisiting
def crawl_full_archive(url):
    """Recursively crawl an archive page, collecting article links."""
    if url in visited:
        print("Count: %d" % len(all_links))
        return
    visited.add(url)
    res = s.urlopen(BASE_URL + url)
    soup = BeautifulSoup(res, 'html.parser')
    print(soup.title.text)
    # Links to article contents
    for th in soup.find_all('div', {'class': 'textHang'}):
        link = th.find('a')
        if link is None:
            continue
        if not link.has_attr('ce') or link['ce'] == 'Highlight1':
            all_links.add(link['href'])
    # Follow links to the other archive pages
    archive = soup.find('div', {'cn': 'Archive'})
    if archive:
        for a in archive.find_all('a'):
            crawl_full_archive(a['href'])
if __name__ == "__main__":
    # Be polite: throttle requests to one per second
    s = scrapelib.Scraper(requests_per_minute=60)

    start_url = BASE_URL + '/id/3719710'
    res = s.urlopen(start_url)
    soup = BeautifulSoup(res, 'html.parser')
    print(soup.title.text)

    # Links to article contents on the index page
    for th in soup.find_all('div', {'class': 'textHang'}):
        link = th.find('a')
        if link is None:
            continue
        if not link.has_attr('ce') or link['ce'] == 'Highlight1':
            all_links.add(link['href'])

    # Links to the full archive pages
    for ts in soup.find_all('div', {'class': 'textSmall'}):
        crawl_full_archive(ts.find('a')['href'])

    print("Total: %d" % len(all_links))

    # One link per line (despite the .csv name, there is only one column)
    with open('all_links.csv', 'w', encoding='utf-8') as f:
        for href in all_links:
            f.write("%s\n" % href)