-
Notifications
You must be signed in to change notification settings - Fork 0
/
yandex-talks.py
56 lines (48 loc) · 1.64 KB
/
yandex-talks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# yandex-talks.py
"""
This script reads all available Yandex talks.
The range of talk numbers searched is hard-coded to 0-1000; you may change
it by editing talks_min and talks_max.
The output is the yandex-talks.csv file, with the number, name and link of
each talk separated by tabs.
"""
__author__ = 'Bulat Zamilov <[email protected]>'
__version__ = '0.1.3'
__date__ = '2013 April 5th'
__copyright__ = 'Copyright (c) 2013 Bulat Zamilov'
__license__ = 'GPLv3'
import urllib.request, urllib.error, urllib.parse
from html.parser import HTMLParser
import codecs
class YandexHTMLParser(HTMLParser):
    """Extract the title of a Yandex talk page.

    The title is the first chunk of text that follows a start tag
    ``<div class="b-talk__title">`` (the ``class`` attribute must match
    exactly). After ``feed()`` it is available, UTF-8 encoded, as
    ``self.title``.
    """

    # True between seeing the title <div> and consuming its first text chunk.
    gotcha = False
    # UTF-8 bytes of the title. Defaults to empty so that reading it on a
    # page without a title <div> does not raise AttributeError.
    title = b''

    def handle_starttag(self, tag, attrs):
        """Arm the parser when the title ``<div>`` opens."""
        if tag == 'div' and ('class', 'b-talk__title') in attrs:
            self.gotcha = True

    def handle_data(self, data):
        """Store the first text chunk after the title ``<div>`` as bytes."""
        if self.gotcha:
            self.title = data.encode()
            # Disarm so later text on the page does not overwrite the title.
            self.gotcha = False
# Range of talk numbers to probe. talks_max is exclusive, so the default
# settings cover talks 0..1000.
talks_min = 0
talks_max = 1001

for i in range(talks_min, talks_max):
    url = "http://events.yandex.ru/talks/" + str(i)

    # Only the network calls can raise URLError, so keep the try body small.
    # The with-statement closes the HTTP response even if reading fails.
    try:
        with urllib.request.urlopen(url) as page:
            final_url = page.url  # URL after any redirects
            html = codecs.decode(page.read(), 'utf8')
    except urllib.error.URLError:
        # The talk could not be fetched; move on to the next number.
        continue

    parser = YandexHTMLParser()
    parser.feed(html)
    # The parser may never have seen a title <div>; do not assume the
    # attribute exists.
    raw_title = getattr(parser, 'title', None)
    parser.close()

    if not raw_title:
        print("Skipping link #%d - %s (no talk title found)" % (i, final_url))
        continue

    title = codecs.decode(raw_title, 'utf8')
    print("Processing link #%d - %s (%s)" % (i, final_url, title))
    log_string = str(i) + '\t' + title + '\t' + final_url + '\n'

    # Append one row per talk. The explicit UTF-8 encoding keeps non-ASCII
    # titles writable regardless of the platform's default locale, and the
    # with-statement guarantees the file is closed.
    with open('yandex-talks.csv', 'a', encoding='utf-8') as log:
        log.write(log_string)