-
Notifications
You must be signed in to change notification settings - Fork 0
/
orgfreqs.py
executable file
·83 lines (73 loc) · 2.87 KB
/
orgfreqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml.etree import tostring
from lxml.html.soupparser import fromstring
import requests, time, sys
# Pages scraped by the main loop; all of them live under the same
# section of amatorradiozas.hu and differ only in the trailing slug.
_BASE_URL = 'http://amatorradiozas.hu/frekvencia-kiosztasok/'
urls = [_BASE_URL + slug for slug in (
    'bkv-mahart',
    'autopalyakezelo',
    'biztonsagi-szolgalatok',
    'fovarosi-kozterulet-felugyelet',
    'gaz-es-elektromos-muvek',
    'mav-volan-busz',
    'tuzoltosag',
    'taxik',
)]

# Proxy mapping handed to every requests call: plain-http traffic goes
# through a proxy listening on localhost port 8123.
PROXIES = dict(http='http://localhost:8123/')

# Extra request headers: identify the scraper with its own User-agent.
HEADERS = dict([('User-agent', 'hunfreqdb/0.1')])
def fetch_raw(url, retries=5, ignore=None, params=None):
    """Download *url* through PROXIES and return the response body as text.

    A GET request is sent unless *params* is truthy, in which case a POST
    request carrying *params* is sent instead.

    url     -- address to fetch
    retries -- how many further attempts may follow a connection failure;
               the pause before a retry is 4*(6-retries) seconds, so it
               grows as the retries run out
    ignore  -- iterable of HTTP status codes >= 400 that must not raise
               (502 and 504 are always tolerated); None means no extras
    params  -- optional request parameters; a truthy value selects POST

    Raises ValueError once the connection has failed and no retries remain,
    and requests.exceptions.HTTPError (via raise_for_status) for a status
    code >= 400 that is not tolerated.
    """
    try:
        if params:
            # NOTE(review): the parameters are still handed over as
            # ``params=`` (URL query string), as before; if form data was
            # intended this should become ``data=`` -- confirm with callers.
            response = requests.post(url, params=params,
                                     proxies=PROXIES, headers=HEADERS)
        else:
            response = requests.get(url, proxies=PROXIES, headers=HEADERS)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as e:
        # isinstance, not ==: the caught object is an exception instance and
        # never compares equal to the exception class.
        if isinstance(e, requests.exceptions.Timeout):
            sys.stderr.write('timeout\n')
            # after a timeout allow at most one more attempt
            retries = min(retries, 1)
        if retries <= 0:
            raise ValueError("failed to fetch %s" % url)
        # clamp at zero so a caller asking for more than 6 retries does not
        # pass a negative delay to time.sleep()
        delay = max(0, 4 * (6 - retries))
        sys.stderr.write('wait %s sec and retry\n' % delay)
        time.sleep(delay)
        # Hand back the retried result directly; there is no response object
        # from this attempt to inspect below.
        return fetch_raw(url, retries - 1, ignore=ignore, params=params)

    tolerated = [504, 502] + list(ignore or [])
    if response.status_code >= 400 and response.status_code not in tolerated:
        sys.stderr.write("[!] %d %s\n" % (response.status_code, url))
        response.raise_for_status()
    return response.text
def fetch(url, retries=5, ignore=[], params=None):
    """Fetch *url* with fetch_raw() and return the page parsed by the
    lxml soupparser ``fromstring``.

    All arguments are handed to fetch_raw() unchanged; *ignore* is only
    passed along, never modified here.
    """
    return fromstring(fetch_raw(url, retries, ignore, params))
def unws(txt, spc=u' '):
    """Return *txt* with every run of whitespace replaced by one *spc*.

    Leading and trailing whitespace disappears, because the string is cut
    into whitespace-separated words that are then glued back together.
    """
    words = txt.split()
    return spc.join(words)
def text(node):
    """Return all text found below *node* as one whitespace-normalised string.

    Every text fragment matched by the XPath ``.//text()`` is concatenated
    and the result is cleaned up with unws().
    """
    fragments = node.xpath('.//text()')
    joined = ''.join(fragments)
    return unws(joined)
def parse_freqrange(freq, c1):
    """Convert a frequency given as text plus unit name into integer hertz.

    freq -- the number as a string, with '.' as the decimal separator
    c1   -- unit name: 'hz', 'Khz', 'Mhz' or 'Ghz'; letter case is ignored

    Returns a ``(low, high)`` tuple.  Only single frequencies are handled,
    so both elements are the same value.  Fractions of a hertz are
    truncated.

    Raises ValueError when the unit is unknown or *freq* is not a number.
    """
    # Imported here so the function does not depend on the file's import
    # block being changed.
    from decimal import Decimal, InvalidOperation

    multipliers = {
        'hz': 1,
        'khz': 1000,
        'mhz': 1000000,
        'ghz': 1000000000,
    }
    try:
        scale = multipliers[c1.lower()]
    except KeyError:
        # Fail right away with the offending values instead of continuing
        # with a string multiplier and dying on an unrelated TypeError.
        raise ValueError('bad mag %r %r' % (freq, c1))

    try:
        # Decimal keeps the scaling exact; going through float can land just
        # below the intended integer (1.001 * 1000 -> 1000.9999999999999),
        # which int() would then truncate to the wrong hertz value.
        exact = int(Decimal(freq) * scale)
    except InvalidOperation:
        raise ValueError('bad frequency %r' % (freq,))
    return exact, exact
# Scrape every configured page and write one tab-separated row per
# frequency entry to stdout: low Hz, high Hz, source site, description.
# Progress messages go to stderr so they do not mix with the data.
for url in urls:
    sys.stderr.write('fetching %s\n' % url)
    root = fetch(url)
    for node in root.xpath('//div[@class="postcontent"]/p'):
        # text() already returns whitespace-normalised text
        txt = text(node)
        # only paragraphs ending in a unit name (e.g. "Mhz") are entries
        if not txt.endswith('hz'):
            continue
        words = txt.split()
        # the last two words are the number and its unit; everything
        # before them is the description
        freq, mag = words[-2:]
        txt = ' '.join(words[:-2])
        # data workarounds
        if freq.endswith('-'):
            freq = freq[:-1]
        # presumably a non-numeric placeholder in the source data -- such
        # entries are skipped
        if freq == u'háló':
            continue
        # the site writes decimals with a comma
        mi, mx = parse_freqrange(freq.replace(',', '.'), mag)
        row = u"%s\t%s\t%s\t%s" % (mi, mx, 'amatorradiozas.hu', txt)
        sys.stdout.write(row.encode('utf8') + '\n')