This repository has been archived by the owner on Jan 3, 2019. It is now read-only.
forked from alphagov/govuk-lda-tagger
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
56 lines (47 loc) · 1.91 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import csv
import json
import urllib2
import urlparse
def load_links():
    """Return the non-empty values of the second column of the audit CSV.

    Reads ``input/early-years-audit-all-content.csv`` (relative to the
    current working directory), discards the first row as a header, and
    collects column index 1 of every remaining row.

    Rows with fewer than two columns are skipped rather than raising
    ``IndexError``: ``csv.reader`` yields an empty list for a blank line,
    so a stray blank line in the file would otherwise abort the whole load.

    Returns:
        list of str: the second-column values, in file order, excluding
        empty strings.

    Raises:
        IOError/OSError: if the input file cannot be opened.
    """
    with open('input/early-years-audit-all-content.csv', 'r') as f:
        reader = csv.reader(f)
        # skip headers
        next(reader, None)
        return [row[1] for row in reader if len(row) > 1 and row[1] != '']
def load_base_paths():
    """Return the URL path component of every link from load_links().

    Each link is parsed with ``urlparse.urlparse`` and only its ``path``
    attribute is kept; order follows load_links().
    """
    base_paths = []
    for link in load_links():
        parsed = urlparse.urlparse(link)
        base_paths.append(parsed.path)
    return base_paths
def dev_urls():
    """Return one www.dev.gov.uk URL per base path from load_base_paths().

    Each URL is the dev host, followed by the base path, followed by the
    ``?skip_slimmer=1`` query string.
    """
    host = "http://www.dev.gov.uk"
    query = '?skip_slimmer=1'
    urls = []
    for path in load_base_paths():
        urls.append(host + path + query)
    return urls
def save_base_paths():
    """Write every base path to ``output/base_paths_file.csv``, one per line.

    The paths from load_base_paths() are joined with newlines; no trailing
    newline is written. The file is opened in ``'w'`` mode, so any previous
    contents are replaced.

    Raises:
        IOError/OSError: if the output file cannot be opened or written.
    """
    # Build the contents before opening the output: opening with 'w'
    # truncates immediately, so a failure while reading the input would
    # otherwise wipe the previous output file.
    contents = "\n".join(load_base_paths())
    # The with-block closes the handle even if the write raises.
    with open('output/base_paths_file.csv', 'w') as base_paths_file:
        base_paths_file.write(contents)
def download_early_years_content():
    """Fetch ``indexable_content`` for each base path from the GOV.UK search API.

    For every path returned by load_base_paths(), requests
    ``https://www.gov.uk/api/search.json`` filtered by that link and asking
    for the ``indexable_content`` field. Only the first search result is
    inspected; paths with no results, or whose first result lacks
    ``indexable_content``, are left out of the returned mapping.

    Returns:
        dict: base path -> ``indexable_content`` value of the first result.

    Raises:
        Errors from ``urllib2.urlopen`` and ``json.loads`` are not caught
        and propagate to the caller, as does ``KeyError`` if the response
        has no ``results`` key.
    """
    content = {}
    for base_path in load_base_paths():
        # Parenthesised single-argument form prints the same text under
        # the Python 2 print statement.
        print("Fetching content for " + base_path)
        url = 'https://www.gov.uk/api/search.json?filter_link={}&fields=indexable_content'.format(base_path)
        response = urllib2.urlopen(url)
        try:
            json_string = response.read()
        finally:
            # Close explicitly so a long list of paths does not accumulate
            # open connections while waiting for garbage collection.
            response.close()
        results = json.loads(json_string)['results']
        if not results:
            continue
        result = results[0]
        if 'indexable_content' in result:
            content[base_path] = result['indexable_content']
    return content
def download_early_years_title_description():
    """Fetch title and description for each base path from the GOV.UK search API.

    For every path returned by load_base_paths(), requests
    ``https://www.gov.uk/api/search.json`` filtered by that link and asking
    for the ``title`` and ``description`` fields. Only the first search
    result is inspected; a path is included only when that result contains
    both fields.

    Returns:
        dict: base path -> ``title + ' ' + description`` of the first result.

    Raises:
        Errors from ``urllib2.urlopen`` and ``json.loads`` are not caught
        and propagate to the caller, as does ``KeyError`` if the response
        has no ``results`` key.
    """
    content = {}
    for base_path in load_base_paths():
        # Parenthesised single-argument form prints the same text under
        # the Python 2 print statement.
        print("Fetching content for " + base_path)
        url = 'https://www.gov.uk/api/search.json?filter_link={}&fields=title,description'.format(base_path)
        response = urllib2.urlopen(url)
        try:
            json_string = response.read()
        finally:
            # Close explicitly so a long list of paths does not accumulate
            # open connections while waiting for garbage collection.
            response.close()
        results = json.loads(json_string)['results']
        if not results:
            continue
        result = results[0]
        if all(field in result for field in ['title', 'description']):
            content[base_path] = result['title'] + ' ' + result['description']
    return content