-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_and_combine_stories.py
69 lines (61 loc) · 2.66 KB
/
filter_and_combine_stories.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from os import listdir, makedirs
from os.path import isfile, join, exists
import json
import bson
import csv
# This program aggregates all of the JSON story files emitted by Climate_tree_scraper.py into a single
# JSON file without duplicate links. If duplicate links are found, the place_ids, solution, sector, and
# strategy values of the duplicates are merged into de-duplicated JSON arrays. If a story with no title
# is found, one is constructed from the place name and the solution.
# Directory holding the per-story JSON files written by the scraper (input)
jsonDir = './output/'
# Directory that receives the combined stories.json (main() creates it if it does not exist)
outDir = './filtered_stories/'
# Accumulator mapping each story hyperlink to its (possibly merged) story dict;
# the final JSON list is built from its values
urlDict = {}
def parsePlaceCSV(filename):
    """Load the place-id to place-name lookup table from a CSV file.

    The file must have a header row containing at least the columns ``id``
    and ``place``. Values are returned exactly as read, i.e. as strings.

    Args:
        filename: Path of the UTF-8 encoded CSV file.

    Returns:
        A dict mapping each row's ``id`` to its ``place``. If an id occurs
        more than once, the last row wins.

    Raises:
        KeyError: If a row lacks the ``id`` or ``place`` column.
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module asks for: it lets the reader see the
    # raw line endings so quoted fields containing line breaks stay intact.
    with open(filename, mode='r', encoding='utf8', newline='') as csvfile:
        return {row['id']: row['place'] for row in csv.DictReader(csvfile)}
def getFiles(path):
    """Return the names of the regular files directly inside ``path``.

    Sub-directories are skipped and the listing is not recursive. The names
    are returned sorted so that callers process files in a reproducible
    order regardless of the order the filesystem happens to report them in.

    Args:
        path: Directory to list.

    Returns:
        A sorted list of bare file names (not joined with ``path``).
    """
    return sorted(f for f in listdir(path) if isfile(join(path, f)))
def getJson(path):
    """Parse the JSON document stored at ``path`` and return the result.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, matching how the rest of this script opens text files.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, encoding='utf-8') as data:
        return json.load(data)
def addToUrlDict(story, placeIdToNameDict, urlIndex=None):
    """Index ``story`` by its hyperlink, merging it into an earlier duplicate.

    The story is always stamped with ``posted_by = "ClimateTree"``.

    * First time a hyperlink is seen: the whole story dict is stored. If its
      ``story_title`` is missing or ``None``, a title is built by
      concatenating the name of its first place and its first solution.
    * Hyperlink already indexed: only ``place_ids``, ``strategy``, ``sector``
      and ``solution`` are merged into the stored story. Values are
      de-duplicated and kept in first-seen order (stored values first), so
      the output is reproducible from run to run.

    Args:
        story: Story dict; mutated in place. Must contain ``hyperlink`` and
            the four list fields named above.
        placeIdToNameDict: Mapping from place id to place name, used only
            when a title has to be constructed.
        urlIndex: Dict to accumulate into. Defaults to the module-level
            ``urlDict``.

    Raises:
        KeyError: If a required field is missing, or the first place id is
            not in ``placeIdToNameDict`` when a title must be built.
        IndexError: If ``place_ids`` or ``solution`` is empty when a title
            must be built.
    """
    if urlIndex is None:
        # Two-argument callers keep writing to the shared module-level dict.
        urlIndex = urlDict

    story["posted_by"] = "ClimateTree"
    link = story["hyperlink"]

    if link not in urlIndex:
        # First sighting of this link: keep the whole story.
        if story.get("story_title") is None:
            story["story_title"] = placeIdToNameDict[story["place_ids"][0]] + story["solution"][0]
        urlIndex[link] = story
        return

    # Duplicate link: fold the list-valued fields into the stored story.
    indexed = urlIndex[link]
    for field in ("place_ids", "strategy", "sector", "solution"):
        # dict.fromkeys drops repeats while preserving insertion order,
        # unlike a set, whose iteration order is arbitrary.
        indexed[field] = list(dict.fromkeys([*indexed[field], *story[field]]))
def writeToJson(obj):
    """Serialise ``obj`` as JSON into ``stories.json`` inside ``outDir``.

    Any existing ``stories.json`` is overwritten. The directory itself must
    already exist when this is called.

    Args:
        obj: JSON-serialisable value to write.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``obj`` contains a value ``json.dump`` cannot serialise.
    """
    # join() builds the right path whether or not outDir ends with a
    # separator; plain "w" is enough because the file is never read back.
    with open(join(outDir, "stories.json"), "w", encoding='utf-8') as f:
        json.dump(obj, f)
def main():
    """Combine the scraped story files into one de-duplicated ``stories.json``.

    Steps:
      1. Make sure the output directory ``outDir`` exists.
      2. Load the place-id to place-name table from ``place_name_id.csv``
         (resolved relative to the current working directory).
      3. Feed the first element of every JSON file in ``jsonDir`` to
         ``addToUrlDict``, which merges stories sharing a hyperlink.
      4. Convert every story's ``place_ids`` from strings to ints.
      5. Write the resulting list with ``writeToJson``.

    Raises:
        ValueError: If a place id cannot be converted to ``int``.
        OSError: If an input file or the output location is inaccessible.
    """
    # Create the directory that outDir actually names (not a hard-coded copy
    # of it); exist_ok avoids a separate, racy existence check.
    makedirs(outDir, exist_ok=True)

    placeIdToNameDict = parsePlaceCSV("place_name_id.csv")
    for file in getFiles(jsonDir):
        # Only element [0] of each file's top-level JSON value is used.
        # NOTE(review): presumably each scraper file is a one-story array — confirm.
        addToUrlDict(getJson(join(jsonDir, file))[0], placeIdToNameDict)

    finalJson = list(urlDict.values())
    for data in finalJson:
        # Place ids are handled as strings up to here; the output wants ints.
        data["place_ids"] = [int(i) for i in data["place_ids"]]
    writeToJson(finalJson)
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse or test its helpers) does not read or write any files.
if __name__ == "__main__":
    main()