forked from mattm401/collabortweet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweets2sql.py
executable file
·119 lines (87 loc) · 2.92 KB
/
tweets2sql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python
import sqlite3
import sys
import json
import codecs
import itertools
import random
taskDescPath = sys.argv[1]
sqlitePath = sys.argv[2]
tweetPath = sys.argv[3]
pairCount = None
if ( len(sys.argv) > 4 ):
pairCount = int(sys.argv[4])
taskDesc = {}
with codecs.open(taskDescPath, "r", "utf8") as inFile:
taskDesc = json.load(inFile)
print taskDesc
tweetList = []
# Get the contents of this tweet
def readTweet(tweetObj):
tweetText = None
tweetId = None
if ( "text" in tweetObj ): # Twitter format
tweetText = "%s - %s" % (tweetObj["user"]["screen_name"], tweetObj["text"])
tweetId = tweetObj["id"]
elif ( "body" in tweetObj ): # Gnip Format
tweetText = "%s - %s" % (tweetObj["actor"]["preferredUsername"], tweetObj["body"])
idstr = tweetObj["id"]
tweetId = idstr[idstr.rfind(":")+1:]
return (tweetText, tweetId)
with codecs.open(tweetPath, "r", "utf8") as inFile:
for line in inFile:
tweet = json.loads(line)
(tweetText, tweetId) = readTweet(tweet)
if ( tweetText == None ):
print "Skipping:", line
continue
tweetList.append((tweetText, tweetId))
# Open the sqlite3 file
conn = sqlite3.connect(sqlitePath)
c = conn.cursor()
c.execute("INSERT INTO tasks (taskName, question, taskType) VALUES (:name,:question,:type)",
taskDesc)
taskId = c.lastrowid
print "Task ID:", taskId
elementList = map(lambda x: (taskId, x[0], x[1]), tweetList)
elementIds = []
for elTup in elementList:
c.execute('INSERT INTO elements (taskId, elementText, externalId) VALUES (?,?,?)',
elTup)
elId = c.lastrowid
elementIds.append(elId)
print "Element Count:", len(elementIds)
# Only create pairs if the task type == 1
if ( taskDesc["type"] == 1 ):
# Create the pairs
pairList = None
# If we didn't specify a number of pairs, find all
if ( pairCount == None ):
pairList = itertools.combinations(elementIds, 2)
else: # Otherwise, randomly select k pairs
pairAccum = set()
for eIndex in range(len(elementIds)):
eId = elementIds[eIndex]
startIndex = max(0, eIndex-1)
others = elementIds[:startIndex] + elementIds[eIndex+1:]
# Put the pair in canonical order to avoid duplicates
newPairs = set(map(lambda x: (min(eId, x), max(eId, x)),
random.sample(others, pairCount)))
pairAccum = pairAccum.union(newPairs)
pairList = list(pairAccum)
pairList = [(taskId, x[0], x[1]) for x in pairList]
print "Pair Count:", len(pairList)
c.executemany('INSERT INTO pairs (taskId, leftElement, rightElement) VALUES (?,?,?)',
pairList)
# If we are dealing with a labeling task (type == 2), insert the labels
elif ( taskDesc["type"] == 2 ):
print "Insert labels..."
labelList = map(lambda x: {"taskId": taskId, "labelText": x}, taskDesc["labels"])
print labelList
c.executemany('INSERT INTO labels (taskId, labelText) VALUES (:taskId,:labelText)',
labelList)
# Otherwise, we have an invalid task type
else:
print "ERROR! Task type [" + taskDesc["type"] + "] is not valid!"
conn.commit()
conn.close()