-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractPosts2LIWC.py
executable file
·117 lines (103 loc) · 2.91 KB
/
extractPosts2LIWC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import string
import sys
import csv
import re
import labelNU
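# Command-line arguments:
#   arg 1: school name (matched as a substring of a line in All/lenPosts.dat)
#   arg 2: LIWC category (one of: sex, religious, death, sad, anger)
#   arg 3: number of matching LIWC words at which a post is tagged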
def myDictFilter(lst, index):
    """Return the unique, non-empty entries of one column, in first-seen order.

    Args:
        lst: Indexable sequence of columns; each column is itself a sequence.
            The first three entries of the selected column are skipped
            (presumably header cells -- confirm against the CSV layout).
        index: 1-based position of the column to read.

    Returns:
        A list of the truthy entries of the column, duplicates removed,
        ordered by first occurrence.

    Raises:
        IndexError: if ``index`` is smaller than 1 or past the last column.
    """
    # Without this guard, index 0 or a negative index would silently wrap
    # around and read a column counted from the end.
    if index < 1:
        raise IndexError("column index is 1-based; got %r" % (index,))

    # Explicit de-duplication keeps the order deterministic on every
    # interpreter version; plain dict ordering is arbitrary on Python 2.
    seen = set()
    unique = []
    for entry in lst[index - 1][3:]:
        if entry and entry not in seen:
            seen.add(entry)
            unique.append(entry)
    return unique
def _load_liwc_categories(path='Excel_Data/LIWCdict.csv'):
    """Read the LIWC dictionary CSV and return a dict of category -> word list.

    The CSV rows are transposed so that each dictionary column can be
    addressed by its 1-based position through myDictFilter().
    """
    # Python 2 needs 'rU' for universal newlines; Python 3 text mode already
    # translates newlines and recent versions reject the 'U' flag.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(path, mode) as f:
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_NONE)
        rows = [[cell.strip() for cell in row] for row in reader]
    # list() keeps the transposed columns indexable where zip() is lazy.
    columns = list(zip(*rows))
    return {
        "sex": myDictFilter(columns, 78),
        "religious": myDictFilter(columns, 100) + myDictFilter(columns, 101),
        "death": myDictFilter(columns, 102),
        "sad": myDictFilter(columns, 50),
        "anger": myDictFilter(columns, 48) + myDictFilter(columns, 49),
    }


def _find_school_record(schoolName, path="All/lenPosts.dat"):
    """Return the first line of *path* that contains *schoolName*, or None."""
    with open(path) as f:
        for line in f:
            if schoolName in line:
                return line
    return None


def _contains_word(word, text):
    """Return True if *word* occurs in *text* bounded by spaces or text edges.

    The word is matched literally (never as a regular expression) and every
    occurrence is examined, not just the first one.
    """
    if not word:
        return False
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        left_ok = start == 0 or text[start - 1] == " "
        right_ok = end == len(text) or text[end] == " "
        if left_ok and right_ok:
            return True
        start = text.find(word, start + 1)
    return False


def _tag_posts(lines, wordSet, threshold):
    """Tag the posts that contain at least *threshold* entries of *wordSet*.

    Args:
        lines: Raw lines of the posts file; the post text is the third
            '|'-separated field of each line.
        wordSet: Dictionary words; a trailing '*' is stripped before matching.
        threshold: Number of matching entries at which a post is tagged.

    Returns:
        A tuple ``(tagged_posts, tagged_words, total_length)``: the tagged
        post texts, for each of them a string of the form " (w1,w2)" listing
        the words that triggered the tag, and the summed character length of
        all post texts.
    """
    tagged_posts = []
    tagged_words = []
    total_length = 0
    for line in lines:
        text = line.split("|")[2]
        total_length += len(text)
        matched = []
        for word in wordSet:
            # NOTE(review): the trailing '*' is dropped but a whole-word match
            # is still required, so it does not act as a prefix wildcard --
            # confirm this is the intended reading of the dictionary.
            stem = word[:-1] if word.endswith("*") else word
            if not _contains_word(stem, text):
                continue
            matched.append(stem)
            if len(matched) == threshold:
                tagged_words.append(" (" + ",".join(matched) + ")")
                tagged_posts.append(text)
                # Further matches cannot change what was recorded.
                break
    return tagged_posts, tagged_words, total_length


def main():
    """Count the posts of one school that contain enough LIWC category words.

    Command-line arguments:
        1. school name, matched as a substring of a line in All/lenPosts.dat
        2. LIWC category: sex, religious, death, sad or anger
        3. number of matching LIWC words at which a post is tagged

    Reads Excel_Data/LIWCdict.csv, All/lenPosts.dat and the posts CSV named by
    the matching lenPosts.dat line, then prints the declared post count, the
    average post length in characters and the number of tagged posts.

    Exits with a message when arguments are missing, the category is unknown
    or no lenPosts.dat line contains the school name.
    """
    if len(sys.argv) < 4:
        sys.exit("usage: %s <schoolName> <category> <wordCount>" % sys.argv[0])
    schoolName = sys.argv[1]
    category = sys.argv[2]
    # Parsed once here instead of on every word match.
    threshold = int(sys.argv[3])

    LIWC = _load_liwc_categories()
    if category not in LIWC:
        sys.exit("unknown LIWC category %r; expected one of: %s"
                 % (category, ", ".join(sorted(LIWC))))

    record = _find_school_record(schoolName)
    if record is None:
        # Previously the last line of the file was silently used instead.
        sys.exit("no entry containing %r in All/lenPosts.dat" % schoolName)
    totalPosts = int(record.split(":")[2])
    postsPath = "All/" + record.rsplit(":", 1)[0] + "posts.csv"
    with open(postsPath) as f:
        data = f.readlines()[1:]  # the first line is skipped

    taggedPost, taggedWords, totalLength = _tag_posts(
        data, LIWC[category], threshold)

    # Only these two categories have a label name; the others used to raise
    # KeyError here. The returned labels fed only disabled debug output, so
    # the call is kept solely in case labelNU.label() has side effects.
    catDict = {"sex": "Sexual", "death": "Death"}
    if category in catDict:
        labelNU.label(catDict[category])

    # Spacing reproduces the output of the original Python 2 print statements.
    print("Total posts is  %d" % totalPosts)
    # Floor division matches Python 2 integer division; guard against a
    # declared count of zero.
    avg = totalLength // totalPosts if totalPosts else 0
    print("Average length of post is  %d  characters" % avg)
    print("Number of LIWC-ed posts in this category %d" % len(taggedPost))
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()