-
Notifications
You must be signed in to change notification settings - Fork 0
/
getAllTokens.py
executable file
·48 lines (40 loc) · 1.33 KB
/
getAllTokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import sys
import os
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter

# Source directory of raw scraped CSVs and the directory whose contents
# mark files as already processed (note: completion is tracked against
# "FCBPosts", not the word-output directories — preserved from original).
SRC_DIR = "/Users/arindam/FCB"
DONE_DIR = "FCBPosts"
# One file is deliberately skipped (presumably malformed — TODO confirm).
SKIP_FILE = "Georgetown University:409149419153181posts.csv"


def _build_stopwords():
    """Return the combined NLTK-English + custom stopword set.

    Built once (the original rebuilt a ~20k-element list per input file)
    and returned as a set so per-token membership tests are O(1).
    """
    custom = {"me", "it's", "&", "and", "i'm", "i'd", "i've", "didn't"}
    # Drop bare numbers 1..10000 and ranking markers "#1:".."#10000:".
    for i in range(1, 10001):
        custom.add(str(i))
        custom.add('#' + str(i) + ':')
    return set(stopwords.words('english')) | custom


def _count_words(lines, col_num, stop, tokenizer):
    """Tokenize column `col_num` of '|'-separated lines and count words.

    Lowercases each field, tokenizes on word characters, discards tokens
    found in `stop`, and returns a Counter of the remaining frequencies.
    """
    counts = Counter()
    for row in lines:
        text = row.split("|")[col_num]
        counts.update(
            tok for tok in tokenizer.tokenize(text.lower())
            if tok not in stop
        )
    return counts


def main():
    """For each unprocessed FCB CSV, write per-file word frequencies.

    Posts use '|'-column 2 and go to FCBPostWords/; comments use column 6
    and go to FCBCommentWords/. Output lines are "word:count".
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stop = _build_stopwords()  # hoisted: invariant across files

    files_done = os.listdir(DONE_DIR)
    all_files = os.listdir(SRC_DIR)
    files = list(set(all_files) - set(files_done))

    for file_name in files:
        print(file_name)
        if ".csv" not in file_name or file_name == SKIP_FILE:
            continue
        if "post" in file_name:
            col_num = 2
            new_path = "FCBPostWords/"
        elif "comment" in file_name:
            col_num = 6
            new_path = "FCBCommentWords/"
        else:
            # BUG FIX: the original fell through here with col_num/new_path
            # left over from the previous iteration (or undefined on the
            # first), silently mis-filing output. Skip such files instead.
            continue
        print(new_path)

        # "with" blocks fix the original's leaked file handles.
        with open(SRC_DIR + "/" + file_name) as src:
            counts = _count_words(src, col_num, stop, tokenizer)

        new_file_name = new_path + file_name.split(":")[0] + "uniqueWords.dat"
        with open(new_file_name, 'w') as out:
            for word, n in counts.items():
                out.write(word + ":" + str(n) + "\n")


if __name__ == "__main__":
    main()