-
Notifications
You must be signed in to change notification settings - Fork 5
/
add_tfidf.py
39 lines (32 loc) · 1.22 KB
/
add_tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os.path
# Read the three required command-line arguments: the training CSV path,
# the testing CSV path, and the number of PCA components to produce.
try:
    trainfile = sys.argv[1]
    testfile = sys.argv[2]
    ncomponents = int(sys.argv[3])
except (IndexError, ValueError):
    # IndexError: fewer than three arguments were given.
    # ValueError: NumComponents could not be parsed as an integer.
    # Either way, show the usage line and exit with status 1.
    print("Please specify trainingfile.csv testingfile.csv NumComponents")
    sys.exit(1)
def _decode_essays(essays, encoding="mac-roman"):
    """Return *essays* with byte-string entries decoded using *encoding*.

    Entries that are not byte strings (text that is already decoded, missing
    values) are passed through unchanged, so the result does not depend on
    whether the CSV reader produced bytes or text.
    """
    return essays.map(
        lambda text: text.decode(encoding) if isinstance(text, bytes) else text
    )


def nf(x):
    """Build the output file name ``<base name without extension>_tfidf.csv``.

    Only the base name of *x* is kept, so the returned path carries no
    directory component.
    """
    return os.path.splitext(os.path.basename(x))[0] + "_tfidf.csv"


traindf = pandas.read_csv(trainfile)
testdf = pandas.read_csv(testfile)

# One output column per PCA component: tfidfpca_0 .. tfidfpca_<n-1>.
columns = ["tfidfpca_%s" % x for x in range(ncomponents)]

# The vectorizer is fed decoded text from the "essay" column; the decoded
# copies are separate Series, the data frames themselves are not modified.
trainCleanEssay = _decode_essays(traindf.essay)
testCleanEssay = _decode_essays(testdf.essay)

# Unigram + bigram TF-IDF with English stop words removed.  The vocabulary
# is fitted on the training essays only and reused for the test essays.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
trainvec = vectorizer.fit_transform(trainCleanEssay)
testvec = vectorizer.transform(testCleanEssay)

# Reduce the TF-IDF matrix to ncomponents dimensions.  As with the
# vectorizer, the projection is fitted on the training data only.
# NOTE(review): RandomizedPCA comes from the import block at the top of the
# file; newer scikit-learn releases reportedly no longer provide it --
# confirm the scikit-learn version this script is run with.
pca = RandomizedPCA(n_components=ncomponents)
pca.fit(trainvec)
trainpca = pca.transform(trainvec)
trainpcadf = pandas.DataFrame(trainpca, columns=columns)
testpca = pca.transform(testvec)
testpcadf = pandas.DataFrame(testpca, columns=columns)

# Attach the component columns to the original frames.  combine_first
# aligns on the row index; values already present in the original frame
# take precedence over the PCA frame.
traindf = traindf.combine_first(trainpcadf)
testdf = testdf.combine_first(testpcadf)

# Write the augmented frames next to the current working directory.
traindf.to_csv(nf(trainfile))
testdf.to_csv(nf(testfile))

# Emit the new column names joined by "+" on stdout.
print("+".join(columns))