-
Notifications
You must be signed in to change notification settings - Fork 2
/
process_sent140.py
32 lines (31 loc) · 963 Bytes
/
process_sent140.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import os
import csv
import re
# Convert the Sentiment140 training CSV into a tab-separated file
# ("sent140.txt") with one "<cleaned sentence>\t<label>" row per tweet.
# Only cleaned sentences shorter than 50 characters are written.
# Label mapping: source label 4 -> 1, source label 2 -> 2, anything else -> 0.
filepath = 'data/sentiment_analysis/training.1600000.processed.noemoticon.csv'
# NOTE(review): the stock Sentiment140 download is often read as latin-1;
# confirm this copy of the file really is UTF-8 encoded.
df = pd.read_csv(filepath, names=['label', 'id', 'date', 'flag', 'user', 'sentence'], sep=',', encoding="utf8")
sentences = df['sentence'].values
labels = df['label'].values

# A single "@mention" at the very start of the tweet. Compiled once outside the
# loop; the raw string avoids the invalid-escape warning that "\w" triggers in
# a normal string literal.
leading_mention = re.compile(r"^@\w+")

# Length of the longest cleaned sentence seen (including ones that are too long
# to be written). Named max_len so the builtin max() is not shadowed.
max_len = 0

# The context manager guarantees the output file is flushed and closed even if
# a row fails part-way through; newline='' is what the csv module expects.
with open("sent140.txt", 'w', newline='', encoding="utf8") as out_file:
    writer = csv.writer(out_file, delimiter='\t')
    for sent, y in zip(sentences, labels):
        # Strip characters that would interfere with the tab-separated output
        # (quotes, embedded tabs) and HTML line breaks, then drop a leading
        # @mention and surrounding whitespace.
        sent = sent.replace('"', "").replace("\t", " ").replace("<br />", " ")
        sent = leading_mention.sub('', sent).strip()

        max_len = max(max_len, len(sent))

        if y == 4:
            print("hit")  # progress/debug trace kept from the original script
            label = 1
        elif y == 2:
            label = 2
        else:
            label = 0

        if len(sent) < 50:
            writer.writerow([sent, label])

print(max_len)