# encoding: utf-8
from collections import Counter
import tensorflow.contrib.keras as kr
import numpy as np
import codecs
import re
import jieba


def read_file(filename):
    """
    Args:
        filename: train_filename, test_filename or val_filename
    Returns:
        two lists, where the first holds the labels and the second holds the contents segmented by jieba
    """
    # keep runs of CJK characters, letters and digits; punctuation acts as a separator
    re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")
    with codecs.open('./data/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    contents, labels = [], []
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                line = line.rstrip()
                assert len(line.split('\t')) == 2
                label, content = line.split('\t')
                labels.append(label)
                blocks = re_han.split(content)
                word = []
                for blk in blocks:
                    if re_han.match(blk):
                        seglist = jieba.lcut(blk)
                        word.extend([w for w in seglist if w not in stopwords])
                contents.append(word)
            except:
                # skip lines that are not in the expected "label\tcontent" format
                pass
    return labels, contents
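
# Illustrative note: read_file() expects one tab-separated "label\tcontent" pair per
# line; the sample line below is made up for illustration.
#   体育\t昨晚的比赛中主队凭借终场前的进球以2比1获胜……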


def build_vocab(filenames, vocab_dir, vocab_size=8000):
    """
    Args:
        filenames: list of train_filename, test_filename and val_filename
        vocab_dir: path of vocab_filename
        vocab_size: number of words kept in the vocabulary
    Returns:
        writes the vocabulary to vocab_dir
    """
    all_data = []
    for filename in filenames:
        _, data_train = read_file(filename)
        for content in data_train:
            all_data.extend(content)
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    words = ['<PAD>'] + list(words)
    with codecs.open(vocab_dir, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')
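
# Illustrative sketch: build an 8000-word vocabulary from the three splits. The data
# file names below are assumptions; only vocab_size=8000 is this module's default.
# build_vocab(['./data/train.txt', './data/test.txt', './data/val.txt'],
#             './data/vocab.txt', vocab_size=8000)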


def read_vocab(vocab_dir):
    """
    Args:
        vocab_dir: path of vocab_filename
    Returns:
        words: a list of vocabulary words
        word_to_id: a dict mapping each word to its id
    """
    words = codecs.open(vocab_dir, 'r', encoding='utf-8').read().strip().split('\n')
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_category():
    """
    Args:
        None
    Returns:
        categories: a list of labels
        cat_to_id: a dict mapping each label to its id
    """
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id


def process_file(filename, word_to_id, cat_to_id, max_length=200):
    """
    Args:
        filename: train_filename, test_filename or val_filename
        word_to_id: obtained from read_vocab()
        cat_to_id: obtained from read_category()
        max_length: maximum allowed sentence length
    Returns:
        x_pad: padded sequences of word ids
        y_pad: one-hot encoded labels
    """
    labels, contents = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length, padding='post', truncating='post')
    y_pad = kr.utils.to_categorical(label_id)
    return x_pad, y_pad
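
# Illustrative sketch: words missing from word_to_id are dropped and every sequence is
# post-padded/truncated to max_length. The paths below are assumptions.
# words, word_to_id = read_vocab('./data/vocab.txt')
# categories, cat_to_id = read_category()
# x_train, y_train = process_file('./data/train.txt', word_to_id, cat_to_id, max_length=200)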


def batch_iter(x, y, batch_size=64):
    """
    Args:
        x: x_pad obtained from process_file()
        y: y_pad obtained from process_file()
    Yields:
        input_x, input_y with at most batch_size samples each
    """
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
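
# Illustrative sketch: one epoch of shuffled mini-batches, with x_train and y_train
# assumed to come from process_file() as sketched above.
# for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
#     ...  # run one training step on (x_batch, y_batch)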


def export_word2vec_vectors(vocab, word2vec_dir, trimmed_filename):
    """
    Args:
        vocab: word_to_id
        word2vec_dir: path of the word vectors trained by word2vec
        trimmed_filename: path of the numpy file the trimmed vectors are written to
    Returns:
        saves the vocabulary vectors to a numpy file
    """
    file_r = codecs.open(word2vec_dir, 'r', encoding='utf-8')
    line = file_r.readline()
    voc_size, vec_dim = map(int, line.split(' '))
    embeddings = np.zeros([len(vocab), vec_dim])
    line = file_r.readline()
    while line:
        try:
            items = line.split(' ')
            word = items[0]
            vec = np.asarray(items[1:], dtype='float32')
            if word in vocab:
                word_idx = vocab[word]
                embeddings[word_idx] = np.asarray(vec)
        except:
            # skip malformed vector lines
            pass
        line = file_r.readline()
    np.savez_compressed(trimmed_filename, embeddings=embeddings)


def get_training_word2vec_vectors(filename):
    """
    Args:
        filename: numpy file
    Returns:
        data["embeddings"]: a matrix of vocabulary vectors
    """
    with np.load(filename) as data:
        return data["embeddings"]
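
# Illustrative sketch: trim a word2vec text file (whose first line is "<voc_size> <vec_dim>")
# down to this vocabulary and load the result back as a numpy matrix; both paths are assumptions.
# export_word2vec_vectors(word_to_id, './data/word2vec.txt', './data/vector_word.npz')
# embeddings = get_training_word2vec_vectors('./data/vector_word.npz')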


def get_sequence_length(x_batch):
    """
    Args:
        x_batch: a batch of input data
    Returns:
        sequence_lengths: a list with the actual (unpadded) length of every sequence in x_batch
    """
    sequence_lengths = []
    for x in x_batch:
        actual_length = np.sum(np.sign(x))
        sequence_lengths.append(actual_length)
    return sequence_lengths
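

if __name__ == '__main__':
    # Minimal end-to-end sketch of how the helpers above fit together. All data file
    # paths here are assumptions; only './data/stopwords.txt' is fixed by read_file().
    filenames = ['./data/train.txt', './data/test.txt', './data/val.txt']
    vocab_dir = './data/vocab.txt'
    build_vocab(filenames, vocab_dir, vocab_size=8000)
    words, word_to_id = read_vocab(vocab_dir)
    categories, cat_to_id = read_category()
    x_train, y_train = process_file(filenames[0], word_to_id, cat_to_id, max_length=200)
    for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
        seq_lengths = get_sequence_length(x_batch)  # actual lengths, e.g. for a dynamic RNN
        break  # one batch is enough to sanity-check the pipeline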