-
Notifications
You must be signed in to change notification settings - Fork 0
/
sample.py
74 lines (63 loc) · 2.24 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from src.textCNN import textCNN
from src.data_helpers import load_data
from src.data_helpers import build_input_data
from sklearn.model_selection import train_test_split
import numpy as np
import jieba
import json
# ---- Training phase: fit a fresh textCNN on the spam corpus ----
print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data('data/Spam/')

# Persist the word->index mapping so the prediction phase below can
# rebuild inputs with the exact same vocabulary.
with open('vocabulary.json', 'w') as f:
    json.dump(vocabulary, f)

classifier = textCNN(
    sequence_length=x.shape[1],
    vocabulary_size=len(vocabulary_inv),
    num_classifier=y.shape[1]
)

# The vocabulary structures are no longer needed in memory once the
# model dimensions are fixed and the JSON copy is on disk.
del vocabulary, vocabulary_inv

classifier.construct_model()
classifier.train(x, y, checkpoint_path='model/textCNN/classification.hdf5', epochs=10)
# predict using model
def load_text_and_label():
    """Load the two evaluation corpora and build their one-hot labels.

    Returns:
        [x_text, y]: ``x_text`` is a list of jieba-tokenised sentences
        (economy samples followed by sports samples); ``y`` is a numpy
        array of one-hot label rows aligned with ``x_text``.
    """
    econ_path = 'data/evaluation_data/econ.txt'
    # BUG FIX: this previously pointed at econ.txt again (copy-paste
    # error), so the "sports" half of the evaluation set was actually
    # economy text. NOTE(review): assumes sports.txt exists alongside
    # econ.txt — confirm against the data directory.
    sports_path = 'data/evaluation_data/sports.txt'
    # Context managers close the file handles promptly (the original
    # open(...).readlines() leaked them).
    with open(econ_path, "r") as fh:
        econ_list = [jieba.lcut(line.strip()) for line in fh]
    with open(sports_path, "r") as fh:
        sports_list = [jieba.lcut(line.strip()) for line in fh]
    x_text = econ_list + sports_list
    # One-hot layout matches the model's 4-class output:
    # economy -> index 0, sports -> index 2 (other classes unused here).
    econ_label = [[1, 0, 0, 0] for _ in econ_list]
    sport_label = [[0, 0, 1, 0] for _ in sports_list]
    y = np.concatenate([econ_label, sport_label], 0)
    return [x_text, y]
def pad_sentences(sentences, padding_word="<PAD/>", sequence_length=None):
    """Pad (or truncate) every sentence to a fixed token length.

    Args:
        sentences: list of token lists.
        padding_word: token appended to sentences shorter than the target.
        sequence_length: target length. Defaults to ``None``, which keeps
            the original behavior of reading the model's input width from
            the module-level training matrix ``x``.

    Returns:
        A new list where every sentence has exactly ``sequence_length``
        tokens; longer sentences are truncated.
    """
    if sequence_length is None:
        # Backward-compatible default: the width the trained model expects.
        sequence_length = x.shape[1]
    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        if num_padding > 0:
            padded_sentences.append(sentence + [padding_word] * num_padding)
        else:
            # Sentence is at or over the limit: clip to the model width.
            padded_sentences.append(sentence[:sequence_length])
    return padded_sentences
# ---- Prediction phase: score the evaluation corpora with the model ----
# Reload the vocabulary persisted during training so inputs are indexed
# with the same mapping the model was trained on.
with open('vocabulary.json', 'r') as fp:
    dictionary = json.load(fp)

corpus, labels = load_text_and_label()
data = pad_sentences(corpus)
x, y = build_input_data(data, labels, dictionary)

# Binarise the model's per-class scores with a 0.5 threshold.
result = np.where(classifier.model.predict(x) > 0.5, 1, 0)

# Accuracy = fraction of rows where the thresholded prediction matches
# the one-hot label exactly.
counter = sum(1 for truth, pred in zip(y, result) if np.array_equal(truth, pred))
accuracy = counter / len(y)
print(accuracy)