Embedding.py
"""
This module offering word embedding using BERT model
"""
import torch
import Bert
from Parameters import global_param
bert_type=global_param.model_param['bert']
tokenizer,model=Bert.get_bert(bert_type=bert_type)

def Text2tokens(text):
    """
    This function tokenizes the input text.
    :param text: input text
    :return: two lists: the token ids and the segment ids, respectively
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [0] * len(tokenized_text)
    return indexed_tokens, segments_ids
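
# A minimal usage sketch (the exact ids depend on the vocabulary of the
# configured `bert_type`; a WordPiece tokenizer may also split rare words
# into "##" sub-word pieces):
#   ids, segs = Text2tokens("the cat sat")
#   ids  -> [101, id("the"), id("cat"), id("sat"), 102]  # 101/102 are the
#           usual [CLS]/[SEP] ids in BERT vocabularies
#   segs -> [0, 0, 0, 0, 0]                               # single sentence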

def Token2tonsor(token):
    """
    This function converts the tuple (token id list, segment id list)
    into a tuple of torch tensors.
    :param token: tuple (token id list, segment id list)
    :return: tuple of the tokens tensor and the segments tensor, respectively
    """
    indexed_tokens, segments_ids = token[0], token[1]
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    return tokens_tensor, segments_tensors
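
# Both tensors get a leading batch dimension of 1; for the sketch above,
# tokens_tensor.shape == segments_tensors.shape == (1, 5).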

def Bert_Forward(inputs):
    """
    This function forwards the inputs through the BERT layers and returns
    the activity (hidden states) of each layer.
    :param inputs: tuple of BERT inputs (tokens_tensor, segments_tensors)
    :return: list of BERT layer activities
    """
    tokens_tensor, segments_tensors = inputs[0], inputs[1]
    model.eval()
    with torch.no_grad():
        activity_layers, _ = model(tokens_tensor, segments_tensors)
    # Some model configurations return a single tensor instead of a list
    # of per-layer tensors; normalize to a list in that case.
    if isinstance(activity_layers, torch.Tensor):
        activity_layers = [activity_layers]
    return activity_layers
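
# Assuming `bert_type` resolves to a 12-layer bert-base model, the result
# is a list of 12 tensors, each of shape (batch=1, tokens, features=768).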

def Reshap(activity):
    """
    This function reshapes the activity tensor:
    (layers, batches, tokens, features) => (tokens, layers, features)
    :param activity: the activity tensor
    :return: the activity tensor with shape (tokens, layers, features)
    """
    activity = torch.stack(activity, dim=0)    # list of layers -> one tensor
    activity = torch.squeeze(activity, dim=1)  # drop the batch dimension
    activity = activity.permute(1, 0, 2)       # tokens first
    return activity
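
# Continuing the bert-base sketch: 12 tensors of shape (1, 5, 768) become
# one tensor of shape (5, 12, 768), i.e. one (layers, features) matrix per
# token.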

def Sentence_Embedding(text):
    """
    This function computes the activity of the input sentence in the BERT network.
    :param text: input sentence
    :return: activity tensor corresponding to the sentence
    """
    token = Text2tokens(text)
    inputs = Token2tonsor(token)
    activity = Bert_Forward(inputs)
    activity = Reshap(activity)
    return activity
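
# Usage sketch:
#   activity = Sentence_Embedding("the cat sat")
#   activity.shape -> (tokens, layers, features), [CLS] and [SEP] included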

def Features_extraction(text, mode=0):
    """
    This function computes the features corresponding to each token of the
    input text using the BERT model.
    :param text: input sentence
    :param mode: strategy used for the word embedding;
                 mode 0 takes the last-layer activity of the BERT network
                 as the word embedding
    :return: list of feature tensors; the ith item is the feature tensor
             corresponding to the ith token
    """
    sentence_embedding = Sentence_Embedding(text)
    features = []
    if mode == 0:
        for word_embedding in sentence_embedding:
            # word_embedding has shape (layers, features); keep the last layer.
            word_features = word_embedding[-1]
            features.append(word_features)
    return features
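
# With mode 0 and a bert-base model, each item of the returned list is a
# 768-dimensional vector (the last-layer hidden state of one token).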

def Sentence_Features(text, mode=0):
    """
    This function computes the features corresponding to each word of the
    input sentence. In order to obtain one feature tensor per word, the
    feature tensors of the tokens belonging to the same word are averaged.
    :param text: input text
    :param mode: strategy used for the word embedding;
                 mode 0 takes the last-layer activity of the BERT network
                 as the word embedding
    :return: tensor of features; the ith row is the feature tensor
             corresponding to the ith word
    """
    text = text.lower()
    tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    token_features = Features_extraction(text, mode)
    sentence_features = []
    sub_word, nb_tokens = False, 1
    mean = global_param.corpus_param['token_mean']
    for token, features in zip(tokenized_text, token_features):
        if mean:
            if token.startswith("##"):
                # Sub-word piece: accumulate it onto the current word.
                sentence_features[-1] = sentence_features[-1] + features
                nb_tokens += 1
                sub_word = True
            else:
                if sub_word:
                    # The previous word was split; turn its sum into a mean.
                    sentence_features[-1] /= nb_tokens
                sentence_features.append(features)
                sub_word, nb_tokens = False, 1
        else:
            sentence_features.append(features)
    if global_param.corpus_param['post_embadding']:
        # Pad with zero vectors up to the configured sentence length.
        size = sentence_features[0].size()
        for i in range(global_param.corpus_param['padding_size'] - len(sentence_features)):
            sentence_features.append(torch.zeros(size))
    return torch.stack(sentence_features)
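
# Usage sketch (with token_mean enabled): if WordPiece splits a word into
# pieces, e.g. "playing" -> ["play", "##ing"] under some vocabularies, the
# piece vectors are averaged into a single row of the returned tensor.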

def Sentence_1D_Features(text, mode=0):
    """
    This function computes the features corresponding to each word of the
    input sentence and flattens the output (the feature vectors are
    concatenated into one vector, dim=1).
    :param text: input text
    :param mode: strategy used for the word embedding
    :return: one flat feature vector for the whole sentence
    """
    return Sentence_Features(text, mode).view(-1)

def Global_Sentence_Embadding(text, mode):
    """
    This function returns the feature tensor of the first token ([CLS])
    as a global embedding of the whole sentence.
    """
    token_features = Features_extraction(text, mode)
    return token_features[0]

def get_bert_inputs(text):
    """
    This function builds the BERT input ids for the text and locates the
    token spans of the two marked entities. It assumes each entity is
    wrapped in delimiters whose tokenization yields two consecutive tokens
    containing '<' ... '>' for the first entity and '[' ... ']' for the
    second one.
    :param text: input text with entity markers
    :return: tuple (input ids tensor, tensor of the four span indices
             [start1, end1, start2, end2])
    """
    # Text2tokens already adds [CLS]/[SEP]; pass the raw text so the ids
    # line up with the tokenization below.
    token = Text2tokens(text)
    inputs = Token2tonsor(token)[0].view(-1)
    tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    outind = [0, 0, 0, 0]
    j = 0
    v = False   # previous token contained '<'
    u = False   # previous token contained '['
    v1 = False  # previous token contained '>'
    u1 = False  # previous token contained ']'
    for i in tokenized_text:
        # Two consecutive tokens containing '<' (resp. '[') open an entity:
        # remember the index of the first one.
        if '<' in i and v and outind[0] == 0:
            outind[0] = j - 1
        if '[' in i and u and outind[2] == 0:
            outind[2] = j - 1
        v = '<' in i
        u = '[' in i
        # Two consecutive tokens containing '>' (resp. ']') close an entity:
        # remember the index of the second one.
        if '>' in i and v1 and outind[1] == 0:
            outind[1] = j
        if ']' in i and u1 and outind[3] == 0:
            outind[3] = j
        v1 = '>' in i
        u1 = ']' in i
        j += 1
    # If one of the two entities was not found, reuse the span of the other.
    if outind[3] + outind[2] == 0:
        outind[3] = outind[1]
        outind[2] = outind[0]
    if outind[1] + outind[0] == 0:
        outind[1] = outind[3]
        outind[0] = outind[2]
    return inputs, torch.tensor(outind)
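
# Illustrative sketch (assuming entities are marked with doubled delimiters,
# which tokenize into consecutive '<'/'>' and '['/']' pieces):
#   ids, spans = get_bert_inputs("<< john >> works at [[ acme ]]")
#   spans -> token indices [start1, end1, start2, end2] of the two marked
#   spans inside `ids`.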