from __future__ import division
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import cv2

def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA=True):
    batch_size = prediction.size(0)
    stride = inp_dim // prediction.size(2)  # stride of the network
    grid_size = inp_dim // stride  # the grid size differs across the multi-scale predictions
    bbox_attrs = 5 + num_classes  # (x, y, w, h, objectness) + num_classes
    num_anchors = len(anchors)
    # the 'view' method behaves like np.reshape
    prediction = prediction.view(batch_size, bbox_attrs * num_anchors, grid_size * grid_size)
    prediction = prediction.transpose(1, 2).contiguous()
    prediction = prediction.view(batch_size, grid_size * grid_size * num_anchors, bbox_attrs)
    # Rescale the anchor dimensions by a factor of 1/stride,
    # because the network downsampled the input by a factor of 'stride'
    anchors = [(a[0] / stride, a[1] / stride) for a in anchors]

    # ************************************ #
    # Recover the centroid position
    # ************************************ #
    # Apply the transformations to each corresponding element
    prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])  # centroid x
    prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])  # centroid y
    prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])  # objectness score

    # Add to each centroid the indices of the grid cell it came from,
    # to recover its absolute x, y position
    grid = np.arange(grid_size)
    a, b = np.meshgrid(grid, grid)
    # build column vectors of the cell indices
    x_offset = torch.FloatTensor(a).view(-1, 1)
    y_offset = torch.FloatTensor(b).view(-1, 1)
    # move to the GPU
    if CUDA:
        x_offset = x_offset.cuda()
        y_offset = y_offset.cuda()
    # build a list of offsets that can be added directly to the centroids
    x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
    prediction[:, :, :2] = prediction[:, :, :2] + x_y_offset

    # ************************************ #
    # Recover width and height
    # ************************************ #
    anchors = torch.FloatTensor(anchors)
    if CUDA:
        anchors = anchors.cuda()
    anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)
    prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * anchors
    # apply the sigmoid function to the per-class confidences
    prediction[:, :, 5:5 + num_classes] = torch.sigmoid(prediction[:, :, 5:5 + num_classes])
    # rescale the extracted coordinates by a factor of 'stride'
    prediction[:, :, :4] = prediction[:, :, :4] * stride
    return prediction

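# Usage sketch (an added illustration, not part of the original module): for a
# 416x416 input, the 13x13 detection head of YOLOv3 emits a (batch, 255, 13, 13)
# map with 3 anchors and 80 classes; predict_transform flattens it to
# (batch, 13*13*3, 85). The anchor sizes below are the standard COCO YOLOv3
# anchors for that scale, in input-image pixels.
#
#   raw = torch.randn(1, 3 * (5 + 80), 13, 13)
#   boxes = predict_transform(raw, inp_dim=416,
#                             anchors=[(116, 90), (156, 198), (373, 326)],
#                             num_classes=80, CUDA=False)
#   print(boxes.shape)  # torch.Size([1, 507, 85])
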
def write_results(prediction, confidence, num_classes, nms_thresh=0.4):
    """
    @prediction output of the neural network
    @confidence objectness threshold
    @num_classes number of classes the network predicts
    @nms_thresh non-maximum suppression IoU threshold
    @description Based on the objectness confidence and the class scores, returns
    the final prediction of the network, after post-processing with non-maximum
    suppression to keep only the most accurate detections
    """
    # keep only the bounding boxes whose confidence exceeds the threshold
    conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
    prediction = prediction * conf_mask  # if objectness < confidence, objectness == 0
    # convert (center x, center y, width, height) to corner coordinates
    # Example:
    # top-left y = center y - height/2
    # top-left x = center x - width/2
    box_corner = prediction.new(prediction.shape)
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2  # top-left x
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2  # top-left y
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2  # bottom-right x
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2  # bottom-right y
    # replace center x, y, width, height with top-left x, y and bottom-right x, y
    prediction[:, :, :4] = box_corner[:, :, :4]

    batch_size = prediction.size(0)  # number of images
    write = False
    # perform non-maximum suppression image by image (not per batch)
    for ind in range(batch_size):
        image_pred = prediction[ind]  # predictions for image 'ind'
        # ****************************
        # @IMPROVEMENT: I think this could be done earlier rather than
        # at this point, to gain efficiency
        # ****************************
        # keep only the class with the highest confidence
        max_conf, max_conf_index = torch.max(image_pred[:, 5:5 + num_classes], 1)
        max_conf = max_conf.float().unsqueeze(1)
        max_conf_index = max_conf_index.float().unsqueeze(1)
        seq = (image_pred[:, :5], max_conf, max_conf_index)  # gather the info in a tuple
        image_pred = torch.cat(seq, 1)  # concatenate all the values into one tensor
        # drop the bounding boxes with objectness < confidence
        non_zero_ind = torch.nonzero(image_pred[:, 4])
        try:
            # select only the rows where objectness > confidence
            image_pred_ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)
        except:
            continue
        # if there are no detections with objectness > confidence,
        # move on to the next image
        if image_pred_.shape[0] == 0:
            continue
        img_classes = unique(image_pred_[:, -1])  # classes detected in the image

        # ************************************ #
        # NON-MAXIMUM SUPPRESSION
        # See for reference: https://www.youtube.com/watch?v=VAo84c1hQX8
        # ************************************ #
        for cls in img_classes:
            # get the detections for the current class 'cls'
            cls_mask = image_pred_ * (image_pred_[:, -1] == cls).float().unsqueeze(1)
            class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
            image_pred_class = image_pred_[class_mask_ind].view(-1, 7)
            # sort by objectness in descending order
            conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
            image_pred_class = image_pred_class[conf_sort_index]
            idx = image_pred_class.size(0)
            # perform non-maximum suppression
            for i in range(idx):
                # compute the IoU between the box with max confidence and the rest
                try:
                    ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i + 1:])
                except ValueError:
                    break
                except IndexError:
                    break
                # zero out the bounding boxes whose IoU with the current box
                # exceeds the threshold
                iou_mask = (ious < nms_thresh).float().unsqueeze(1)
                image_pred_class[i + 1:] *= iou_mask
                # remove the bounding boxes whose IoU > threshold
                non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze()
                image_pred_class = image_pred_class[non_zero_ind].view(-1, 7)
            # prepend the batch index to every surviving detection
            batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
            seq = batch_ind, image_pred_class
            if not write:
                output = torch.cat(seq, 1)
                write = True
            else:
                out = torch.cat(seq, 1)
                output = torch.cat((output, out))
    try:
        return output
    except NameError:
        # there were no detections at all
        return 0

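# A minimal consumption sketch (assumed names, continuing the predict_transform
# example above; not part of the original file): each row of the returned
# tensor is [batch_index, x1, y1, x2, y2, objectness, class_confidence,
# class_index].
#
#   detections = write_results(boxes, confidence=0.5, num_classes=80, nms_thresh=0.4)
#   if not isinstance(detections, int):  # 0 means no detections
#       for det in detections:
#           print("image {:d}, class {:d}".format(int(det[0]), int(det[-1])))
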
def bbox_iou(box1, box2):
    """
    Compute the IoU between box1 and box2
    """
    # get the corner coordinates of each bounding box
    b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
    b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    # compute the coordinates of the intersection rectangle:
    # max of the top-left corners, min of the bottom-right corners
    int_rect_x1 = torch.max(b1_x1, b2_x1)
    int_rect_y1 = torch.max(b1_y1, b2_y1)
    int_rect_x2 = torch.min(b1_x2, b2_x2)
    int_rect_y2 = torch.min(b1_y2, b2_y2)
    # intersection area = width * height, clamped at 0 when the boxes don't overlap
    int_area = torch.clamp(int_rect_x2 - int_rect_x1 + 1, min=0) * torch.clamp(int_rect_y2 - int_rect_y1 + 1, min=0)
    # union area = area1 + area2 - intersection area
    box1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    box2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
    union_area = box1_area + box2_area - int_area
    # IoU = intersection area / union area
    iou = int_area / union_area
    return iou

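# Worked example (illustrative values, not from the original file): with the
# +1 pixel convention above, a box from (0, 0) to (9, 9) has area 10 * 10 = 100.
# Shifting it by 5 pixels in x gives an intersection of 5 * 10 = 50 and a union
# of 100 + 100 - 50 = 150, so IoU = 50 / 150 = 1/3.
#
#   a = torch.tensor([[0., 0., 9., 9.]])
#   b = torch.tensor([[5., 0., 14., 9.]])
#   print(bbox_iou(a, b))  # tensor([0.3333])
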
def unique(tensor):
    """
    Return a tensor whose elements are the unique classes detected in the
    tensor passed as a parameter
    """
    tensor_np = tensor.cpu().numpy()
    unique_np = np.unique(tensor_np)
    unique_tensor = torch.from_numpy(unique_np)
    tensor_res = tensor.new(unique_tensor.shape)
    tensor_res.copy_(unique_tensor)
    return tensor_res

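# Usage sketch (added illustration): collapse repeated class indices to their
# unique values while keeping the result on the input tensor's device.
#
#   cls = torch.FloatTensor([0., 2., 2., 7., 0.])
#   print(unique(cls))  # tensor([0., 2., 7.])
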
def load_classes(namesfile):
    """
    Load the names of all the classes
    """
    with open(namesfile, 'r') as fp:
        names = fp.read().split('\n')[:-1]
    return names

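# Usage sketch (hypothetical path): a names file holds one class name per line,
# e.g. the standard COCO list.
#
#   classes = load_classes("data/coco.names")  # assumed to exist
#   print(len(classes), classes[0])            # e.g. 80 'person'
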
def resize_pad_image(img, inp_dim):
    """
    @img image to resize (without changing aspect ratio) and pad
    @inp_dim dimensions to resize the image to
    @description resize an image without changing its aspect ratio, using padding
    @return the resized and padded image
    """
    o_width, o_height = img.shape[1], img.shape[0]  # original dimensions
    w, h = inp_dim
    # resize factor that preserves the aspect ratio
    f = min(w / o_width, h / o_height)
    new_w = int(o_width * f)
    new_h = int(o_height * f)
    # resize the image
    resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
    # create a canvas with a grey background (128, 128, 128)
    canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)
    # place the resized image at the center of the canvas
    canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image
    return canvas

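# Usage sketch (hypothetical file name): letterbox a 640x480 BGR frame into a
# 416x416 canvas; the frame is scaled by min(416/640, 416/480) = 0.65 to
# 416x312 and vertically centered between grey bars.
#
#   frame = cv2.imread("example.jpg")           # assumed to exist
#   canvas = resize_pad_image(frame, (416, 416))
#   print(canvas.shape)                         # (416, 416, 3)
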
def prep_image(img, inp_dim):
    """
    @img image to prepare
    @inp_dim input dimension of the network (assumed square)
    @description resize, convert from BGR to RGB, divide by 255,
    rearrange into channels x height x width order, and turn the
    input image into a batched tensor
    """
    img = resize_pad_image(img, (inp_dim, inp_dim))
    img = img[:, :, ::-1].transpose((2, 0, 1)).copy()  # BGR -> RGB, HWC -> CHW
    img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
    return img

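if __name__ == "__main__":
    # End-to-end smoke test (an added sketch under assumed shapes, not part of
    # the original module): fake a single-scale YOLOv3 head output and run the
    # full post-processing pipeline on the CPU.
    torch.manual_seed(0)
    num_classes = 80
    raw = torch.randn(1, 3 * (5 + num_classes), 13, 13)
    boxes = predict_transform(raw, 416,
                              [(116, 90), (156, 198), (373, 326)],
                              num_classes, CUDA=False)
    detections = write_results(boxes, confidence=0.5, num_classes=num_classes)
    # 0 means no detections survived the thresholds
    print(detections if isinstance(detections, int) else detections.shape)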