# dataloader.py
import os
import pickle
import random
from collections import defaultdict

import numpy as np
import nltk
import torch
import torch.utils.data as data
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import DataLoader

# file_utils is expected to provide ROOT_DIR and IMAGES_PATH; vocab provides load_vocab.
# os, pickle and nltk are used below and imported explicitly above in case the wildcard
# imports do not re-export them.
from file_utils import *
from vocab import *
# from glove_utils.glove import *


def get_recipe_dataloader(dataset, config_data, in_collate_fn):
    return DataLoader(dataset, batch_size=config_data['dataset']['batch_size'],
                      shuffle=True,
                      # num_workers=config_data['dataset']['num_workers'],
                      collate_fn=in_collate_fn,
                      # pin_memory=True
                      )


def find_img_path(imgid, root=IMAGES_PATH):
    # Images are stored in nested folders named after the first four characters of the image id.
    return os.path.join(root, imgid[0], imgid[1], imgid[2], imgid[3], imgid)
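# Illustrative sketch, not from the original file: for a hypothetical id such as
# "abcd1234.jpg", find_img_path would resolve to <IMAGES_PATH>/a/b/c/d/abcd1234.jpg,
# i.e. the nested directory layout this loader assumes for the image dump.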


def make_same_length(captions):
    # Merge captions (from a tuple of 1D tensors into one 2D tensor), zero-padding on the
    # right and wrapping each caption with the hard-coded indices 1 and 2 (presumably the
    # <start> and <end> entries of the vocabulary).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths) + 2).long()
    for i, cap in enumerate(captions):
        targets[i, 0] = 1
        end = lengths[i] + 1
        targets[i, 1:end] = cap[:end]
        targets[i, end] = 2
    return targets
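# A minimal sketch of the padding behaviour (toy captions, not part of the original file):
# captions of lengths 3 and 5 yield a (2, 7) LongTensor in which each row starts with 1,
# carries 2 at position length+1 and is zero-padded on the right.
#
#   caps = [torch.tensor([7, 8, 9]), torch.tensor([4, 5, 6, 7, 8])]
#   make_same_length(caps)
#   # tensor([[1, 7, 8, 9, 2, 0, 0],
#   #         [1, 4, 5, 6, 7, 8, 2]])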


def collate_fn(input_data):
    """Create mini-batch tensors from a list of dataset items by padding the
    variable-length text fields so they can be stacked into single tensors.
    The default collate_fn cannot be used because variable-length tensors cannot be
    stacked directly.
    See https://pytorch.org/docs/stable/data.html#dataloader-collate-fn for details.
    Args:
        input_data: list of tuples (image, ing_binary, title, ing, ins, ann_id), as
            returned by RecipeDataset.__getitem__.
            - image: torch tensor of shape (3, img_size, img_size).
            - title, ing, ins: torch tensors of variable length.
    Returns:
        images: torch tensor of shape (batch_size, 3, img_size, img_size).
        title, ing, ins: torch tensors of shape (batch_size, padded_length).
        ing_binary: torch tensor of shape (batch_size, n_category).
        ann_id: tuple of annotation ids.
    """
    # Sort a data list by caption length (descending order).
    # TODO: Do we really need to sort data?
    # input_data.sort(key=lambda x: len(x[3]), reverse=True)
    images, ing_binary, title, ing, ins, ann_id = zip(*input_data)
    images = torch.stack(images, 0)
    title = make_same_length(title)
    ing = make_same_length(ing)
    ins = make_same_length(ins)
    ing_binary = torch.stack(ing_binary, 0)
    return images, title, ing_binary, ing, ins, ann_id
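# Illustrative sketch with assumed toy shapes (not from the original file): two items with
# 256x256 images and a 10-entry ingredient vocabulary collate into batched tensors.
#
#   def toy_item(i):
#       return (torch.zeros(3, 256, 256), torch.zeros(10),
#               torch.tensor([1, 5, 2]), torch.tensor([3, 4]),
#               torch.tensor([1, 6, 7, 2]), f'id{i}')
#   images, title, ing_binary, ing, ins, ann_id = collate_fn([toy_item(0), toy_item(1)])
#   # images.shape == (2, 3, 256, 256); title.shape == (2, 5); ing_binary.shape == (2, 10)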


def get_datasets(config_data, binary_ing=True):
    images_root_dir = ROOT_DIR
    img_size = config_data['dataset']['img_size']
    train_file_path = os.path.join(images_root_dir, config_data['dataset']['train_pickle'])
    val_file_path = os.path.join(images_root_dir, config_data['dataset']['val_pickle'])
    test_file_path = os.path.join(images_root_dir, config_data['dataset']['test_pickle'])
    vocab_threshold = config_data['dataset']['vocabulary_threshold']  # TODO
    vocab, ingd_vocab = load_vocab(train_file_path, vocab_threshold)

    train_dataset = RecipeDataset(images_root_dir, train_file_path, vocab, ingd_vocab, img_size, binary_indexing=binary_ing)
    test_dataset = RecipeDataset(images_root_dir, test_file_path, vocab, ingd_vocab, img_size, binary_indexing=binary_ing)
    val_dataset = RecipeDataset(images_root_dir, val_file_path, vocab, ingd_vocab, img_size, binary_indexing=binary_ing)

    train_data_loader = get_recipe_dataloader(train_dataset, config_data, in_collate_fn=collate_fn)
    test_data_loader = get_recipe_dataloader(test_dataset, config_data, in_collate_fn=collate_fn)
    val_data_loader = get_recipe_dataloader(val_dataset, config_data, in_collate_fn=collate_fn)

    return vocab, ingd_vocab, train_data_loader, val_data_loader, test_data_loader, train_dataset, test_dataset, val_dataset


class RecipeDataset(data.Dataset):
    """Recipe dataset for torch.utils.data.DataLoader."""

    def __init__(self, root, pickle_path, vocab, ingd_vocab, img_size, transform=None, binary_indexing=False):
        """Set the paths for images, captions and the vocabulary wrappers.
        Args:
            root: image directory.
            pickle_path: path to the pickled annotation dictionary.
            vocab: vocabulary wrapper for titles and instructions.
            ingd_vocab: vocabulary wrapper for ingredients.
            img_size: size to which images are resized and center-cropped.
            transform: image transformations.
            binary_indexing: if True, also return a multi-hot ingredient vector.
        """
        self.binary_indexing = binary_indexing
        with open(pickle_path, 'rb') as f:
            dictionary = pickle.load(f)

        self.root = root
        self.dict = dictionary
        self.ids = list(dictionary.keys())
        self.max_ingd = 16
        self.vocab = vocab
        self.ingd_vocab = ingd_vocab
        self.n_category = len(self.ingd_vocab)  # total number of ingredients, plus special tokens such as unk and pad

        self.normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.3774, 0.1051, -0.1764], std=[1.1593, 1.1756, 1.1958])  # TODO, might need to change
        ])
        self.resize = transforms.Compose(
            [transforms.Resize(img_size, interpolation=transforms.InterpolationMode.BILINEAR),
             transforms.CenterCrop(img_size)])

    def sentence_to_tensor(self, caption):
        """Given a sentence, convert it to a tensor of word indices wrapped in <start>/<end>."""
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = [self.vocab('<start>')]
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        target = torch.Tensor(caption)
        return target

    def get_raw_data(self, ann_id):
        ''' return raw data (text) and paths to the images '''
        title = self.dict[ann_id]['title'].lower()
        ingridients = self.dict[ann_id]['ingredient_list']
        instructions = ' '.join([i['text'] for i in self.dict[ann_id]['instructions']]).lower()
        img_ids = [i['id'] for i in self.dict[ann_id]['images']]  # a recipe can have multiple images; return all paths
        img_paths = [find_img_path(img_id) for img_id in img_ids]
        return title, ingridients, instructions, img_paths

    def __getitem__(self, index):
        """Return one data item: image, optional multi-hot ingredient vector, title,
        ingredient indices, instruction indices and the annotation id."""
        ing_index_tensor = None
        ann_id = self.ids[index]

        title = self.dict[ann_id]['title'].lower()
        random.shuffle(self.dict[ann_id]['ingredient_list'])  # shuffle since ingredients have no positional dependency
        ingridients = [self.ingd_vocab(i) for i in self.dict[ann_id]['ingredient_list']]
        instructions = ' '.join([i['text'] for i in self.dict[ann_id]['instructions']]).lower()
        img_id = random.choice(self.dict[ann_id]['images'])['id']  # can have multiple images, choose 1

        if self.binary_indexing:
            ing_index_tensor = torch.zeros(self.n_category)
            ing_index_tensor[np.array(ingridients)] = 1

        path = find_img_path(img_id)
        image = Image.open(path).convert('RGB')
        image = self.resize(image)
        image = self.normalize(np.asarray(image))

        # Convert captions (strings) to word ids.
        title_tensor = self.sentence_to_tensor(title)
        ing_tensor = torch.tensor(ingridients)
        ins_tensor = self.sentence_to_tensor(instructions)
        return image, ing_index_tensor, title_tensor, ing_tensor, ins_tensor, ann_id

    def __len__(self):
        return len(self.ids)
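

# A minimal usage sketch, not part of the original file. It assumes a config_data dict
# carrying the keys this module already reads ('batch_size', 'img_size', the three
# *_pickle paths and 'vocabulary_threshold'); the file names and values below are
# placeholders, and the actual project presumably loads an equivalent structure from a
# config file.
if __name__ == '__main__':
    config_data = {
        'dataset': {
            'batch_size': 4,              # assumed value
            'img_size': 256,              # assumed value
            'train_pickle': 'train.pkl',  # assumed file names under ROOT_DIR
            'val_pickle': 'val.pkl',
            'test_pickle': 'test.pkl',
            'vocabulary_threshold': 5,    # assumed value
        }
    }
    vocab, ingd_vocab, train_loader, val_loader, test_loader, *_ = get_datasets(config_data)
    images, title, ing_binary, ing, ins, ann_id = next(iter(train_loader))
    print(images.shape, title.shape, ing_binary.shape, ing.shape, ins.shape)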