-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_reading.py
319 lines (241 loc) · 11.9 KB
/
data_reading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""
Description
===============================
"""
from __future__ import annotations
from typing import Dict, Tuple, List
import csv
# Default locations of the data files read by this module.
RECIPES_FILE = "data/clean_recipes.csv"
REVIEWS_FILE = "data/reviews.csv"

# Key measurement words to remove from ingredients.
# (Substring-matched against each ingredient by check_remove.)
WORDS_TO_REMOVE = {
    'packages', 'cups', 'cup', 'tablespoons', 'tablespoon', 'teaspoon',
    'teaspoons', 'packets', 'pounds', 'pound', 'inch', 'ounces', 'drops',
    'dashes', 'jar', 'dash', 'envelope', 'container', 'package', 'crushed',
    'ounce', 'cans', 'can', 'loaves', 'bottle', 'packet', 'tube',
    'sheets', 'recipe', 'peeled and chopped', 'bunch',
}

# Key words of ingredients to remove.
# An ingredient that is exactly equal to one of these strings is dropped.
REMOVE_INGREDIENTS1 = {
    'marinate', 'low fat', 'breakfast', 'england', ' 2', 'fry', 'side',
    'low sodium', 'Dry Mix Ingredients', 'kosher for passover', '2',
    'mexico', 'raw', 'drained and mashed', 'without shells', 'bake',
    'peeled and segmented', 'peeled and shredded', 'pan drippings',
    'dessert', 'cubed', '(optional)', 'drained and chopped',
    'Glaze', 'rinsed and dried', 'divided', 'thick circles',
    'washed and cubed', 'mashed', 'fat free', 'Southern Comfort',
    'peeled and julienned', 'lunch', 'chopped', 'stemmed and rinsed',
    's thick', 'y', 'drained and finely chopped', 'top round',
    'julienned', 'cleaned', 'boil', 'calories', 'party', 'gluten', 'Filling',
    't', 'or less Grenadine (', 'ground', 'casings removed', 'chili', 'snow',
    'chill', 'Alcoholic', 'Acesulfame potassium sweetener',
}

# An ingredient that merely contains one of these substrings is dropped.
REMOVE_INGREDIENTS2 = {
    'rinsed and torn', 'and dried', 'rating',
    'peeled and cubed', 'split', 'for topping', 'warmed', '\x99',
}
def read_recipes(file: str) -> Dict[str, list]:
    """Read the given file and return a dictionary with recipe id mapping to
    other attributes of the recipe stored in a list.

    Based upon the clean_recipes.csv file: ';'-delimited, one header row.

    For each data row the first nine columns are stored, in order:
      - column 0 (recipe name): if it contains a doubled single quote (''),
        each pair is collapsed to one and surrounding single quotes are
        stripped; the result is whitespace-stripped.
      - column 7 (ingredients): split on ',' into a set of stripped strings.
      - column 8 (directions): split on '**' into a list of steps, with
        doubled single quotes collapsed in every step.
      - any other column: the whitespace-stripped string.
    The dictionary key is column 9, whitespace-stripped.

    An empty file yields an empty dictionary and blank lines are skipped.

    Raises:
        IndexError: if a non-blank row has fewer than ten columns.
    """
    recipe_dict = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to store.
                continue
            attributes = []  # list containing recipe attributes
            for index, cell in enumerate(row[:9]):
                if index == 0:  # recipe name
                    if "''" in cell:
                        cell = cell.replace("''", "'").strip("'")
                    attributes.append(cell.strip())
                elif index == 7:  # ingredients
                    attributes.append({part.strip() for part in cell.split(',')})
                elif index == 8:  # directions
                    text = cell.strip("'")
                    if not text.endswith("**"):
                        # Guarantee a trailing separator so the final (empty)
                        # piece can always be dropped after splitting.
                        text += "**"
                    attributes.append(
                        [step.replace("''", "'") for step in text.split('**')[:-1]]
                    )
                else:
                    attributes.append(cell.strip())
            recipe_dict[row[9].strip()] = attributes  # remove extra space before assigning
    return recipe_dict
def get_ing_amounts(file: str) -> Dict[str, list]:
    """Read the given file and return a dictionary with recipe id mapping to
    the ingredients and their amounts.

    Based upon the recipes.csv file: ';'-delimited, one header row, with the
    recipe id in column 1 and the '**'-separated ingredient list in column 8.

    Each value is the list of ingredient strings from column 8, with
    surrounding single quotes stripped from the cell and doubled single
    quotes ('') collapsed in every entry. The key is column 1,
    whitespace-stripped.

    An empty file yields an empty dictionary. Rows that do not reach the
    ingredient column (fewer than nine columns) are skipped.
    """
    recipe_dict = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if len(row) < 9:
                # No ingredient column (index 8) to read from this row.
                continue
            all_ings = row[8].strip("'")
            if not all_ings.endswith("**"):
                # Guarantee a trailing separator so the final (empty) piece
                # can always be dropped after splitting.
                all_ings += "**"
            recipe_dict[row[1].strip()] = [
                entry.replace("''", "'") for entry in all_ings.split('**')[:-1]
            ]
    return recipe_dict
def get_ingredients(data: Dict[str, list]) -> set:
    """Collect every ingredient that appears in any recipe of 'data'.

    'data' follows the format returned by 'read_recipes': each value is a
    list whose element at index 7 is that recipe's collection of ingredients.
    The result is the union of all of those collections.
    """
    per_recipe = (attributes[7] for attributes in data.values())
    return set().union(*per_recipe)
def get_review_scores(file: str) -> Dict[str, float]:
    """Return a dictionary of recipe ids mapping to respective user ratings obtained
    from the given file.

    The file must have one header row. Each data row is read with ';' as the
    csv delimiter and its first field is then split on ','; of the resulting
    pieces, piece 0 is the recipe id and piece 2 is the rating.
    NOTE(review): the earlier docstring said this is based upon
    clean_recipes.csv, but the layout parsed here looks like a reviews file
    -- confirm which file callers pass.

    Every rating is truncated to an integer before averaging, and the
    average per recipe id is rounded to one decimal place.

    An empty file yields an empty dictionary and blank lines are skipped.
    """
    totals = {}  # recipe_id: [sum of scores, number of scores]
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to count.
                continue
            fields = row[0].split(",")
            running = totals.setdefault(fields[0], [0, 0])
            running[0] += int(float(fields[2]))
            running[1] += 1
    return {
        recipe_id: round(score_sum / count, 1)
        for recipe_id, (score_sum, count) in totals.items()
    }
def get_reviews(file: str) -> Dict[str, list]:
    """Return a dictionary of recipe ids mapping to respective user reviews obtained
    from the given file.

    Based on reviews.csv file: ';'-delimited, one header row, with the recipe
    id in column 0 and the review text in column 3.

    Only rows with exactly four columns whose review text does not contain
    '...' are kept. For each kept review, doubled single quotes ('') are
    collapsed, surrounding single quotes are stripped, and the text is
    wrapped in double-quote characters. Reviews are listed per recipe id in
    file order.

    An empty file yields an empty dictionary.
    """
    reviews_dict = {}  # recipe_id: [reviews]
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if len(row) != 4 or '...' in row[3]:
                continue
            review = row[3].replace("''", "'").strip("'")
            reviews_dict.setdefault(row[0], []).append('"' + review + '"')
    return reviews_dict
def clean_ingredients(data: Dict[str, list]) -> None:
    """Mutate the provided dictionary by cleaning the ingredients (removing measurements and
    strings that aren't ingredients).

    For every recipe, the ingredient set at index 7 is passed to
    clean_ingredient_set; the entries it reports as unclean are removed and
    its cleaned replacements are added (skipping None, empty strings and the
    known fragments 'y', 'to tast' and 't'). The set is then replaced by a
    new set in which every ingredient has been passed through
    str.capitalize().

    Additionally, ensure the names of all the recipes don't have unnecessary quotations
    surrounding them: a name (index 0) that starts with a single quote has
    its first and last characters dropped. Empty names are left untouched.
    """
    leftover_fragments = {'y', 'to tast', 't'}
    for attributes in data.values():
        ingredients = attributes[7]
        ingredients_to_remove, ingredients_to_add = clean_ingredient_set(ingredients)
        # Removal happens here, after the helper has finished iterating, so
        # the set is never mutated while it is being looped over.
        for ingredient in ingredients_to_remove:
            ingredients.remove(ingredient)
        for ingredient in ingredients_to_add:
            if not ingredient:  # None or '': nothing to add
                continue
            if ingredient[0] == ' ':
                # Ensure the ingredient we are adding does not have an
                # unnecessary space at the beginning.
                ingredient = ingredient[1:]
            # A lone ' ' becomes '' after trimming; never add an empty name.
            if ingredient and ingredient not in leftover_fragments:
                ingredients.add(ingredient)
        # Capitalize all ingredients corresponding to the given recipe.
        attributes[7] = {ingredient.capitalize() for ingredient in ingredients}
        # Ensure the recipe name doesn't have unnecessary quotations around it.
        name = attributes[0]
        if name.startswith("'"):  # startswith is safe for an empty name
            attributes[0] = name[1:-1]
def clean_ingredient_set(ingredients: set) -> Tuple:
    """Helper function for clean_ingredients.

    Return a tuple where the first element is a set of unclean ingredients to remove from the
    original set and the second element is a set of cleaned ingredients to add to the original set.

    The input set is not modified. The second set holds one entry per
    ingredient examined and may therefore contain None (nothing to add) and
    empty strings; the caller is expected to filter those out.

    Each ingredient goes through three stages:
      1. Structural clean-up (first matching rule only): empty strings,
         labels ending in ':', all-uppercase strings and exact members of
         REMOVE_INGREDIENTS1 are removed; text up to a ')' is cut off; a
         leading space is cut off; text up to the last digit is cut off.
      2. Key-word check via check_remove: a REMOVE_INGREDIENTS2 match drops
         the ingredient entirely; otherwise a measurement word is cut off
         together with everything before it.
      3. A trailing 'to taste' (and the character before it) is cut off.
    """
    ingredients_to_remove = set()
    ingredients_to_add = set()

    for ingredient in ingredients:
        new_ingredient = None
        measurement, has_non_food_word = check_remove(ingredient)

        # Stage 1: structural clean-up.
        if ingredient == '':
            # If the ingredient is an empty string, remove it.
            ingredients_to_remove.add(ingredient)
        elif (ingredient[-1] == ':' or ingredient.isupper()
              or ingredient in REMOVE_INGREDIENTS1):
            # If a label has incorrectly been classified as an ingredient or
            # the entire string is not an ingredient, remove it.
            ingredients_to_remove.add(ingredient)
        elif ')' in ingredient:
            # Keep only what follows the first ')' and the character after it.
            # NOTE(review): unlike the other rules, this one does not mark the
            # original for removal on its own -- confirm that is intended.
            new_ingredient = ingredient[ingredient.index(')') + 2:]
        elif ingredient[0] == ' ':
            # If the ingredient has a space at the beginning of its string,
            # remove the space.
            new_ingredient = ingredient[1:]
            ingredients_to_remove.add(ingredient)
        elif any(character.isdigit() for character in ingredient):
            # If there is a number indicating the quantity of an ingredient,
            # remove it (and the space following it). The position of the
            # last digit is used, not the first occurrence of that digit
            # character, so '101 dalmatians' does not become '1 dalmatians'.
            last_digit_index = max(
                position for position, character in enumerate(ingredient)
                if character.isdigit()
            )
            new_ingredient = ingredient[last_digit_index + 2:]
            ingredients_to_remove.add(ingredient)

        # Stage 2: key-word check. This runs regardless of which rule (if
        # any) applied above and works on the original ingredient text.
        if has_non_food_word:
            ingredients_to_remove.add(ingredient)
            new_ingredient = None
        elif ingredient != 'canola oil' and measurement[0]:
            # Finally, check the ingredient doesn't contain any measurement
            # key words. ('canola oil' is exempt: it contains 'can'.)
            word = measurement[1]
            word_end_index = ingredient.index(word) + len(word)
            new_ingredient = ingredient[word_end_index + 1:]
            ingredients_to_remove.add(ingredient)

        # Stage 3: if the substring 'to taste' is present, cut it off along
        # with the character before it. The cleaned text is used when there
        # is one; otherwise the original ingredient, which is then replaced.
        text = ingredient if new_ingredient is None else new_ingredient
        if 'to taste' in text:
            if new_ingredient is None:
                ingredients_to_remove.add(ingredient)
            cut = text.index('to taste')
            # max(..., 0): when 'to taste' starts the text, a bare "cut - 1"
            # would be -1 and slice off only the final character.
            new_ingredient = text[:max(cut - 1, 0)]

        ingredients_to_add.add(new_ingredient)

    return (ingredients_to_remove, ingredients_to_add)
def check_remove(ingredient: str) -> list:
    """Helper function for clean_ingredient_set.

    Return a two-element list describing which key words 'ingredient'
    contains (plain substring matching):

      - Element 0 is a nested list. Its first item is a boolean indicating
        whether any word from WORDS_TO_REMOVE ('measurement words') occurs
        in the ingredient; the matching words follow it. They are ordered
        longest first (ties alphabetically), so index 1 is always the same
        word for the same input and a plural such as 'dashes' is preferred
        over its prefix 'dash'.
      - Element 1 is a boolean that indicates whether there is a word from
        REMOVE_INGREDIENTS2 present in the provided ingredient.

    With no matches at all the result is [[False], False].
    """
    # Sorting makes the result independent of set iteration order, which is
    # not stable between interpreter runs for strings.
    measurement_words = sorted(
        (word for word in WORDS_TO_REMOVE if word in ingredient),
        key=lambda word: (-len(word), word),
    )
    # Entire ingredients are removed if they contain a non-food key word.
    has_non_food_word = any(word in ingredient for word in REMOVE_INGREDIENTS2)
    return [[bool(measurement_words), *measurement_words], has_non_food_word]