-
Notifications
You must be signed in to change notification settings - Fork 1
/
mapreduce.py
63 lines (57 loc) · 1.72 KB
/
mapreduce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import numpy as np
# --- Map phase -------------------------------------------------------------
# Read the CSV and emit one (word, 1) pair for every comma-separated field.

# Use a context manager so the file handle is closed as soon as it is read.
with open('mushrooms1.csv') as book:
    lines = book.readlines()

# Each input line with leading/trailing whitespace (incl. the newline) removed.
lines1 = [line.strip() for line in lines]

# Every comma-separated field of every line, in file order.
word1 = [word for line in lines1 for word in line.split(',')]

# Every emitted word carries a count of 1.
count1 = np.ones(len(word1), dtype=np.int8)
worda1 = np.array(word1, dtype=None)

# Pair each word with its count as the rows of an (n, 2) array.
# NOTE: np.stack needs one common dtype for both inputs, so when the words are
# strings the counts are stored as text too; consumers must convert them back
# with int() before doing arithmetic.
output_map = np.stack((worda1, count1), axis=1)
print(output_map)
# --- Shuffle / sort --------------------------------------------------------
# Sorting by word places all pairs for the same word next to each other, which
# is what lets the reducer below work in a single pass.
df = pd.DataFrame(data=output_map, columns=['word', 'count'])
df = df.sort_values('word')

# --- Reduce phase ----------------------------------------------------------
# Sum the counts of each run of identical words.
word_new = None   # word whose run is currently being accumulated
count_new = 0     # running total for word_new
word_reduce = []
count_reduce = []
for key, raw_count in zip(df['word'], df['count']):
    # Counts may arrive as text (the map output shares one dtype with the
    # words), so convert explicitly before adding.
    iteration_count = int(raw_count)
    if key == word_new:
        count_new += iteration_count
        continue
    # A new word starts here: emit the finished run first. The None guard
    # stops a bogus (None, 0) entry from being emitted before the first word.
    if word_new is not None:
        word_reduce.append(word_new)
        count_reduce.append(count_new)
    word_new = key
    count_new = iteration_count

# Flush the final run; without this the last word would be dropped.
if word_new is not None:
    word_reduce.append(word_new)
    count_reduce.append(count_new)

# int64 rather than int8: int8 tops out at 127, far too small for word totals.
count_reduce = np.array(count_reduce, dtype=np.int64)
# Keep the words as an object array so stacking them with the totals yields an
# object array in which the counts stay numeric instead of becoming strings.
word_reduce = np.array(word_reduce, dtype=object)
output_reduce = np.stack((word_reduce, count_reduce), axis=1)
print(output_reduce)