# main.mojo
from algorithm import sum
from math import sqrt
from max import engine
from max.engine import InferenceSession, InputSpec, Model
from max.graph import Graph, TensorType, Type, ops
from max.tensor import Tensor, TensorShape
from memory import memcpy
from python import Python
from time import now
from utils.index import Index
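
# Benchmark: multi-head attention and layer normalization built from MAX
# graphs in Mojo, timed against PyTorch reference implementations supplied by
# the companion Python module (main.py).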
alias batch_size = 1
alias d_model = 516  # embedding dimension; must be divisible by num_heads
alias num_heads = 12
alias sequence_length = 64
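
# Return a typed pointer to the data buffer of a NumPy array, read from its
# __array_interface__. The pointer is only valid while the array is alive.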
@always_inline
fn numpy_data_pointer[
type: DType
](numpy_array: PythonObject) raises -> DTypePointer[type]:
return DTypePointer[type](
address=int(numpy_array.__array_interface__["data"][0])
)
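
# Copy a Mojo tensor into a freshly allocated NumPy float32 array of the same shape.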
@always_inline
fn tensor_to_numpy[
type: DType
](tensor: Tensor[type]) raises -> PythonObject:
var np = Python.import_module("numpy")
var tensor_shape = tensor.shape()
var tensor_rank = tensor.rank()
var python_list = Python.evaluate("list()")
for i in range(tensor_rank):
_ = python_list.append(tensor_shape[i])
    var numpy_array: PythonObject = np.zeros(python_list, dtype=np.float32)
var dst = numpy_data_pointer[type](numpy_array)
var src = tensor.unsafe_ptr()
var length = tensor.num_elements()
memcpy(dst, src, length)
return numpy_array
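
# Copy a NumPy float32 array into a freshly allocated Mojo tensor of the same shape.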
@always_inline
fn numpy_to_tensor(numpy_array: PythonObject) raises -> Tensor[DType.float32]:
var tensor_shape = numpy_array.shape
var tensor_rank = len(numpy_array.shape)
var shape_list: List[Int] = List[Int]()
for i in range(tensor_rank):
shape_list.append(tensor_shape[i].__int__())
    var tensor = Tensor[DType.float32](shape_list)
var src = numpy_data_pointer[DType.float32](numpy_array)
var dst = tensor.unsafe_ptr()
var length = tensor.num_elements()
memcpy(dst, src, length)
return tensor
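
# Compute one linear projection of multi-head attention (mat @ weight^T + biases)
# and rearrange the result from (seq, batch, embed) into
# (num_heads * batch, seq, head_dim) so the per-head matmuls can be batched.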
fn KQV_calculation(mat: Tensor[DType.float32], weight: Tensor[DType.float32], biases: Tensor[DType.float32],
                   num_heads: Int, head_dim: Int, multiplication: Model, transpose: Model, addition: Model,
                   transpose_12: Model, transpose_01: Model) raises -> Tensor[DType.float32]:
    # Linear projection: mat @ weight^T + biases.
    var results = transpose.execute("input0", weight)
    var Q_T = results.get[DType.float32]("output0")
    results = multiplication.execute("input0", mat, "input1", Q_T)
    var Q = results.get[DType.float32]("output0")
    results = addition.execute("input0", Q, "input1", biases)
    var Q_B = results.get[DType.float32]("output0")
    # Infer the sequence dimension and split the embedding into per-head slices:
    # (seq, batch, embed) -> (seq, batch, num_heads, head_dim).
    var known_product = batch_size * num_heads * head_dim
    var inferred_dimension = Q_B.num_elements() // known_product
    var t: TensorShape = (inferred_dimension, batch_size, num_heads, head_dim)
    Q_B = Q_B.reshape(t)
    # Rearrange to (num_heads * batch, seq, head_dim) for the batched per-head matmuls.
    results = transpose_12.execute("input0", Q_B)
    Q = results.get[DType.float32]("output0")
    t = (inferred_dimension, batch_size * num_heads, head_dim)
    Q = Q.reshape(t)
    results = transpose_01.execute("input0", Q)
    Q = results.get[DType.float32]("output0")
    return Q
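
# A fully connected layer (output = input @ weights + biases) executed with
# precompiled MAX graphs. Defined here but not used by main() below.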
struct Linear:
var weights: Tensor[DType.float32]
var biases: Tensor[DType.float32]
fn __init__(inout self, W: Tensor[DType.float32], B: Tensor[DType.float32]):
self.weights = W
self.biases = B
    fn forward(self, input: Tensor[DType.float32], multiplication_3D: Model, addition: Model) raises -> Tensor[DType.float32]:
var results = multiplication_3D.execute("input0", input, "input1", self.weights)
var output = results.get[DType.float32]("output0")
results = addition.execute("input0", output, "input1", self.biases)
output = results.get[DType.float32]("output0")
return output
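
# Layer normalization with learnable scale (gamma) and shift (beta), executed
# with the precompiled MAX layer_norm graph.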
struct LayerNorm:
    var gamma: Tensor[DType.float32]
    var beta: Tensor[DType.float32]
    fn __init__(inout self, gamma: Tensor[DType.float32], beta: Tensor[DType.float32]):
        self.gamma = gamma
        self.beta = beta
    fn forward(self, input: Tensor[DType.float32], norm: Model) raises -> Tensor[DType.float32]:
        var results = norm.execute("input0", input, "input1", self.gamma, "input2", self.beta)
        var output = results.get[DType.float32]("output0")
        return output
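
# Multi-head self-attention assembled from precompiled MAX graphs: Q/K/V
# projections, scaled dot-product attention with a row-wise softmax, and an
# output projection.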
struct Attention:
var W_K: Tensor[DType.float32]
var W_Q: Tensor[DType.float32]
var W_V: Tensor[DType.float32]
var W_O: Tensor[DType.float32]
var b_K: Tensor[DType.float32]
var b_Q: Tensor[DType.float32]
var b_V: Tensor[DType.float32]
var b_O: Tensor[DType.float32]
var embed_dim: Int
var num_heads: Int
var head_dim: Int
    fn __init__(inout self, W_K: Tensor[DType.float32], W_Q: Tensor[DType.float32], W_V: Tensor[DType.float32],
                W_O: Tensor[DType.float32], b_K: Tensor[DType.float32], b_Q: Tensor[DType.float32],
                b_V: Tensor[DType.float32], b_O: Tensor[DType.float32], embed_dim: Int, num_heads: Int):
self.W_K = W_K
self.W_Q = W_Q
self.W_V = W_V
self.W_O = W_O
self.b_K = b_K
self.b_Q = b_Q
self.b_V = b_V
self.b_O = b_O
self.embed_dim = embed_dim
self.num_heads = num_heads
self.head_dim = embed_dim // num_heads
    fn forward(self, query: Tensor[DType.float32], key: Tensor[DType.float32], value: Tensor[DType.float32],
               multiplication: Model, transpose: Model, addition: Model, transpose_12: Model, transpose_01: Model,
               transpose_21: Model, multiplication_3D: Model, division: Model, softmax: Model) raises -> Tensor[DType.float32]:
        ######### Calculate the linear projections of Q, K, V and split them into num_heads heads #########
var Q = KQV_calculation(query, self.W_Q, self.b_Q, self.num_heads, self.head_dim, multiplication, transpose, addition,
transpose_12, transpose_01)
var K = KQV_calculation(key, self.W_K, self.b_K, self.num_heads, self.head_dim, multiplication, transpose, addition,
transpose_12, transpose_01)
var V = KQV_calculation(value, self.W_V, self.b_V, self.num_heads, self.head_dim, multiplication, transpose, addition,
transpose_12, transpose_01)
        var inferred_dim = Q.num_elements() // (batch_size * self.num_heads * self.head_dim)
######### Compute attention scores #########
var results = transpose_21.execute("input0", K)
K = results.get[DType.float32]("output0")
results = multiplication_3D.execute("input0", Q, "input1", K)
var attn_scores = results.get[DType.float32]("output0")
        # Scale by sqrt(head_dim). Convert to Float32 first so we take a true
        # floating-point square root rather than an integer one.
        var divisor = Tensor[DType.float32](1)
        divisor[0] = sqrt(Float32(self.head_dim))
results = division.execute("input0", attn_scores, "input1", divisor)
attn_scores = results.get[DType.float32]("output0")
        ######### Apply softmax row by row to get the attention weights #########
        # Note: this assumes the last dimension of attn_scores equals
        # sequence_length, since the SIMD loads/stores below use that width.
        var attn_weights = Tensor[DType.float32](attn_scores.shape())
        for i in range(attn_scores.shape()[0]):
            for j in range(attn_scores.shape()[1]):
                # Extract the last dimension for the current (i, j) slice.
                var row = Tensor[DType.float32](attn_scores.shape()[2])
                row.store(0, attn_scores.load[width=sequence_length](i, j, 0))
                var results = softmax.execute("input0", row)
                var softmaxed_row = results.get[DType.float32]("output0")
                attn_weights.store(Index(i, j, 0), softmaxed_row.load[width=sequence_length](0))
######### Multiply the attention weights with the value projections #########
results = multiplication_3D.execute("input0", attn_weights, "input1", V)
var attn_output = results.get[DType.float32]("output0")
######### Reshape back to original dimensions #########
results = transpose_01.execute("input0", attn_output)
attn_output = results.get[DType.float32]("output0")
        var t: TensorShape = (inferred_dim, batch_size, self.embed_dim)
attn_output = attn_output.reshape(t)
######### Apply the output linear projection using matmul and addition #########
results = transpose.execute("input0", self.W_O)
var WO_T = results.get[DType.float32]("output0")
results = multiplication.execute("input0", attn_output, "input1", WO_T)
var almost = results.get[DType.float32]("output0")
results = addition.execute("input0", almost, "input1", self.b_O)
var output = results.get[DType.float32]("output0")
return output
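
# Build and compile the MAX graphs, then benchmark 144 attention layers and 144
# layer norms against the PyTorch reference implementations from main.py.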
fn main() raises:
######### Creating all the graphs for computation #########
print("Compiling Graphs")
var graph = Graph(in_types=List[Type](TensorType(DType.float32, "a","m", "n"), TensorType(DType.float32, "n","x")))
var out = graph[0] @ graph[1]
graph.output(out)
graph.verify()
var session = engine.InferenceSession()
var multiplication = session.load(graph)
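    # graph1: 2D transpose (used to transpose weight matrices).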
var graph1 = Graph(in_types=List[Type](TensorType(DType.float32, "a","m")))
var transposed = ops.transpose(graph1[0],-1,-2)
graph1.output(transposed)
graph1.verify()
var transpose = session.load(graph1)
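    # graph2: broadcast addition of a 1D bias over the last dimension.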
var graph2 = Graph(in_types=List[Type](TensorType(DType.float32, "a","m", "n"), TensorType(DType.float32, "n")))
var out2 = graph2[0] + graph2[1]
graph2.output(out2)
graph2.verify()
var addition = session.load(graph2)
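    # graph3: 4D transpose swapping dims 1 and 2 (moves the head axis ahead of batch).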
var graph3 = Graph(in_types=List[Type](TensorType(DType.float32, "a", "b", "c", "d")))
transposed = ops.transpose(graph3[0],1,2)
graph3.output(transposed)
graph3.verify()
var transpose_12 = session.load(graph3)
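    # graph4: 3D transpose swapping dims 0 and 1.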
var graph4 = Graph(in_types=List[Type](TensorType(DType.float32, "a", "b", "c")))
transposed = ops.transpose(graph4[0],0,1)
graph4.output(transposed)
graph4.verify()
var transpose_01 = session.load(graph4)
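    # graph5: 3D transpose swapping the last two dims (produces K^T for the attention scores).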
var graph5 = Graph(in_types=List[Type](TensorType(DType.float32, "a", "b", "c")))
transposed = ops.transpose(graph5[0],-2,-1)
graph5.output(transposed)
graph5.verify()
var transpose_21 = session.load(graph5)
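    # graph6: batched 3D matmul, (a, m, n) @ (a, n, x).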
var graph6 = Graph(in_types=List[Type](TensorType(DType.float32, "a","m", "n"), TensorType(DType.float32, "a","n", "x")))
var out6 = graph6[0] @ graph6[1]
graph6.output(out6)
graph6.verify()
var multiplication_3D = session.load(graph6)
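    # graph7: elementwise division of a tensor by a scalar.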
var graph7 = Graph(in_types=List[Type](TensorType(DType.float32, "a","m","n"), TensorType(DType.float32)))
var div = graph7[0] / graph7[1]
graph7.output(div)
graph7.verify()
var division = session.load(graph7)
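    # graph8: softmax over a 1D tensor.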
var graph8 = Graph(in_types=List[Type](TensorType(DType.float32, "a")))
var softmaxed = ops.softmax(graph8[0])
graph8.output(softmaxed)
graph8.verify()
var softmax = session.load(graph8)
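    # graph9: layer normalization with gamma, beta, and epsilon = 1e-5.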
var graph9 = Graph(in_types=List[Type](TensorType(DType.float32, "a", "b", "c"),TensorType(DType.float32, "c"), TensorType(DType.float32, "c")))
    var normed = ops.layer_norm(graph9[0], gamma=graph9[1], beta=graph9[2], epsilon=1e-5)
    graph9.output(normed)
graph9.verify()
var norm = session.load(graph9)
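    # Load the companion Python module (main.py), which supplies the PyTorch
    # weights, inputs, and reference outputs.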
Python.add_to_path(".")
var mypython = Python.import_module("main")
    var mul: SIMD[DType.float32, 1] = -1  # used only by the commented-out correctness checks below
var mojo:Float32 = 0.0
var pytorch:Float32 = 0.0
print("Starting 144 attention layers")
var start = now()
for j in range(144):
var start_pytorch = now()
var x: PythonObject = mypython.inputs_outputs()
var end_pytorch = now()
var execution_time_pytorch = (end_pytorch - start_pytorch)/1000000000
var input_weights = List[Tensor[DType.float32]]()
var output = numpy_to_tensor(x[-2])
var input = numpy_to_tensor(x[-1])
        for i in range(len(x) - 2):  # the 8 weight/bias arrays; skip the reference output and input
input_weights.append(numpy_to_tensor(x[i]))
var start_mojo = now()
var layer1 = Attention(input_weights[0],input_weights[1],input_weights[2],input_weights[3],input_weights[4],
input_weights[5],input_weights[6],input_weights[7],d_model,num_heads)
var xd = layer1.forward(input,input,input,multiplication, transpose, addition, transpose_12, transpose_01, transpose_21,
multiplication_3D, division, softmax)
var end_mojo = now()
var execution_time_mojo = (end_mojo - start_mojo)/1000000000
pytorch += execution_time_pytorch
mojo += execution_time_mojo
# var diff = xd.__sub__(output.__mul__(mul))
# var s:Float32 = sum(diff._to_buffer())
# if s >= 1:
# print("Attention Layers Have Different Outputs")
# print("xd",xd)
# print("output:",output)
# print("diff:",diff)
# print("s:",s)
# else:
# print("Attention Layers Have Same Outputs")
    var shape2 = TensorShape(d_model)
    # LayerNorm parameters: gamma is all ones, beta is left at its default
    # (zero) initialization.
    var gamma = Tensor[DType.float32](shape2)
    for i in range(gamma.num_elements()):
        gamma[i] = 1
    var beta = Tensor[DType.float32](shape2)
print("starting 144 Layer Norm")
for j in range(144):
var start_pytorch = now()
var y: PythonObject = mypython.norm_input_output()
var end_pytorch = now()
var execution_time_pytorch = (end_pytorch - start_pytorch)/1000000000
var input = numpy_to_tensor(y[0])
var output = numpy_to_tensor(y[1])
var start_mojo = now()
        var layer2 = LayerNorm(gamma, beta)
var xd = layer2.forward(input, norm)
var end_mojo = now()
var execution_time_mojo = (end_mojo - start_mojo)/1000000000
pytorch += execution_time_pytorch
mojo += execution_time_mojo
# var diff = xd.__sub__(output.__mul__(mul))
# var s:Float32 = sum(diff._to_buffer())
# if s >= 1.0:
# print("Norm Layers Have Different Outputs")
# print(xd)
# print(output)
# print(diff)
# print(s)
# else:
# print("Norm Layers Have Same Outputs")
var end = now()
print("Time taken in 12 attention and 12 Layer norms:")
print("mojo(sec):",mojo)
print("pytorch(sec):",pytorch)
print("Total time taken i.e., reading the weights, running both mojo and pytorch:", (end - start)/1000000000)