# runtime_stats.py
# Forked from sanketpurandare/scyther.
import copy
from typing import Callable, Tuple
import torch
from runtime_estimator import RuntimeEstimator
from test_model import GPT, GPTConfig, loss_fn
from torch import nn, optim
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.utils.benchmark import timer
def collect_runtime_stats(
    model: nn.Module,
    optimizer: optim.Optimizer,
    inp_and_target: Tuple[torch.Tensor, torch.Tensor],
    loss_fn: Callable = lambda x, y: sum(x, y),
):
    """Run one training iteration under ``RuntimeEstimator`` and return its stats.

    One warm-up iteration (forward, backward, optimizer step, zero-grad) is
    executed first, outside the estimator, so that the optimizer state is
    initialized. A second iteration is then executed inside the estimator
    context using the ``"operator-level-cost-model"`` estimate mode. The
    wall-clock duration of that estimation pass is printed in milliseconds,
    followed by the estimator's module-wise stats (``depth=4``).

    Args:
        model: Module that is called as ``model(inp)``.
        optimizer: Optimizer stepped and zeroed after every backward pass.
        inp_and_target: ``(inp, target)`` pair; ``inp`` is fed to ``model``
            and ``target`` is passed as the second argument of ``loss_fn``.
        loss_fn: Called as ``loss_fn(model(inp), target)``; the result must
            support ``.backward()``. The default applies the builtin ``sum``
            with ``target`` as its start value.

    Returns:
        A 5-tuple of deep copies of the estimator attributes
        ``mod_runtimes``, ``mod_fw_pre_order``, ``mod_bw_pre_order``,
        ``mod_fw_post_order`` and ``mod_bw_post_order`` (in that order).
        NOTE(review): their exact structure is defined by
        ``RuntimeEstimator`` and is not visible here - confirm there.
    """
    # We just need one actual iteration for estimation
    warm_up_iters, actual_iters = 1, 1
    inp, target = inp_and_target

    def inner(num_iters: int) -> None:
        """Run ``num_iters`` full training steps on the captured batch."""
        for _ in range(num_iters):
            loss = loss_fn(model(inp), target)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    # Initializing optimizer states and warm-up
    inner(warm_up_iters)

    estimate_mode = RuntimeEstimator()
    with estimate_mode(estimate_mode_type="operator-level-cost-model"):
        start = timer()
        inner(actual_iters)
        end = timer()

    # `timer()` reports seconds; convert so the printed number really is in
    # the milliseconds that the message claims.
    elapsed_ms = (end - start) * 1e3
    # We use only one iteration for estimation
    print(f"Estimation process total_time: {elapsed_ms:.3f} ms")
    estimate_mode.display_modulewise_stats(depth=4)

    # Deep-copy so callers own the stats independently of the estimator.
    return (
        copy.deepcopy(estimate_mode.mod_runtimes),
        copy.deepcopy(estimate_mode.mod_fw_pre_order),
        copy.deepcopy(estimate_mode.mod_bw_pre_order),
        copy.deepcopy(estimate_mode.mod_fw_post_order),
        copy.deepcopy(estimate_mode.mod_bw_post_order),
    )
if __name__ == "__main__":
    # The model, optimizer and input batch are all created inside
    # FakeTensorMode before runtime stats are collected.
    with FakeTensorMode():
        device = torch.device(torch.cuda.current_device())
        vocab_size = 8192
        gpt_config = GPTConfig(
            block_size=512,
            n_layer=6,
            dropout=0.01,
            vocab_size=vocab_size,
            checkpoint_activations=False,
        )

        # Construct the model with `device` as the active default device.
        with torch.device(device):
            model = GPT(gpt_config)
        optimizer = optim.Adam(model.parameters(), lr=1e-2, foreach=True)

        # Seed before drawing the random token ids for the batch.
        torch.manual_seed(1)
        batch_shape = (64, 512)  # (batch size, sequence length)
        tokens = torch.randint(0, vocab_size, batch_shape, device=device)
        labels = torch.randint(0, vocab_size, batch_shape, device=device)

        collect_runtime_stats(model, optimizer, (tokens, labels), loss_fn)