feat(KAN_GPT.ipynb): notebook added for colab training
AdityaNG committed May 2, 2024
1 parent af810f5 commit bab061d
Showing 4 changed files with 144 additions and 66 deletions.
83 changes: 83 additions & 0 deletions KAN_GPT.ipynb
@@ -0,0 +1,83 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "sDr8C_ddSnYt"
},
"source": [
"# KAN-GPT\n",
"\n",
"Making a Generative Pre-trained Transformer using Kolmogorov-Arnold Networks for language modeling\n",
"\n",
"- [minGPT](https://github.com/karpathy/minGPT)\n",
"- [pykan](https://github.com/KindXiaoming/pykan)\n",
"- [WebText](https://github.com/openai/gpt-2-output-dataset)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mZQ_DwaYUx04"
},
"outputs": [],
"source": [
"# Download Repo\n",
"!git clone https://github.com/AdityaNG/kan-gpt\n",
"%cd kan-gpt\n",
"!git pull\n",
"# Download Dataset\n",
"!./scripts/download_webtext.sh\n",
"# Install dependencies\n",
"!pip install -r requirements.txt\n",
"!pip install -e ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "I94Fcfs1TjEI",
"outputId": "e930a6f8-5f76-4e7a-b86c-c885fd7a8017"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tokenizer_config.json: 100% 26.0/26.0 [00:00<00:00, 101kB/s]\n",
"vocab.json: 100% 1.04M/1.04M [00:00<00:00, 10.6MB/s]\n",
"merges.txt: 100% 456k/456k [00:00<00:00, 17.4MB/s]\n",
"tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 24.5MB/s]\n",
"config.json: 100% 665/665 [00:00<00:00, 2.46MB/s]\n",
"torch.Size([1023]) torch.Size([1023])\n",
"number of parameters: 124.44M\n",
"running on device cpu\n",
"iter_dt 0.00ms; iter 0: train loss 11.00978\n"
]
}
],
"source": [
"!CUDA_VISIBLE_DEVICE=\"0\" python3 -m kan_gpt.train"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
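The notebook's opening cell gives the one-line summary: a GPT that uses Kolmogorov-Arnold Networks for language modeling, and the `--architecture {MLP,KAN}` flag added in `kan_gpt/train.py` below suggests KAN layers stand in where the usual MLP blocks would go. For readers new to KANs, the toy PyTorch sketch below shows the core idea, a learnable 1-D function on every input-output edge. The Gaussian-bump parameterisation and the `ToyKANLayer` name are purely illustrative assumptions for this sketch; pykan's real layers use B-splines, and nothing here comes from the kan-gpt codebase.

```py
import torch
import torch.nn as nn


class ToyKANLayer(nn.Module):
    """Toy Kolmogorov-Arnold layer: every input-output edge carries its own
    learnable 1-D function, modelled here as a weighted sum of Gaussian bumps.
    (pykan parameterises these functions with B-splines; this class is only
    an illustration and does not exist in kan-gpt or pykan.)"""

    def __init__(self, in_dim: int, out_dim: int, num_basis: int = 8):
        super().__init__()
        # Fixed, evenly spaced bump centres over [-1, 1].
        self.register_buffer("centres", torch.linspace(-1.0, 1.0, num_basis))
        self.width = 2.0 / (num_basis - 1)
        # One coefficient per (output, input, basis-function) triple.
        self.coef = nn.Parameter(0.1 * torch.randn(out_dim, in_dim, num_basis))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (..., in_dim) -> basis activations: (..., in_dim, num_basis)
        phi = torch.exp(-(((x.unsqueeze(-1) - self.centres) / self.width) ** 2))
        # Evaluate each edge's learned function and sum over inputs: (..., out_dim)
        return torch.einsum("...ib,oib->...o", phi, self.coef)


x = torch.randn(4, 16)
print(ToyKANLayer(16, 32)(x).shape)  # torch.Size([4, 32])
```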
9 changes: 8 additions & 1 deletion README.md
@@ -11,6 +11,13 @@ Awesome KAN-GPT created by AdityaNG
pip install kan_gpt
```

## Train

A dummy training run to make sure everything is working as expected:
```bash
CUDA_VISIBLE_DEVICES="0" python3 -m kan_gpt.train --architecture MLP --batch_size 1 --dummy_dataset
```
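
For a run on the real dataset (drop `--dummy_dataset`), the same entry point accepts the hyperparameter flags added to `kan_gpt/train.py`'s argparse in this commit. A sketch of a fuller invocation; the values shown are simply that parser's defaults, not tuned recommendations:

```bash
# Train the KAN variant; every flag below is defined in kan_gpt/train.py
CUDA_VISIBLE_DEVICES="0" python3 -m kan_gpt.train \
    --model_type gpt2 \
    --architecture KAN \
    --batch_size 64 \
    --max_iters 2000 \
    --learning_rate 5e-3 \
    --num_workers 0
```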

## Usage

```py
@@ -35,4 +42,4 @@ Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.

- [minGPT](https://github.com/karpathy/minGPT)
- [pykan](https://github.com/KindXiaoming/pykan)

- [WebText](https://github.com/openai/gpt-2-output-dataset)
2 changes: 1 addition & 1 deletion kan_gpt/mingpt/trainer.py
@@ -23,7 +23,7 @@ def get_default_config():
C.num_workers = 4
# optimizer parameters
C.max_iters = None
C.batch_size = 1 # 64
C.batch_size = 64
C.learning_rate = 3e-4
C.betas = (0.9, 0.95)
C.weight_decay = 0.1 # only applied on matmul weights
116 changes: 52 additions & 64 deletions kan_gpt/train.py
@@ -11,52 +11,31 @@


def eval_split(
trainer, split, max_batches, model, train_dataset, test_dataset
trainer, split, max_batches, batch_size, model, train_dataset, test_dataset
):
dataset = {"train": train_dataset, "test": test_dataset}[split]
n = train_dataset.length # naugy direct access shrug
n = len(train_dataset) # naugy direct access shrug
results = []
mistakes_printed_already = 0

loader = DataLoader(
dataset, batch_size=100, num_workers=0, drop_last=False
dataset, batch_size=batch_size, num_workers=0, drop_last=False
)
for b, (x, y) in enumerate(loader):
x = x.to(trainer.device)
y = y.to(trainer.device)
# isolate the input pattern alone
inp = x[:, :n]
sol = y[:, -n:]
# let the model sample the rest of the sequence
cat = model.generate(
inp, n, do_sample=False
) # using greedy argmax, not sampling
sol_candidate = cat[:, n:] # isolate the filled in sequence
# compare the predicted sequence to the true sequence
correct = (
(sol == sol_candidate).all(1).cpu()
) # Software 1.0 vs. Software 2.0 fight RIGHT on this line haha
for i in range(x.size(0)):
results.append(int(correct[i]))
if (
not correct[i] and mistakes_printed_already < 3
): # only print up to 5 mistakes to get a sense
mistakes_printed_already += 1
print(
"GPT claims that %s sorted is %s but gt is %s"
% (
inp[i].tolist(),
sol_candidate[i].tolist(),
sol[i].tolist(),
)
)

logits, loss = model(x, y)

results.append(loss)

if max_batches is not None and b + 1 >= max_batches:
break
rt = torch.tensor(results, dtype=torch.float)
print(
"%s final score: %d/%d = %.2f%% correct"
% (split, rt.sum(), len(results), 100 * rt.mean())
"%s loss: %.2f%%"
% (split, rt.mean())
)
return rt.sum()
return rt.mean()


def main(args):
@@ -85,10 +64,11 @@ def main(args):
# create a Trainer object
train_config = Trainer.get_default_config()
train_config.learning_rate = (
5e-4 # the model we're using is so small that we can go a bit faster
float(args.learning_rate) # the model we're using is so small that we can go a bit faster
)
train_config.max_iters = 2000
train_config.num_workers = 0
train_config.max_iters = int(args.max_iters)
train_config.num_workers = int(args.num_workers)
train_config.batch_size = int(args.batch_size)
trainer = Trainer(train_config, model, train_dataset)

def batch_end_callback(trainer):
@@ -98,45 +78,53 @@ def batch_end_callback(trainer):
f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}"
)

print("=" * 20)
print("EVAL")
print("=" * 20)

model.eval()
with torch.no_grad():
train_score = eval_split(
trainer,
"train",
max_batches=5,
batch_size=int(args.batch_size),
model=model,
train_dataset=train_dataset,
test_dataset=test_dataset,
)
test_score = eval_split(
trainer,
"test",
max_batches=5,
batch_size=int(args.batch_size),
model=model,
train_dataset=train_dataset,
test_dataset=test_dataset,
)

model.train()
print("train_score: ", train_score)
print("test_score: ", test_score)

print("=" * 20)

trainer.set_callback("on_batch_end", batch_end_callback)

trainer.run()

print("=" * 20)
print("EVAL")
print("=" * 20)

model.eval()
with torch.no_grad():
train_score = eval_split(
trainer,
"train",
max_batches=50,
model=model,
train_dataset=train_dataset,
test_dataset=test_dataset,
)
test_score = eval_split(
trainer,
"test",
max_batches=50,
model=model,
train_dataset=train_dataset,
test_dataset=test_dataset,
)

print("train_score: ", train_score)
print("test_score: ", test_score)

print("=" * 20)


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser("KAN-GPT Trainer")
parser.add_argument("--model_type", default="gpt2")
parser.add_argument("--dummy_dataset", action="store_true")
parser.add_argument("--learning_rate", default=5e-3)
parser.add_argument("--max_iters", default=2000)
parser.add_argument("--num_workers", default=0)
parser.add_argument("--batch_size", default=64)

parser.add_argument(
"--architecture", choices=["MLP", "KAN"], default="KAN"
)
