From bab061d1e27e67eb8fe4d5e8e8f4e8a8698b443e Mon Sep 17 00:00:00 2001
From: Aditya NG
Date: Thu, 2 May 2024 23:42:12 +0530
Subject: [PATCH] feat(KAN_GPT.ipynb): notebook added for colab training

---
 KAN_GPT.ipynb             |  83 +++++++++++++++++++++++++++
 README.md                 |   9 ++-
 kan_gpt/mingpt/trainer.py |   2 +-
 kan_gpt/train.py          | 116 +++++++++++++++++---------------------
 4 files changed, 144 insertions(+), 66 deletions(-)
 create mode 100644 KAN_GPT.ipynb

diff --git a/KAN_GPT.ipynb b/KAN_GPT.ipynb
new file mode 100644
index 0000000..20a740d
--- /dev/null
+++ b/KAN_GPT.ipynb
@@ -0,0 +1,83 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "sDr8C_ddSnYt"
+      },
+      "source": [
+        "# KAN-GPT\n",
+        "\n",
+        "Making a Generative Pre-trained Transformer using Kolmogorov-Arnold Networks for language modeling\n",
+        "\n",
+        "- [minGPT](https://github.com/karpathy/minGPT)\n",
+        "- [pykan](https://github.com/KindXiaoming/pykan)\n",
+        "- [WebText](https://github.com/openai/gpt-2-output-dataset)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mZQ_DwaYUx04"
+      },
+      "outputs": [],
+      "source": [
+        "# Download Repo\n",
+        "!git clone https://github.com/AdityaNG/kan-gpt\n",
+        "%cd kan-gpt\n",
+        "!git pull\n",
+        "# Download Dataset\n",
+        "!./scripts/download_webtext.sh\n",
+        "# Install dependencies\n",
+        "!pip install -r requirements.txt\n",
+        "!pip install -e ."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "I94Fcfs1TjEI",
+        "outputId": "e930a6f8-5f76-4e7a-b86c-c885fd7a8017"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tokenizer_config.json: 100% 26.0/26.0 [00:00<00:00, 101kB/s]\n",
+            "vocab.json: 100% 1.04M/1.04M [00:00<00:00, 10.6MB/s]\n",
+            "merges.txt: 100% 456k/456k [00:00<00:00, 17.4MB/s]\n",
+            "tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 24.5MB/s]\n",
+            "config.json: 100% 665/665 [00:00<00:00, 2.46MB/s]\n",
+            "torch.Size([1023]) torch.Size([1023])\n",
+            "number of parameters: 124.44M\n",
+            "running on device cpu\n",
+            "iter_dt 0.00ms; iter 0: train loss 11.00978\n"
+          ]
+        }
+      ],
+      "source": [
+        "!CUDA_VISIBLE_DEVICES=\"0\" python3 -m kan_gpt.train"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/README.md b/README.md
index bd234be..0576649 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,13 @@ Awesome KAN-GPT created by AdityaNG
 pip install kan_gpt
 ```
 
+## Train
+
+A quick sanity-check run on a dummy dataset to make sure everything is working as expected:
+```bash
+CUDA_VISIBLE_DEVICES="0" python3 -m kan_gpt.train --architecture MLP --batch_size 1 --dummy_dataset
+```
+
 ## Usage
 
 ```py
@@ -35,4 +42,4 @@ Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
 
 - [minGPT](https://github.com/karpathy/minGPT)
 - [pykan](https://github.com/KindXiaoming/pykan)
-
+- [WebText](https://github.com/openai/gpt-2-output-dataset)
diff --git a/kan_gpt/mingpt/trainer.py b/kan_gpt/mingpt/trainer.py
index 603c9c5..53a3fda 100644
--- a/kan_gpt/mingpt/trainer.py
+++ b/kan_gpt/mingpt/trainer.py
@@ -23,7 +23,7 @@ def get_default_config():
         C.num_workers = 4
         # optimizer parameters
         C.max_iters = None
-        C.batch_size = 1  # 64
+        C.batch_size = 64
         C.learning_rate = 3e-4
         C.betas = (0.9, 0.95)
         C.weight_decay = 0.1  # only applied on matmul weights
diff --git a/kan_gpt/train.py b/kan_gpt/train.py
index e249cf6..3f24496 100644
--- a/kan_gpt/train.py
+++ b/kan_gpt/train.py
@@ -11,52 +11,31 @@
 
 
 def eval_split(
-    trainer, split, max_batches, model, train_dataset, test_dataset
+    trainer, split, max_batches, batch_size, model, train_dataset, test_dataset
 ):
     dataset = {"train": train_dataset, "test": test_dataset}[split]
-    n = train_dataset.length  # naugy direct access shrug
+    n = len(train_dataset)  # naugy direct access shrug
     results = []
-    mistakes_printed_already = 0
+
     loader = DataLoader(
-        dataset, batch_size=100, num_workers=0, drop_last=False
+        dataset, batch_size=batch_size, num_workers=0, drop_last=False
     )
     for b, (x, y) in enumerate(loader):
         x = x.to(trainer.device)
         y = y.to(trainer.device)
-        # isolate the input pattern alone
-        inp = x[:, :n]
-        sol = y[:, -n:]
-        # let the model sample the rest of the sequence
-        cat = model.generate(
-            inp, n, do_sample=False
-        )  # using greedy argmax, not sampling
-        sol_candidate = cat[:, n:]  # isolate the filled in sequence
-        # compare the predicted sequence to the true sequence
-        correct = (
-            (sol == sol_candidate).all(1).cpu()
-        )  # Software 1.0 vs. Software 2.0 fight RIGHT on this line haha
-        for i in range(x.size(0)):
-            results.append(int(correct[i]))
-            if (
-                not correct[i] and mistakes_printed_already < 3
-            ):  # only print up to 5 mistakes to get a sense
-                mistakes_printed_already += 1
-                print(
-                    "GPT claims that %s sorted is %s but gt is %s"
-                    % (
-                        inp[i].tolist(),
-                        sol_candidate[i].tolist(),
-                        sol[i].tolist(),
-                    )
-                )
+
+        logits, loss = model(x, y)
+
+        results.append(loss)
+
         if max_batches is not None and b + 1 >= max_batches:
             break
     rt = torch.tensor(results, dtype=torch.float)
     print(
-        "%s final score: %d/%d = %.2f%% correct"
-        % (split, rt.sum(), len(results), 100 * rt.mean())
+        "%s loss: %.2f"
+        % (split, rt.mean())
     )
-    return rt.sum()
+    return rt.mean()
 
 
 def main(args):
@@ -85,10 +64,11 @@ def main(args):
     # create a Trainer object
     train_config = Trainer.get_default_config()
     train_config.learning_rate = (
-        5e-4  # the model we're using is so small that we can go a bit faster
+        float(args.learning_rate)  # the model we're using is so small that we can go a bit faster
     )
-    train_config.max_iters = 2000
-    train_config.num_workers = 0
+    train_config.max_iters = int(args.max_iters)
+    train_config.num_workers = int(args.num_workers)
+    train_config.batch_size = int(args.batch_size)
     trainer = Trainer(train_config, model, train_dataset)
 
     def batch_end_callback(trainer):
@@ -98,38 +78,41 @@ def batch_end_callback(trainer):
                 f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}"
             )
+            print("=" * 20)
+            print("EVAL")
+            print("=" * 20)
+
+            model.eval()
+            with torch.no_grad():
+                train_score = eval_split(
+                    trainer,
+                    "train",
+                    max_batches=5,
+                    batch_size=int(args.batch_size),
+                    model=model,
+                    train_dataset=train_dataset,
+                    test_dataset=test_dataset,
+                )
+                test_score = eval_split(
+                    trainer,
"test", + max_batches=5, + batch_size=int(args.batch_size), + model=model, + train_dataset=train_dataset, + test_dataset=test_dataset, + ) + + model.train() + print("train_score: ", train_score) + print("test_score: ", test_score) + + print("=" * 20) + trainer.set_callback("on_batch_end", batch_end_callback) trainer.run() - print("=" * 20) - print("EVAL") - print("=" * 20) - - model.eval() - with torch.no_grad(): - train_score = eval_split( - trainer, - "train", - max_batches=50, - model=model, - train_dataset=train_dataset, - test_dataset=test_dataset, - ) - test_score = eval_split( - trainer, - "test", - max_batches=50, - model=model, - train_dataset=train_dataset, - test_dataset=test_dataset, - ) - - print("train_score: ", train_score) - print("test_score: ", test_score) - - print("=" * 20) - if __name__ == "__main__": import argparse @@ -137,6 +120,11 @@ def batch_end_callback(trainer): parser = argparse.ArgumentParser("KAN-GPT Trainer") parser.add_argument("--model_type", default="gpt2") parser.add_argument("--dummy_dataset", action="store_true") + parser.add_argument("--learning_rate", default=5e-3) + parser.add_argument("--max_iters", default=2000) + parser.add_argument("--num_workers", default=0) + parser.add_argument("--batch_size", default=64) + parser.add_argument( "--architecture", choices=["MLP", "KAN"], default="KAN" )