diff --git a/notebook/experiment/infctx-math-and-name/config-mem-template.yaml b/notebook/experiment/infctx-math-and-name/config-mem-template.yaml
new file mode 100644
index 00000000..36a88aba
--- /dev/null
+++ b/notebook/experiment/infctx-math-and-name/config-mem-template.yaml
@@ -0,0 +1,197 @@
+# lightning.pytorch==2.0.2
+seed_everything: true
+trainer:
+  # Configure the number of GPUs available on your machine
+  accelerator: gpu
+  devices: auto
+  num_nodes: 1
+
+  # Configure the deepspeed strategy
+  strategy: deepspeed_stage_1
+
+  # Floating point precision for the model; because RWKV is built FOR bf16
+  # you should pretty much never change this setting
+  precision: bf16
+
+  # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section
+  # ---
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      name: 'Memory Instruct (bs=256, train-ctx=512)'
+      project: 'RWKV-X-Experiments'
+      tags: ['RWKV', 'RWKV-X']
+
+  # Checkpoint settings for the training process
+  callbacks:
+    class_path: lightning.pytorch.callbacks.ModelCheckpoint
+    init_args:
+      # Configure this to the path you want to save your checkpoints to
+      # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
+      #
+      # to convert a checkpoint to a model, you can use the
+      # `python3 export_checkpoint.py` script,
+      # which will create a `rwkv_model.pth` in the checkpoint directory.
+      #
+      # Do not use the `zero_to_fp32.py` script as that will have export format issues
+      dirpath: ../checkpoint/V5-Base-1B5-mem-instruct/
+      filename: null
+
+      # Save the top/last K checkpoints
+      save_top_k: 2
+      # Choose by the most recent checkpoints (time based)
+      monitor: 'step'
+      mode: max
+
+      # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
+      # useful to simplify checkpoint resume scripts, at the price of some disk performance
+      save_last: true
+
+      # DO NOT set this to true, as the exported model weights will have format issues
+      # export as a checkpoint, and use the `export_checkpoint.py` script to convert it to a model instead
+      save_weights_only: false
+
+      # How frequently you want to save a checkpoint, in training steps.
+      # A save happens every X data samples, where X = every_n_train_steps * accumulate_grad_batches
+      #
+      # In general you will want to avoid a low number (especially if accumulate_grad_batches <= 100),
+      # as the checkpoint process pauses all GPU training for some time, slowing down the overall run.
+      # However, you also do not want too high a number, or you will lose too much progress if the training crashes.
+      every_n_train_steps: 25
+      every_n_epochs: null
+      save_on_train_epoch_end: true
+      train_time_interval: null
+
+      # Other settings, you can probably leave alone
+      verbose: false
+      auto_insert_metric_name: true
+
+  ########################################
+  ## Training run parameter settings
+  ########################################
+
+  # Generally what you want to configure is the maximum number of epochs
+  # Leave it as -1, and it will keep going forever till interrupted
+  # Or set it as a number, and it will stop after that number of epochs
+  max_epochs: 1
+  min_epochs: null
+
+  # Number of data samples to train on in each step; a data sample is logged
+  # as a "substep" in the wandb logs, and a "step" is tracked as "trainer/global_step"
+  #
+  # This decides the number of data samples to learn from together, before backpropagating
+  # any weight changes at the end of the batch.
+  #
+  # Recommended to be a big enough number (like 128/256) to prevent the training
+  # loss from fluctuating in the process. But not so big a number that the increased
+  # GPU vRAM / offloaded RAM usage will cause the training to crash.
+  #
+  # You are also recommended to configure this to a large enough number to fully utilize
+  # your GPU processing time %, and avoid idle time for the GPU between batches
+  target_batch_size: 256
+
+########################################
+## Training model settings
+########################################
+model:
+  # Model to start the finetune/training process from
+  load_model: ../model/EWR-1B5-enwiki-instruct.pth
+
+  # Context length to use for the training process
+  # the larger the number (and batch size) the larger the vram usage
+  #
+  # Note that if the datasample context length is larger than the ctx_len,
+  # its training process will be split into ctx_len sized chunks.
+  #
+  # This allows the training of extremely large context lengths (eg. 100k),
+  # without eating up too much vram, by keeping the training context length
+  # to a reasonable number suitable for the current GPU setup
+  ctx_len: 512
+
+  # Learning rate of the training process
+  # ---
+  # Initial learning rate of the process
+  lr_init: 8e-4
+  # Final learning rate after the learning rate period
+  # learning rate will stay at the final value from then onwards
+  #
+  # NOTE: lr_final / lr_period does not work with warmup_steps
+  # and will be ignored (or replaced) with the warmup_steps logic instead
+  lr_final: 5e-4
+
+  # Number of epochs to reduce the learning rate from lr_init to lr_final
+  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
+  # 0 means lr_final will apply immediately
+  # -1 means we take the current max_step / max_epoch as the period
+  lr_period: 1
+  # lr_period type if it is set, defaults to epoch
+  lr_period_type: epoch
+
+  # We limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
+  # (by skipping some synchronization code). Additionally, BPTT learning should not be triggered
+  # anyway, as the data sample should be within the ctx size 99% of the time
+  bptt_learning: true
+  bptt_learning_range: 1
+
+data:
+  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
+  #
+  # Use this if you have built your own dataset and saved it with `save_to_disk()`
+  # with source left as null. Otherwise, configure this to a directory in which the
+  # dataset will be built and tokenized by the huggingface dataset process.
+  data_path: ../datapath/picocreator/experiment/rwkv-x-exp/memory/
+
+  # Otherwise, provide the source path, which is used as the huggingface dataset path;
+  # this will be used to populate the dataset_path
+  #
+  # Use either of the following
+  # - hugging face dataset
+  # - Directory path to a directory containing dataset files
+  # - Path to a single dataset file
+  # - hugging face dataset mode (ie: text, csv, etc - use data_dir to configure the path then)
+  # - null
+  #
+  # If source is disabled, all other params, except data_path, are ignored
+  source: json
+  # source: text
+  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
+
+  # Use data_dir, if you are using source=text/json/etc
+  # this should be relative to the trainer script path
+  source_data_dir: ../notebook/experiment/infctx-math-and-name/dataset/
+
+  # After loading the dataset, split out test data used for validation
+  # This process is skipped if the dataset includes a test split
+  # This process is skipped if set to zero
+  test_split: 0.001
+  test_split_shuffle: true
+
+  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
+  # If using a custom tokenizer, provide the tokenizer file path
+  # ---
+  tokenizer: neox
+
+  # Minimum / Maximum token size of the dataset to use
+  # useful for filtering out small noisy data samples from large datasets
+  # (eg. removal of small articles of fewer than 512 tokens from wikipedia)
+  #
+  # This is ignored, if set to -1
+  min_token_size: -1
+  max_token_size: 512
+
+  # Multi-column merging process; the default setting is used to support and merge
+  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
+  #
+  # A minimum of 2 columns, with non-empty data, is required for the merge to occur
+  # If no match is found, this will fall back to the default prompt/completion or text column,
+  # or throw an error if the default fallback is not found
+  # ---
+  multi_column_keys: ['input_prefix', 'input', 'output_prefix', 'output', 'closing']
+  multi_column_prefix: ['', '', '', '', '']
+  multi_column_train_mask: [true, false, true, true, true]
+  multi_column_separator: ''
+
+  # If processing prompt/completion jsonl pairs, the prompt is masked by default
+  # use this flag to disable this default behaviour
+  # ---
+  # disable_prompt_mask: false
diff --git a/notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py b/notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py
new file mode 100644
index 00000000..7c167503
--- /dev/null
+++ b/notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py
@@ -0,0 +1,76 @@
+import json
+import numpy as np
+import argparse
+
+prompt_template = f"""
+You are an AI assistant who will be given some tasks to complete.
+First, you will be given a name to remember. Then, you will have to sum up a series of numbers.
+You will then be asked to answer some questions about the document.
+
+Example 1:
+Name: John
+1
+-2
+3
+-4
+
+### Question:
+What is the total sum?
+
+### Answer:
+-2
+
+### Question:
+What is the name given at the start of the document?
+
+### Answer:
+John
+
+Now you will be tasked to remember the name and sum up the following series of numbers.
+
+"""
+
+task_templates = [
+    "\n### Question:\nWhat is the name given at the start of the document?\n\n### Answer:\n",
+    "\n### Question:\nWhat is the sum of the numbers given?\n\n### Answer:\n"
+]
+
+completion_templates = [
+    "\n{name}\n",
+    "\n{sum_of_numbers}\n",
+]
+
+def load_names(file_path):
+    with open(file_path) as word_file:
+        valid_names = list(word_file.read().split())
+    return valid_names
+
+names = load_names("infctx-math-and-name/names.txt")
+
+def get_random_prompt_completion_pair(max_numbers):
+    document = ""
+    numbers = np.random.randint(-200, 200, size=(max_numbers))
+    total_sum = np.sum(numbers)
+    for number in numbers:
+        document += str(number) + "\n"
+
+    template_index = np.random.randint(0, len(task_templates))
+    task = task_templates[template_index]
+    name = names[np.random.randint(0, len(names))]
+
+    prompt = prompt_template + f"Name: {name}\n" + document + task
+    completion = completion_templates[template_index].format(sum_of_numbers=total_sum, name=name)
+    return {'prompt': prompt, 'completion': completion}
+
+def generate_jsonl(output_file_path, max_numbers, num_samples):
+    with open(output_file_path, 'w') as output_file:
+        for _ in range(num_samples):
+            pair = get_random_prompt_completion_pair(max_numbers)
+            # Write each pair as a proper JSON object (str(pair) would emit a
+            # single-quoted Python repr, which is not valid JSON)
+            output_file.write(json.dumps(pair) + "\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out-file", type=str, default="questions.jsonl")
+    parser.add_argument("--max-numbers", type=int, default=100)
+    parser.add_argument("--num-samples", type=int, default=10)
+    args = parser.parse_args()
+    generate_jsonl(args.out_file, args.max_numbers, args.num_samples)
\ No newline at end of file
diff --git a/notebook/experiment/infctx-math-and-name/run.ipynb b/notebook/experiment/infctx-math-and-name/run.ipynb
index a6100842..a42ede5c 100644
--- a/notebook/experiment/infctx-math-and-name/run.ipynb
+++ b/notebook/experiment/infctx-math-and-name/run.ipynb
@@ -1,5 +1,53 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "## Generating math and name dataset ##\n",
+      "## Done ##\n",
+      "total 15M\n",
+      "drwx------  2 christopherchou u-christopherchou  3 Sep 16 17:12 .\n",
+      "drwx------ 13 christopherchou u-christopherchou 14 Sep 16 17:01 ..\n",
+      "-rw------- 1 christopherchou u-christopherchou 55M Sep 16 17:12 questions_numbers_1024.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%script bash\n",
+    "\n",
+    "########################################\n",
+    "# Generate the required jsonl dataset\n",
+    "########################################\n",
+    "\n",
+    "# Go to config dir\n",
+    "cd \"../\"\n",
+    "\n",
+    "# Reset the dataset dir\n",
+    "mkdir -p ../dataset\n",
+    "rm -rf ../dataset/*.jsonl\n",
+    "\n",
+    "# Generate the various datasets\n",
+    "echo \"## Generating math and name dataset ##\"\n",
+    "\n",
+    "#\n",
+    "# We reduce the training set for lower word count - and shift the focus upwards\n",
+    "#\n",
+    "# do\n",
+    "python3 infctx-math-and-name/generate_math_and_name_dataset.py --out-file ../dataset/questions_numbers_1024.jsonl --max-numbers 1024 --num-samples 10000\n",
+    "# done\n",
+    "\n",
+    "wait\n",
+    "echo \"## Done ##\"\n",
+    "\n",
+    "ls -alh ../dataset/"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -9,8 +57,22 @@
  }
 ],
 "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
  "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" }, "orig_nbformat": 4 }, diff --git a/notebook/experiment/infctx-math-and-name/stage2.ipynb b/notebook/experiment/infctx-math-and-name/stage2.ipynb new file mode 100644 index 00000000..bff1251b --- /dev/null +++ b/notebook/experiment/infctx-math-and-name/stage2.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RWKV v5\n", + "\n", + "Simple memory training for a small model\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!ls ../../../../../\n", + "!mkdir -p ../../../../../models/\n", + "!mkdir -p ../../../../../datapath/\n", + "!mkdir -p ../../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip3 install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEEPSPEED_STRAT: deepspeed_stage_1\n", + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "DIR_NAME: infctx-math-and-name\n", + "NOTEBOOK_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer/notebook/experiment/infctx-math-and-name\n", + "INFERENCE_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5\n", + "TRAINER_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=6\n", + "EMBED_DIM=2048\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "WANDB_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "CONFIG_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"./\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(CONFIG_DIR, \"../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "# Get the notebook dir name\n", + "DIR_NAME=os.path.basename(NOTEBOOK_DIR)\n", + "\n", + "# Log names and dir\n", + "print(\"DIR_NAME:\", DIR_NAME)\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-09-16 17:59:33-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/v5r3-L6-D2048-E0_1-enwiki-instruct.pth\n", + "Resolving huggingface.co (huggingface.co)... 108.138.246.71, 108.138.246.85, 108.138.246.67, ...\n", + "Connecting to huggingface.co (huggingface.co)|108.138.246.71|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/0a83bdbbf6d686bfa77529fc9bbde3a91fc8d182e1dc33ce8d18f2a0abbe2576?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-instruct.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-instruct.pth%22%3B&Expires=1695171573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NTE3MTU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzBhODNiZGJiZjZkNjg2YmZhNzc1MjlmYzliYmRlM2E5MWZjOGQxODJlMWRjMzNjZThkMThmMmEwYWJiZTI1NzY%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=ydnz2x0eG1WBO%7ExW3sMHufYylEixqjKRuxzCPaRc0AdGMtoIsv1lnJ7DcU0TtY4RQZpUxvmtEdE43zQtOf7Bf80qf8U0mLnGOaEZLxuCrKXodOa8c8N58xr5c0Kl4XofpifWg%7EUeO2xAKAY%7EYgSyzqJDVFEzcifyu69bLA1fgZJwM7V5w4YmkJ2mmLp7wxicVMOh9y8f7evkoG9wNd2NjuTje7VhbptyFYio4KoMLfUwwXO1C5nXTYawFEIXN%7EZNpNgGeDZkPGt0RwdL4OVav8m6if%7E89QbaEnWlPjWulswil%7EkjC5893H9l7FvRJYVQAmuXOeeFcJoG64xDSjluGQ__&Key-Pair-Id=KVTP0A1DKRTAX [following]\n", + "--2023-09-16 17:59:33-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/0a83bdbbf6d686bfa77529fc9bbde3a91fc8d182e1dc33ce8d18f2a0abbe2576?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-instruct.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-instruct.pth%22%3B&Expires=1695171573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NTE3MTU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzBhODNiZGJiZjZkNjg2YmZhNzc1MjlmYzliYmRlM2E5MWZjOGQxODJlMWRjMzNjZThkMThmMmEwYWJiZTI1NzY%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=ydnz2x0eG1WBO%7ExW3sMHufYylEixqjKRuxzCPaRc0AdGMtoIsv1lnJ7DcU0TtY4RQZpUxvmtEdE43zQtOf7Bf80qf8U0mLnGOaEZLxuCrKXodOa8c8N58xr5c0Kl4XofpifWg%7EUeO2xAKAY%7EYgSyzqJDVFEzcifyu69bLA1fgZJwM7V5w4YmkJ2mmLp7wxicVMOh9y8f7evkoG9wNd2NjuTje7VhbptyFYio4KoMLfUwwXO1C5nXTYawFEIXN%7EZNpNgGeDZkPGt0RwdL4OVav8m6if%7E89QbaEnWlPjWulswil%7EkjC5893H9l7FvRJYVQAmuXOeeFcJoG64xDSjluGQ__&Key-Pair-Id=KVTP0A1DKRTAX\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 18.238.192.105, 18.238.192.34, 18.238.192.50, ...\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.238.192.105|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 1066537777 (1017M) [binary/octet-stream]\n", + "Saving to: ‘v5r3-L6-D2048-E0_1-enwiki-instruct.pth’\n", + "\n", + "v5r3-L6-D2048-E0_1- 100%[===================>] 1017M 112MB/s in 9.1s \n", + "\n", + "2023-09-16 17:59:42 (112 MB/s) - ‘v5r3-L6-D2048-E0_1-enwiki-instruct.pth’ saved [1066537777/1066537777]\n", + "\n", + "total 703M\n", + "drwx------ 2 christopherchou u-christopherchou 3 Sep 16 17:59 .\n", + "drwx------ 17 christopherchou u-christopherchou 21 Sep 16 17:59 ..\n", + "-rw------- 1 christopherchou u-christopherchou 1018M Sep 13 13:28 v5r3-L6-D2048-E0_1-enwiki-instruct.pth\n" + ] + } + ], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../models/\" && \\\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/v5r3-L6-D2048-E0_1-enwiki-instruct.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../models/\" && \\\n", + " ls -alh ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 2 : Context size (1024) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Generating math and name dataset ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Done ##\n", + "total 25M\n", + "drwx------ 2 christopherchou u-christopherchou 3 Sep 16 17:59 .\n", + "drwx------ 13 christopherchou u-christopherchou 14 Sep 16 17:59 ..\n", + "-rw------- 1 christopherchou u-christopherchou 55M Sep 16 17:59 questions_numbers.jsonl\n" + ] + } + ], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Go to config dir\n", + "cd \"../\"\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating math and name dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for lower word count - and shift the focus upwards\n", + "#\n", + "# do\n", + "python3 infctx-math-and-name/generate_math_and_name_dataset.py --out-file ../dataset/questions_numbers.jsonl --max-numbers 1024 --num-samples 10000\n", + "# done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu117'\n", + "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m─────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5/\u001b[0m\u001b[1;33mlightning_trainer.py\u001b[0m:\u001b[94m278\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m275 \u001b[0m\u001b[2m│ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m276 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m277 \u001b[0m\u001b[94mif\u001b[0m 
\u001b[91m__name__\u001b[0m == \u001b[33m\"\u001b[0m\u001b[33m__main__\u001b[0m\u001b[33m\"\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m278 \u001b[2m│ \u001b[0mcli_main() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m279 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5/\u001b[0m\u001b[1;33mlightning_trainer.py\u001b[0m:\u001b[94m253\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92mcli_main\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m250 \u001b[0m\u001b[94mfrom\u001b[0m \u001b[4;96msrc\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrainer\u001b[0m \u001b[94mimport\u001b[0m RWKVLightningTrainer \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m251 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m252 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mcli_main\u001b[0m(): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m253 \u001b[2m│ \u001b[0mLightningCLI( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m254 \u001b[0m\u001b[2m│ │ \u001b[0mRWKV, RWKVDataModule, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m255 \u001b[0m\u001b[2m│ │ \u001b[0msave_config_kwargs={\u001b[33m\"\u001b[0m\u001b[33moverwrite\u001b[0m\u001b[33m\"\u001b[0m: \u001b[94mTrue\u001b[0m}, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m256 \u001b[0m\u001b[2m│ │ \u001b[0mtrainer_class=RWKVLightningTrainer, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m348\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m345 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.subclass_mode_data = (datamodule_class \u001b[95mis\u001b[0m \u001b[94mNone\u001b[0m) \u001b[95mor\u001b[0m subcla \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m346 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m347 \u001b[0m\u001b[2m│ │ \u001b[0mmain_kwargs, subparser_kwargs = \u001b[96mself\u001b[0m._setup_parser_kwargs(\u001b[96mself\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m348 \u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.setup_parser(run, main_kwargs, subparser_kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m349 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.parse_arguments(\u001b[96mself\u001b[0m.parser, args) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m350 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m351 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.subcommand = \u001b[96mself\u001b[0m.config[\u001b[33m\"\u001b[0m\u001b[33msubcommand\u001b[0m\u001b[33m\"\u001b[0m] \u001b[94mif\u001b[0m run \u001b[94melse\u001b[0m \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m380\u001b[0m in \u001b[92msetup_parser\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m377 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m, add_subcommands: \u001b[96mbool\u001b[0m, main_kwargs: Dict[\u001b[96mstr\u001b[0m, Any], subp \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m378 \u001b[0m\u001b[2m│ \u001b[0m) -> \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m379 \u001b[0m\u001b[2;90m│ │ \u001b[0m\u001b[33m\"\"\"Initialize and setup the parser, subcommands, and arguments\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m380 \u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.parser = \u001b[96mself\u001b[0m.init_parser(**main_kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m381 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m add_subcommands: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m382 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._subcommand_method_arguments: Dict[\u001b[96mstr\u001b[0m, List[\u001b[96mstr\u001b[0m]] = \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m383 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._add_subcommands(\u001b[96mself\u001b[0m.parser, **subparser_kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m370\u001b[0m in \u001b[92minit_parser\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m367 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92minit_parser\u001b[0m(\u001b[96mself\u001b[0m, **kwargs: Any) -> LightningArgumentParser: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m368 \u001b[0m\u001b[2;90m│ │ \u001b[0m\u001b[33m\"\"\"Method that instantiates the argument parser.\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m369 \u001b[0m\u001b[2m│ │ \u001b[0mkwargs.setdefault(\u001b[33m\"\u001b[0m\u001b[33mdump_header\u001b[0m\u001b[33m\"\u001b[0m, [\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mlightning.pytorch==\u001b[0m\u001b[33m{\u001b[0mpl.__v \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m370 \u001b[2m│ │ \u001b[0mparser = LightningArgumentParser(**kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m371 \u001b[0m\u001b[2m│ │ \u001b[0mparser.add_argument( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m372 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33m-c\u001b[0m\u001b[33m\"\u001b[0m, \u001b[33m\"\u001b[0m\u001b[33m--config\u001b[0m\u001b[33m\"\u001b[0m, action=ActionConfigFile, help=\u001b[33m\"\u001b[0m\u001b[33mPath to a\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m373 \u001b[0m\u001b[2m│ │ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m94\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 91 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 92 \u001b[0m\u001b[2;33m│ │ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 93 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m _JSONARGPARSE_SIGNATURES_AVAILABLE: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 94 \u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mModuleNotFoundError\u001b[0m( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 95 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m{\u001b[0m_JSONARGPARSE_SIGNATURES_AVAILABLE\u001b[33m}\u001b[0m\u001b[33m. Try `pip insta\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 96 \u001b[0m\u001b[2m│ │ │ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 97 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96msuper\u001b[0m().\u001b[92m__init__\u001b[0m(*args, description=description, env_prefix=en \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mModuleNotFoundError: \u001b[0mDistributionNotFound: The \n", + "\u001b[32m'jsonargparse\u001b[0m\u001b[32m[\u001b[0m\u001b[32msignatures\u001b[0m\u001b[32m]\u001b[0m\u001b[32m>=4.17.0'\u001b[0m distribution was not found and is required by\n", + "the application. HINT: Try running `pip install -U \n", + "\u001b[32m'jsonargparse\u001b[0m\u001b[32m[\u001b[0m\u001b[32msignatures\u001b[0m\u001b[32m]\u001b[0m\u001b[32m>=4.17.0'\u001b[0m`. Try `pip install -U \n", + "\u001b[32m'jsonargparse\u001b[0m\u001b[32m[\u001b[0m\u001b[32msignatures\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m`.\n" + ] + } + ], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{CONFIG_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-1024 (train-ctx=1024, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1024/\" \\\n", + " --model.lr_init=5e-4 \\\n", + " --model.lr_final=4e-4 \\\n", + " --data.max_token_size=1024 \\\n", + " --model.ctx_len=1024 \\\n", + " --model.bptt_learning_range=1 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1024/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-1024.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../models/{FILENAME_PREFIX}-mem-ctx-1024.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!python3 ../../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-1024.pth\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": 
{ + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
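Note on the generated data (not part of the diff above): generate_math_and_name_dataset.py writes one {"prompt": ..., "completion": ...} JSON object per line, and config-mem-template.yaml consumes that file via `source: json`. Below is a minimal sanity-check sketch for such a file, assuming the HuggingFace `datasets` package is installed; the path is illustrative and should point at whatever `--out-file` you actually generated.

import json

from datasets import load_dataset

# Illustrative path; matches the --out-file used in the notebook cells above
JSONL_PATH = "../dataset/questions_numbers_1024.jsonl"

# Each line should parse as standalone JSON with exactly the two fields
# emitted by the generator script.
with open(JSONL_PATH) as f:
    for line in f:
        sample = json.loads(line)
        assert set(sample) == {"prompt", "completion"}

# The trainer's `source: json` path relies on the HuggingFace json loader,
# so the same file should also load cleanly here.
ds = load_dataset("json", data_files=JSONL_PATH, split="train")
print(len(ds), "samples; first completion:", ds[0]["completion"].strip())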