From fe597e3a9e9520a49f902c3c4cc0ea56834bcd9f Mon Sep 17 00:00:00 2001
From: "@picocreator (Eugene Cheah)"
Date: Mon, 28 Aug 2023 05:26:56 +0000
Subject: [PATCH] lets go - smaller, test the limits of how small v5 can go with L6 layers

---
 .../v5-small-model/config-enwiki-4k.yaml      | 221 ++++++
 .../config-enwiki-instruct.yaml               | 219 ++++++
 .../v5-L6-ctx4k-models-part1.ipynb            | 700 ++++++++++++++++++
 3 files changed, 1140 insertions(+)
 create mode 100644 notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-4k.yaml
 create mode 100644 notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-instruct.yaml
 create mode 100644 notebook/experiment/rwkv-x-exp/v5-small-model/v5-L6-ctx4k-models-part1.ipynb

diff --git a/notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-4k.yaml b/notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-4k.yaml
new file mode 100644
index 00000000..eb8a25b0
--- /dev/null
+++ b/notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-4k.yaml
@@ -0,0 +1,221 @@
+# lightning.pytorch==2.0.2
+seed_everything: true
+trainer:
+  # Configure the number of GPUs available on your machine
+  accelerator: gpu
+  devices: auto
+  num_nodes: 1
+
+  #
+  # Configure the deepspeed strategy
+  #
+  strategy: deepspeed_stage_1
+
+  # Logger setting for wandb; if you want to enable wandb, uncomment the whole logger section
+  # ---
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      name: 'Enwiki foundation (train-ctx=4k)'
+      project: 'RWKV-X-Experiments'
+      tags: ['RWKV', 'RWKV-X']
+
+  # Checkpoint settings for the training process
+  callbacks:
+    class_path: lightning.pytorch.callbacks.ModelCheckpoint
+    init_args:
+      # Configure this to the path you want to save your checkpoints to;
+      # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
+      #
+      # to convert a checkpoint to a model, you can use the
+      # `python3 export_checkpoint.py ` script,
+      # which will create a `rwkv_model.pth` in the checkpoint directory.
+      #
+      # Do not use the `zero_to_fp32.py` script, as that will have export format issues
+      dirpath: ../checkpoint/small-enwiki-4k/
+      filename: null
+
+      # Save the top/last K checkpoints
+      save_top_k: 2
+      # Choose by the most recent checkpoints (time based)
+      monitor: 'step'
+      mode: max
+
+      # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt',
+      # useful to simplify checkpoint resume scripts, at the price of disk performance
+      save_last: true
+
+      # DO NOT set this to true, as the exported model weights will have format issues;
+      # export as a checkpoint, and use the `export_checkpoint.py` script to convert it to a model instead
+      save_weights_only: false
+
+      # How frequently you want to save a checkpoint, in training steps.
+      # This will happen for every X data samples, where X = every_n_train_steps * accumulate_grad_batches
+      #
+      # In general you will want to avoid setting a low number (especially if accumulate_grad_batches <= 100),
+      # as the checkpointing process will pause all GPU training for some time, slowing down the overall run.
+      # However, you do not want too high a number either, or you will lose too much progress if the training crashes
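+      #
+      # For illustration only (8 is a hypothetical accumulate_grad_batches value): with
+      # every_n_train_steps = 100 and accumulate_grad_batches = 8, a checkpoint would be
+      # written roughly every 100 * 8 = 800 data samples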
+      every_n_train_steps: 100
+      every_n_epochs: null
+      save_on_train_epoch_end: true
+      train_time_interval: null
+
+      # Other settings, you can probably leave alone
+      verbose: false
+      auto_insert_metric_name: true
+
+  ########################################
+  ## Training run parameter settings
+  ########################################
+
+  # Generally what you want to configure is the maximum number of epochs
+  # If you leave it as -1, it will keep going forever until interrupted
+  # Or set it as a number, and it will stop after that number of epochs
+  max_epochs: 1
+  min_epochs: null
+
+  # Number of data samples to train on for each step; a data sample is considered
+  # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
+  #
+  # This decides the number of data samples to learn from together, before backpropagating
+  # any weight changes at the end of the batch.
+  #
+  # Recommended to be a large enough number (like 128/256) to prevent the training
+  # loss from fluctuating in the process, but not so big that the increased
+  # GPU vRAM / offloaded RAM usage will cause the training to crash.
+  #
+  # You are also recommended to configure this to a large enough number to fully utilize
+  # your GPU processing time, and avoid idle time for the GPU between batches
+  target_batch_size: 32
+
+########################################
+## Training model settings
+########################################
+model:
+  # Model to start the finetune/training process from
+  load_model: ../model/V5-Base-1B5-Enwiki-4k.pth
+
+  # Context length to use for the training process;
+  # the larger the number (and batch size), the larger the vram usage
+  #
+  # Note that if the data sample's context length is larger than the ctx_len,
+  # its training process will be split into ctx_len sized chunks.
+  #
+  # This allows the training of extremely large context lengths (eg. 100k),
+  # without eating up too much vram, by keeping the training context length
+  # at a reasonable number suitable for the current GPU setup
+  ctx_len: 4096
+
+  # Learning rate of the training process
+  # ---
+  # Initial learning rate of the process
+  lr_init: 6e-4
+  # Final learning rate after the learning rate period;
+  # the learning rate will stay at the final value from then onwards
+  #
+  # NOTE: lr_final / lr_period does not work with warmup_steps,
+  # and will be ignored (or replaced) with the warmup_steps logic instead
+  lr_final: 4e-4
+  # Number of epochs to reduce the learning rate from lr_init to lr_final
+  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
+  # 0 means lr_final will apply immediately
+  # -1 means we take the current max_step / max_epoch as the period
+  lr_period: 1
+  # lr_period type, if it is set; defaults to epoch
+  lr_period_type: epoch
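+
+  # For illustration, with the values above (lr_init = 6e-4, lr_final = 4e-4, lr_period = 1 epoch),
+  # the learning rate decays from 6e-4 to 4e-4 over the first epoch and stays at 4e-4 from then onwards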
+
+  # We disable bptt / limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
+  # (by skipping some synchronization code). Additionally, bptt learning should not be triggering
+  # anyway, as the data sample should be within the ctx size 99% of the time
+  bptt_learning: true
+  bptt_learning_range: -1
+
+data:
+  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
+  #
+  # Use this if you have built your own dataset and saved it with `save_to_disk()`,
+  # with source left as null. Otherwise, configure this to a directory in which the
+  # dataset will be built and tokenized by the huggingface dataset process.
+  data_path: ../datapath/enwiki_100k_16k/
+
+  # Otherwise provide the source path, which is used as the huggingface dataset path;
+  # this will be used to populate the dataset_path
+  #
+  # Use one of the following
+  # - hugging face dataset
+  # - Directory path to a directory containing dataset files
+  # - Path to a single dataset file
+  # - hugging face dataset mode (ie: text, csv, etc - then use data_dir to configure the path)
+  # - null
+  #
+  # If source is disabled, all other params, except data_path, are ignored
+  source: "teven/enwiki_100k"
+  # source: text
+  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
+
+  # Use data_dir if you are using source=text/json/etc;
+  # this should be relative to the trainer script path
+  source_data_dir: null
+
+  # After loading the dataset, split out test data used for validation.
+  # This process is skipped if the dataset includes a test split
+  # This process is skipped if set to zero
+  test_split: 0.005
+  test_split_shuffle: true
+
+  # Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
+  # If using a custom tokenizer, provide the tokenizer file path
+  # ---
+  tokenizer: neox
+
+  # Minimum / Maximum token size of the data samples to use,
+  # useful for filtering out small noisy data samples from large datasets
+  # (eg. removal of small articles of less than 512 tokens from wikipedia)
+  #
+  # This is ignored if set to -1
+  min_token_size: 1024
+  max_token_size: -1
+
+  # Rechunking of the text dataset; this is done only when source is set as 'text',
+  # and will merge the various sentences into larger chunks up to the target size
+  #
+  # Defaults to 4096
+  #
+  # This is ignored if source is not set as text
+  # This is ignored if set to zero
+  # ---
+  text_rechunk_size: 4096
+
+  # Apply text rechunk to the dataset, even if it's not a 'text' source
+  # This is done only after dataset filtering, and if source is not 'text'
+  # ---
+  text_rechunk_force: True
+
+  # Custom text column to use, useful for datasets with alternative training column labels
+  # This is checked before multi column merging; default is null (disabled)
+  # eg: 'code'
+  # ---
+  # custom_text_key: 'code'
+
+  # Multi column merging process; the default setting is used to support and merge
+  # "instruction", "input", "output" datasets.
+  # To disable, set multi_column_keys to []
+  #
+  # A minimum of 2 columns, with non-empty data, is required for the merge to occur.
+  # If no match is found, this will fall back to the default prompt/completion or text column,
+  # or throw an error if the default fallback is not found
+  # ---
+  # multi_column_keys: ['instruction', 'input', 'output']
+  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
+  # multi_column_train_mask: [true, false, true]
+  # multi_column_seperator: '\n\n'
+
+  # If processing prompt/completion jsonl pairs, the prompt is masked by default;
+  # use this flag to disable this default behaviour
+  # ---
+  # disable_prompt_mask: false
+
+# Path to the current checkpoint to continue training from
+# Enable this to the last checkpoint after the first run
+# (if it crashed and you want to resume)
+# ckpt_path: ../checkpoint/V5-Base-1B5-enwiki/epoch=0-step=2500.ckpt
+ckpt_path: null
diff --git a/notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-instruct.yaml b/notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-instruct.yaml
new file mode 100644
index 00000000..59354ae8
--- /dev/null
+++ b/notebook/experiment/rwkv-x-exp/v5-small-model/config-enwiki-instruct.yaml
@@ -0,0 +1,219 @@
+# lightning.pytorch==2.0.2
+seed_everything: true
+trainer:
+  # Configure the number of GPUs available on your machine
+  accelerator: gpu
+  devices: auto
+  num_nodes: 1
+
+  #
+  # Configure the deepspeed strategy
+  #
+  strategy: deepspeed_stage_1
+
+  # Floating point precision for the model; because RWKV is built FOR bf16,
+  # you should pretty much never change this setting
+  precision: bf16
+
+  # Logger setting for wandb; if you want to enable wandb, uncomment the whole logger section
+  # ---
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      name: 'Enwiki Instruct (train-ctx=4096)'
+      project: 'RWKV-X-Experiments'
+      tags: ['RWKV', 'RWKV-X']
+
+  # Checkpoint settings for the training process
+  callbacks:
+    class_path: lightning.pytorch.callbacks.ModelCheckpoint
+    init_args:
+      # Configure this to the path you want to save your checkpoints to;
+      # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
+      #
+      # to convert a checkpoint to a model, you can use the
+      # `python3 export_checkpoint.py ` script,
+      # which will create a `rwkv_model.pth` in the checkpoint directory.
+      #
+      # Do not use the `zero_to_fp32.py` script, as that will have export format issues
+      dirpath: ../checkpoint/small-enwiki-instruct/
+      filename: null
+
+      # Save the top/last K checkpoints
+      save_top_k: 2
+      # Choose by the most recent checkpoints (time based)
+      monitor: 'step'
+      mode: max
+
+      # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt',
+      # useful to simplify checkpoint resume scripts, at the price of disk performance
+      save_last: true
+
+      # DO NOT set this to true, as the exported model weights will have format issues;
+      # export as a checkpoint, and use the `export_checkpoint.py` script to convert it to a model instead
+      save_weights_only: false
+
+      # How frequently you want to save a checkpoint, in training steps.
+      # This will happen for every X data samples, where X = every_n_train_steps * accumulate_grad_batches
+      #
+      # In general you will want to avoid setting a low number (especially if accumulate_grad_batches <= 100),
+      # as the checkpointing process will pause all GPU training for some time, slowing down the overall run.
+      # However, you do not want too high a number either, or you will lose too much progress if the training crashes
+      every_n_train_steps: 100
+      every_n_epochs: null
+      save_on_train_epoch_end: true
+      train_time_interval: null
+
+      # Other settings, you can probably leave alone
+      verbose: false
+      auto_insert_metric_name: true
+
+  ########################################
+  ## Training run parameter settings
+  ########################################
+
+  # Generally what you want to configure is the maximum number of epochs
+  # Leave it as -1 and it will keep going forever until interrupted,
+  # or set it as a number and it will stop after that number of epochs
+  max_epochs: 1
+  min_epochs: null
+
+  # Number of data samples to train on for each step; a data sample is considered
+  # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
+  #
+  # This decides the number of data samples to learn from together, before backpropagating
+  # any weight changes at the end of the batch.
+  #
+  # Recommended to be a large enough number (like 128/256) to prevent the training
+  # loss from fluctuating in the process, but not so big that the increased
+  # GPU vRAM / offloaded RAM usage will cause the training to crash.
+  #
+  # You are also recommended to configure this to a large enough number to fully utilize
+  # your GPU processing time, and avoid idle time for the GPU between batches
+  target_batch_size: 32
+
+########################################
+## Training model settings
+########################################
+model:
+  # Model to start the finetune/training process from
+  load_model: ../model/EWR-1B5-enwiki-16k.pth
+
+  # Context length to use for the training process;
+  # the larger the number (and batch size), the larger the vram usage
+  #
+  # Note that if the data sample's context length is larger than the ctx_len,
+  # its training process will be split into ctx_len sized chunks.
+  #
+  # This allows the training of extremely large context lengths (eg. 100k),
+  # without eating up too much vram, by keeping the training context length
+  # at a reasonable number suitable for the current GPU setup
+  ctx_len: 4096
+
+  # Learning rate of the training process
+  # ---
+  # Initial learning rate of the process
+  lr_init: 4e-4
+  # Final learning rate after the learning rate period;
+  # the learning rate will stay at the final value from then onwards
+  #
+  # NOTE: lr_final / lr_period does not work with warmup_steps,
+  # and will be ignored (or replaced) with the warmup_steps logic instead
+  lr_final: 3e-4
+  # Number of epochs to reduce the learning rate from lr_init to lr_final
+  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
+  # 0 means lr_final will apply immediately
+  # -1 means we take the current max_step / max_epoch as the period
+  lr_period: 1
+  # lr_period type, if it is set; defaults to epoch
+  lr_period_type: epoch
+
+  # We disable bptt / limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
+  # (by skipping some synchronization code). Additionally, bptt learning should not be triggering
+  # anyway, as the data sample should be within the ctx size 99% of the time
+  bptt_learning: true
+  bptt_learning_range: 1
+
+data:
+  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
+  #
+  # Use this if you have built your own dataset and saved it with `save_to_disk()`,
+  # with source left as null. Otherwise, configure this to a directory in which the
+  # dataset will be built and tokenized by the huggingface dataset process.
+  data_path: ../datapath/dolly-15k-instruction-alpaca-format/
+
+  # Otherwise provide the source path, which is used as the huggingface dataset path;
+  # this will be used to populate the dataset_path
+  #
+  # Use one of the following
+  # - hugging face dataset
+  # - Directory path to a directory containing dataset files
+  # - Path to a single dataset file
+  # - hugging face dataset mode (ie: text, csv, etc - then use data_dir to configure the path)
+  # - null
+  #
+  # If source is disabled, all other params, except data_path, are ignored
+  source: "c-s-ale/dolly-15k-instruction-alpaca-format"
+  # source: text
+  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
+
+  # Use data_dir if you are using source=text/json/etc;
+  # this should be relative to the trainer script path
+  source_data_dir: null
+
+  # After loading the dataset, split out test data used for validation.
+  # This process is skipped if the dataset includes a test split
+  # This process is skipped if set to zero
+  test_split: 0.005
+  test_split_shuffle: true
+
+  # Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
+  # If using a custom tokenizer, provide the tokenizer file path
+  # ---
+  tokenizer: neox
+
+  # Minimum / Maximum token size of the data samples to use,
+  # useful for filtering out small noisy data samples from large datasets
+  # (eg. removal of small articles of less than 512 tokens from wikipedia)
+  #
+  # This is ignored if set to -1
+  min_token_size: -1
+  max_token_size: 4096
+
+  # Rechunking of the text dataset; this is done only when source is set as 'text',
+  # and will merge the various sentences into larger chunks up to the target size
+  #
+  # Defaults to 4096
+  #
+  # This is ignored if source is not set as text
+  # This is ignored if set to zero
+  # ---
+  # text_rechunk_size: 2048
+
+  # Apply text rechunk to the dataset, even if it's not a 'text' source
+  # This is done only after dataset filtering, and if source is not 'text'
+  # ---
+  # text_rechunk_force: false
+
+  # Custom text column to use, useful for datasets with alternative training column labels
+  # This is checked before multi column merging; default is null (disabled)
+  # eg: 'code'
+  # ---
+  # custom_text_key: 'code'
+
+  # Multi column merging process; the default setting is used to support and merge
+  # "instruction", "input", "output" datasets. 
To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + multi_column_keys: ['instruction', 'input', 'output'] + multi_column_prefix: ['# Instruction:\n', '# Context:\n', '# Answer:\n'] + multi_column_train_mask: [true, false, true] + multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_mask: false diff --git a/notebook/experiment/rwkv-x-exp/v5-small-model/v5-L6-ctx4k-models-part1.ipynb b/notebook/experiment/rwkv-x-exp/v5-small-model/v5-L6-ctx4k-models-part1.ipynb new file mode 100644 index 00000000..e5f669c0 --- /dev/null +++ b/notebook/experiment/rwkv-x-exp/v5-small-model/v5-L6-ctx4k-models-part1.ipynb @@ -0,0 +1,700 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RWKV v5 Small model experiment\n", + "\n", + "- 6 layers\n", + "- 2048 embedding size\n", + "\n", + "A series of small model training expriments, to see \"how small can we go\" for v5\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!mkdir -p ../../../../model/\n", + "!mkdir -p ../../../../datapath/\n", + "!mkdir -p ../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=6\n", + "EMBED_DIM=2048\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Small Model: L6-D64\n", + "\n", + "### Enwiki training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EMBED_DIM=64\n", + "WANDB_PREFIX=f\"[small-model] v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": 
[ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", + " --emb-scale \"{EMBED_SCALE}\" \\\n", + " --vocab_size neox --skip-if-exists \\\n", + " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the foundation model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-init.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the instruct finetuning\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" 
\"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Small Model: L6-D256\n", + "\n", + "### Enwiki training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EMBED_DIM=256\n", + "WANDB_PREFIX=f\"[small-model] v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", + " --emb-scale \"{EMBED_SCALE}\" \\\n", + " --vocab_size neox --skip-if-exists \\\n", + " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the foundation model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-init.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the 
instruct finetuning\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Small Model: L6-D512\n", + "\n", + "### Enwiki training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EMBED_DIM=512\n", + "WANDB_PREFIX=f\"[small-model] v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", + " --emb-scale \"{EMBED_SCALE}\" \\\n", + " --vocab_size neox --skip-if-exists \\\n", + " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the foundation model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-init.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py 
\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the instruct finetuning\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Small Model: L6-D1024\n", + "\n", + "### Enwiki training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EMBED_DIM=1024\n", + "WANDB_PREFIX=f\"[small-model] v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer \"{LAYER_COUNT}\" --n_embd \"{EMBED_DIM}\" \\\n", + " --emb-scale \"{EMBED_SCALE}\" \\\n", + " --vocab_size neox --skip-if-exists \\\n", + " \"../model/{FILENAME_PREFIX}-neox-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [ + "# Start the foundation model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-init.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the instruct finetuning\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}