diff --git a/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part2.ipynb b/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part2.ipynb index a8841971..fe861acc 100644 --- a/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part2.ipynb +++ b/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part2.ipynb @@ -98,7 +98,7 @@ "source": [ "# Download the model directly (stop gap till HF sync issues is resolved)\n", "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", - " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-memory/{FILENAME_PREFIX}-mem-ctx-512.pth\"\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-memory/{FILENAME_PREFIX}-enwiki-4k.pth\"\n", "\n", "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", " ls -alh ." @@ -459,426 +459,6 @@ "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tune 3 : Low ctx size (1024), memory training\n", - "\n", - "- Tune 3: Low ctx size (1024), Scaling up !" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script bash\n", - "\n", - "########################################\n", - "# Generate the required jsonl dataset\n", - "########################################\n", - "\n", - "# Reset the dataset dir\n", - "mkdir -p ../dataset\n", - "rm -rf ../dataset/*.jsonl\n", - "\n", - "# Generate the various datasets\n", - "echo \"## Generating word reptition dataset ##\"\n", - "\n", - "#\n", - "# We reduce the training set for lower word count - and shift the focus upwards\n", - "#\n", - "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 400 &\n", - "for i in {5..45..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 400 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 10 & \n", - "done\n", - "\n", - "#\n", - "# Ramping up the 50+ - 510 words dataset\n", - "# \n", - "for i in {50..550..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 800 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", - "done\n", - "\n", - "wait\n", - "echo \"## Done ##\"\n", - "\n", - "ls -alh ../dataset/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start the finetune model training\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python lightning_trainer.py fit \\\n", - " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", - " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-1k (train-ctx=1k, {DEEPSPEED_STRAT})\" \\\n", - " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", - " --trainer.devices=\"{GPU_DEVICES}\" \\\n", - " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/\" \\\n", - " --model.lr_init=4e-4 \\\n", - " --model.lr_final=2e-4 \\\n", - " --data.max_token_size=1024 \\\n", - " 
--model.ctx_len=1024 \\\n", - " --model.bptt_learning_range=1 \\\n", - " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets export the model from the checkpoint\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " python export_checkpoint.py \\\n", - " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/last.ckpt\" \\\n", - " \"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\" \"bf16\"\n", - "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets do a quick memory test\n", - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tune 4 : Low ctx size (2048), memory training\n", - "\n", - "- Tune 4: Low ctx size (2048), Scaling up !" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script bash\n", - "\n", - "########################################\n", - "# Generate the required jsonl dataset\n", - "########################################\n", - "\n", - "# Reset the dataset dir\n", - "mkdir -p ../dataset\n", - "rm -rf ../dataset/*.jsonl\n", - "\n", - "# Generate the various datasets\n", - "echo \"## Generating word reptition dataset ##\"\n", - "\n", - "#\n", - "# We reduce the training set for lower word count - and shift the focus upwards\n", - "#\n", - "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 100 &\n", - "for i in {5..100..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 100 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", - "done\n", - "\n", - "#\n", - "# Ramping up the 105+ - 1050 words dataset\n", - "# \n", - "for i in {105..2000..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 200 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", - "done\n", - "\n", - "wait\n", - "echo \"## Done ##\"\n", - "\n", - "ls -alh ../dataset/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start the finetune model training\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python lightning_trainer.py fit \\\n", - " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", - " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-2k (train-ctx=2k, {DEEPSPEED_STRAT})\" \\\n", - " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", - " --trainer.devices=\"{GPU_DEVICES}\" \\\n", - " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/\" \\\n", - " --model.lr_init=3e-4 \\\n", - " --model.lr_final=1e-4 \\\n", - " --data.max_token_size=2048 \\\n", - " --model.ctx_len=2048 \\\n", - " --model.bptt_learning_range=1 \\\n", - " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets export the model from the checkpoint\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " python export_checkpoint.py \\\n", - " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/last.ckpt\" \\\n", - " \"../model/{FILENAME_PREFIX}-mem-ctx-2k.pth\" \"bf16\"\n", - "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-2k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets do a quick memory test\n", - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-2k.pth\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tune 5 : Ramping up the ctx size (4096), memory training\n", - "\n", - "- Tune 5: Mid ctx size (4096), Scaling up!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script bash\n", - "\n", - "########################################\n", - "# Generate the required jsonl dataset\n", - "########################################\n", - "\n", - "# Reset the dataset dir\n", - "mkdir -p ../dataset\n", - "rm -rf ../dataset/*.jsonl\n", - "\n", - "# Generate the various datasets\n", - "echo \"## Generating word reptition dataset ##\"\n", - "\n", - "#\n", - "# We reduce the training set for < 50 words - and shift the focus upwards\n", - "# (aka 50-100 token * 2 : ~100 - 250 token ctx len)\n", - "#\n", - "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 100 &\n", - "for i in {5..500..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 100 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", - "done\n", - "\n", - "#\n", - "# Ramping up the 50+ - 2100 words dataset\n", - "# \n", - "for i in {505..4000..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 200 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", - "done\n", - "\n", - "wait\n", - "echo \"## Done ##\"\n", - "\n", - "ls -alh ../dataset/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start the finetune model training\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python lightning_trainer.py fit \\\n", - " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", - " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-4k (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", - " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", - " --trainer.devices=\"{GPU_DEVICES}\" \\\n", - " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/\" \\\n", - " --model.lr_init=3e-4 \\\n", - " --model.lr_final=1e-4 \\\n", - " --data.max_token_size=4096 \\\n", - " --model.ctx_len=4096 \\\n", - " --model.bptt_learning_range=1 \\\n", - " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets export the model from the 
checkpoint\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " python export_checkpoint.py \\\n", - " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/last.ckpt\" \\\n", - " \"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\" \"bf16\"\n", - "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets do a quick memory test\n", - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tune 6 : Ramping up the ctx size (8192), memory training\n", - "\n", - "- Tune 6: Large ctx size (8192), Scaling up!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%script bash\n", - "\n", - "########################################\n", - "# Generate the required jsonl dataset\n", - "########################################\n", - "\n", - "# Reset the dataset dir\n", - "mkdir -p ../dataset\n", - "rm -rf ../dataset/*.jsonl\n", - "\n", - "# Generate the various datasets\n", - "echo \"## Generating word reptition dataset ##\"\n", - "\n", - "#\n", - "# We reduce the training set for < 50 words - and shift the focus upwards\n", - "# (aka 50-100 token * 2 : ~100 - 250 token ctx len)\n", - "#\n", - "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 50 &\n", - "for i in {5..1000..5} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 50 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", - "done\n", - "\n", - "#\n", - "# Ramping up the 50+ - 4200 words dataset\n", - "# \n", - "for i in {1100..8000..100} \n", - "do\n", - " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 2000 & \n", - " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", - "done\n", - "\n", - "wait\n", - "echo \"## Done ##\"\n", - "\n", - "ls -lh ../dataset/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Start the finetune model training\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python lightning_trainer.py fit \\\n", - " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", - " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-8k (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", - " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", - " --trainer.devices=\"{GPU_DEVICES}\" \\\n", - " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/\" \\\n", - " --model.lr_init=3e-4 \\\n", - " --model.lr_final=1e-4 \\\n", - " --data.max_token_size=8192 \\\n", - " --model.ctx_len=4096 \\\n", - " --model.bptt_learning_range=2 \\\n", - " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets export the model from the checkpoint\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " python export_checkpoint.py \\\n", - " 
\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/last.ckpt\" \\\n", - " \"../model/{FILENAME_PREFIX}-mem-ctx-8k.pth\" \"bf16\"\n", - "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-8k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lets do a quick memory test\n", - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth\" \"none\" 1000 4000" - ] } ], "metadata": { diff --git a/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part3.ipynb b/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part3.ipynb new file mode 100644 index 00000000..a8841971 --- /dev/null +++ b/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part3.ipynb @@ -0,0 +1,905 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RWKV v5 / embedding init-range 1e-01 / 4k\n", + "\n", + "- 96 layers\n", + "- 1024 embedding size\n", + "\n", + "Going through the modified memory training for v5 models, across various initial embedding model weights\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!mkdir -p ../../../../model/\n", + "!mkdir -p ../../../../datapath/\n", + "!mkdir -p ../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=96\n", + "EMBED_DIM=1024\n", + "\n", + "# Wavnet compatibility?\n", + "RWKV_WAVENET_LAYERS=0\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "WANDB_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + 
"print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-memory/{FILENAME_PREFIX}-mem-ctx-512.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " ls -alh ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Enwiki Stage 1 : Foundation 4k model training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the foundation model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-4k.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Foundation (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-neox-init.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-4k/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-4k.pth\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Enwiki Stage 2 : Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets preload the requried dataset\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the instruct finetuning\n", + "!cd \"{TRAINER_DIR}\" && 
\\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-enwiki-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-Instruct (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-4k.pth\" \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"../checkpoint/{FILENAME_PREFIX}-enwiki-instruct/last.ckpt\" \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Lets do a quick dragon prompt validation\n", + "!cd \"{INFERENCE_DIR}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 1 : Simple Memory instruct finetuning\n", + "\n", + "- Tune 1: Low ctx size (512), Training with only the input masked. This does very limited memory training, and is used primarily to train the instruction set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "# We do a strong bias for smaller word count, to teach the concept from scratch\n", + "# so that the model can learn the function. 
\n", + "#\n", + "# Note that all document samples, are randomized between the target word count, \n", + "# to half of the target word count.\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-2-count.jsonl 2 5000 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-5-count.jsonl 5 5000 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-10-count.jsonl 10 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-15-count.jsonl 15 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-20-count.jsonl 20 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-25-count.jsonl 25 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-40-count.jsonl 40 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-50-count.jsonl 50 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-60-count.jsonl 80 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-80-count.jsonl 80 2500 &\n", + "\n", + "# With a slight mix of the larger word count\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-100-count.jsonl 100 2500 &\n", + "python ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-200-count.jsonl 200 2500 &\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-mem-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \\\n", + " --model.ctx_len=512 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-instruct/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 2 : Low ctx size (512), memory training\n", + "\n", + "- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%script bash\n",
+ "\n",
+ "########################################\n",
+ "# Generate the required jsonl dataset\n",
+ "########################################\n",
+ "\n",
+ "# Reset the dataset dir\n",
+ "mkdir -p ../dataset\n",
+ "rm -rf ../dataset/*.jsonl\n",
+ "\n",
+ "# Generate the various datasets\n",
+ "echo \"## Generating word repetition dataset ##\"\n",
+ "\n",
+ "#\n",
+ "# We switch over to fully masked instruct+input, to properly learn the memorization task\n",
+ "#\n",
+ "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 5000 &\n",
+ "for i in {5..95..5} \n",
+ "do\n",
+ " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 5000 & \n",
+ "done\n",
+ "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-100-count.jsonl 100 5000 &\n",
+ "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-200-count.jsonl 200 5000 &\n",
+ "\n",
+ "#\n",
+ "# We mix in the shuffled word list, so that we ensure all words / tokens are learned;\n",
+ "# however this might introduce an exclusion bias (if a word was seen, never repeat it), \n",
+ "# so we limit the mixture of these data samples\n",
+ "#\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-10-count.jsonl 10 20 &\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-15-count.jsonl 15 20 &\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-25-count.jsonl 25 30 &\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-50-count.jsonl 50 50 &\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-75-count.jsonl 75 50 &\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-100-count.jsonl 100 50 &\n",
+ "python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-200-count.jsonl 200 50 &\n",
+ "\n",
+ "wait\n",
+ "echo \"## Done ##\"\n",
+ "\n",
+ "ls -alh ../dataset/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Start the finetune model training\n",
+ "!cd \"{TRAINER_DIR}\" && \\\n",
+ " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
+ " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
+ " python lightning_trainer.py fit \\\n",
+ " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n",
+ " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-512 (train-ctx=512, {DEEPSPEED_STRAT})\" \\\n",
+ " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
+ " --trainer.devices=\"{GPU_DEVICES}\" \\\n",
+ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/\" \\\n",
+ " --model.lr_init=5e-4 \\\n",
+ " --model.lr_final=4e-4 \\\n",
+ " --data.max_token_size=512 \\\n",
+ " --model.ctx_len=512 \\\n",
+ " --model.bptt_learning_range=1 \\\n",
+ " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-instruct.pth\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Let's export the model from the checkpoint\n",
+ "!cd \"{TRAINER_DIR}\" && \\\n",
+ " python export_checkpoint.py \\\n",
+ " 
\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-512.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 3 : Low ctx size (1024), memory training\n", + "\n", + "- Tune 3: Low ctx size (1024), Scaling up !" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for lower word count - and shift the focus upwards\n", + "#\n", + "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 400 &\n", + "for i in {5..45..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 400 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 10 & \n", + "done\n", + "\n", + "#\n", + "# Ramping up the 50+ - 510 words dataset\n", + "# \n", + "for i in {50..550..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 800 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", + "done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-1k (train-ctx=1k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/\" \\\n", + " --model.lr_init=4e-4 \\\n", + " --model.lr_final=2e-4 \\\n", + " --data.max_token_size=1024 \\\n", + " --model.ctx_len=1024 \\\n", + " --model.bptt_learning_range=1 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1k/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh 
\"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 4 : Low ctx size (2048), memory training\n", + "\n", + "- Tune 4: Low ctx size (2048), Scaling up !" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for lower word count - and shift the focus upwards\n", + "#\n", + "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 100 &\n", + "for i in {5..100..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 100 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", + "done\n", + "\n", + "#\n", + "# Ramping up the 105+ - 1050 words dataset\n", + "# \n", + "for i in {105..2000..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 200 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", + "done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-2k (train-ctx=2k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/\" \\\n", + " --model.lr_init=3e-4 \\\n", + " --model.lr_final=1e-4 \\\n", + " --data.max_token_size=2048 \\\n", + " --model.ctx_len=2048 \\\n", + " --model.bptt_learning_range=1 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-2k/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-2k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-2k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick 
memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-2k.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 5 : Ramping up the ctx size (4096), memory training\n", + "\n", + "- Tune 5: Mid ctx size (4096), Scaling up!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for < 50 words - and shift the focus upwards\n", + "# (aka 50-100 token * 2 : ~100 - 250 token ctx len)\n", + "#\n", + "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 100 &\n", + "for i in {5..500..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 100 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", + "done\n", + "\n", + "#\n", + "# Ramping up the 50+ - 2100 words dataset\n", + "# \n", + "for i in {505..4000..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 200 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", + "done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-4k (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/\" \\\n", + " --model.lr_init=3e-4 \\\n", + " --model.lr_final=1e-4 \\\n", + " --data.max_token_size=4096 \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=1 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-1k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-4k/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 
../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 6 : Ramping up the ctx size (8192), memory training\n", + "\n", + "- Tune 6: Large ctx size (8192), Scaling up!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for < 50 words - and shift the focus upwards\n", + "# (aka 50-100 token * 2 : ~100 - 250 token ctx len)\n", + "#\n", + "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 50 &\n", + "for i in {5..1000..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 50 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", + "done\n", + "\n", + "#\n", + "# Ramping up the 50+ - 4200 words dataset\n", + "# \n", + "for i in {1100..8000..100} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 2000 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", + "done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -lh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-8k (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/\" \\\n", + " --model.lr_init=3e-4 \\\n", + " --model.lr_final=1e-4 \\\n", + " --data.max_token_size=8192 \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=2 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-8k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-ctx-8k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth\"" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth\" \"none\" 1000 4000" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part4.ipynb b/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part4.ipynb new file mode 100644 index 00000000..76ee505d --- /dev/null +++ b/notebook/experiment/rwkv-x-exp/v5-memory/v5-L96-D1024-E1e-1-ctx4k-part4.ipynb @@ -0,0 +1,242 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RWKV v5 / embedding init-range 1e-01 / 4k\n", + "\n", + "- 96 layers\n", + "- 1024 embedding size\n", + "\n", + "Going through the modified memory training for v5 models, across various initial embedding model weights\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!mkdir -p ../../../../model/\n", + "!mkdir -p ../../../../datapath/\n", + "!mkdir -p ../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=96\n", + "EMBED_DIM=1024\n", + "\n", + "# Wavnet compatibility?\n", + "RWKV_WAVENET_LAYERS=0\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "WANDB_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-memory/{FILENAME_PREFIX}-mem-ctx-4k.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " ls -alh ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 6 : Ramping up the ctx size (8192), memory training\n", + "\n", + "- Tune 6: Large ctx size (8192), Scaling up!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating word reptition dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for < 50 words - and shift the focus upwards\n", + "# (aka 50-100 token * 2 : ~100 - 250 token ctx len)\n", + "#\n", + "python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/word-2-count.jsonl 2 50 &\n", + "for i in {5..1000..5} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 50 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 1 & \n", + "done\n", + "\n", + "#\n", + "# Ramping up the 50+ - 4200 words dataset\n", + "# \n", + "for i in {1100..8000..100} \n", + "do\n", + " python ../memory_script/gen_limited_prompt_completion_jsonl.py ../dataset/gen-word-$i-count.jsonl $i 2000 & \n", + " python ../memory_script/shuffle_limited_prompt_completion_jsonl.py ../dataset/shuffle-word-$i-count.jsonl $i 20 & \n", + "done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -lh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-8k (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/\" \\\n", + " --model.lr_init=3e-4 \\\n", + " --model.lr_final=1e-4 \\\n", + " --data.max_token_size=8192 \\\n", + " --model.ctx_len=4096 \\\n", + " --model.bptt_learning_range=2 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-ctx-4k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-8k/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-8k.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh 
\"../model/{FILENAME_PREFIX}-mem-ctx-8k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n", + " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-8k.pth\" \"none\" 1000 4000" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}