From 7f7f61fa0a5e27e25aeb20f07357452cfc024f79 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Fri, 2 Feb 2024 22:35:28 +0000 Subject: [PATCH 1/2] typo in checkpoint size --- .../finetune-example/Eagle-x-zMultipack.ipynb | 157 +++++++----------- .../finetune-example/Eagle-x-zMultipack.yaml | 4 +- 2 files changed, 65 insertions(+), 96 deletions(-) diff --git a/notebook/finetune-example/Eagle-x-zMultipack.ipynb b/notebook/finetune-example/Eagle-x-zMultipack.ipynb index 011e9e23..a3f577c6 100644 --- a/notebook/finetune-example/Eagle-x-zMultipack.ipynb +++ b/notebook/finetune-example/Eagle-x-zMultipack.ipynb @@ -280,19 +280,18 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2024-02-02 18:49:43,863] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", + "[2024-02-02 20:06:50,112] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/finetune-example/./Eagle-x-zMultipack.yaml', '--model.load_model=../model/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=../checkpoint/Eagle-x-zMultipack/', '--trainer.logger.init_args.name=RWKV-v5-Finetune - Eagle-x-zMultipack (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-v5-Finetune', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=512', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'], args=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/finetune-example/./Eagle-x-zMultipack.yaml', '--model.load_model=../model/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=../checkpoint/Eagle-x-zMultipack/', '--trainer.logger.init_args.name=RWKV-v5-Finetune - Eagle-x-zMultipack (tctxlen=4096, deepspeed_stage_2)', '--trainer.logger.init_args.project=RWKV-v5-Finetune', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=512', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'].\n", - "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 190722569\n", - "Seed set to 190722569\n", + "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 2361451725\n", + "Seed set to 2361451725\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. 
It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", "GPU available: True (cuda), used: True\n", @@ -309,85 +308,80 @@ " - accumulate_grad_batches: 8\n", " - effective_batch_size: 512\n", "\n", - "[rank: 0] Seed set to 190722569\n", + "[rank: 0] Seed set to 2361451725\n", "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/8\n", - "[2024-02-02 18:50:23,466] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-02-02 18:50:23,504] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-02-02 18:50:23,557] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-02-02 18:50:23,558] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-02-02 18:50:23,686] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-02-02 18:50:23,690] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-02-02 18:50:23,691] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. 
Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", - "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.2'\n", - "[rank: 3] Seed set to 190722569\n", + "[2024-02-02 20:07:30,976] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-02-02 20:07:31,016] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-02-02 20:07:31,033] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-02-02 20:07:31,053] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-02-02 20:07:31,094] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-02-02 20:07:31,099] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-02-02 20:07:31,111] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[RWKV.model] Running RWKV infctx using 'torch-jit' with torch '2.1.2'\n", + "[rank: 1] Seed set to 2361451725\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 1] Seed set to 190722569\n", + "[rank: 6] Seed set to 2361451725\n", + "[rank: 5] Seed set to 2361451725\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 7] Seed set to 190722569\n", - "[rank: 5] Seed set to 190722569\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", + "[rank: 3] Seed set to 2361451725\n", + "[rank: 2] Seed set to 2361451725\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 2] Seed set to 190722569\n", - "[rank: 6] Seed set to 190722569\n", - "[rank: 4] Seed set to 190722569\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", + "[rank: 7] Seed set to 2361451725\n", + "[rank: 4] Seed set to 2361451725\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 2] Seed set to 190722569\n", - "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", - "[rank: 3] Seed set to 190722569\n", + "[rank: 3] Seed set to 2361451725\n", "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", - "[rank: 6] Seed set to 190722569\n", - "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", - "[rank: 4] Seed set to 190722569\n", + "[rank: 4] Seed set to 2361451725\n", "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", - "[rank: 5] Seed set to 190722569\n", - "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", - "[rank: 7] Seed set to 190722569\n", + "[rank: 7] Seed set to 2361451725\n", "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", - "[rank: 1] Seed set to 190722569\n", + "[rank: 5] Seed set to 2361451725\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + "[rank: 2] Seed set to 2361451725\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + "[rank: 1] Seed set to 2361451725\n", "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + "[rank: 6] Seed set to 2361451725\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). 
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.2\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240202_185125-po507nn3\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240202_200833-9ti9brqw\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mRWKV-v5-Finetune - Eagle-x-zMultipack (tctxlen=4096, deepspeed_stage_2)\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-v5-Finetune\u001b[0m\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-v5-Finetune/runs/po507nn3\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-v5-Finetune/runs/9ti9brqw\u001b[0m\n", "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "\n", + "\n", "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", - "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to 
keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", "\n", - "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", - "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", "#\n", "# RWKV lighting_trainer.py important notes \n", "# https://github.com/RWKV/RWKV-infctx-trainer \n", @@ -402,12 +396,10 @@ " - lr_final: 4.000e-05 (4e-05)\n", "\n", "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", - "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", - "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", - "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", - "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "\n", "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", "Using /root/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", @@ -418,36 +410,36 @@ "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", "ninja: no work to do.\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.049744367599487305 seconds\n", + "Time to load fused_adam op: 0.049720048904418945 seconds\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...Loading extension module fused_adam...\n", "Loading extension module fused_adam...\n", "Loading extension module fused_adam...\n", + "\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10116791725158691 seconds\n", - "Time to load fused_adam op: 0.10123014450073242 seconds\n", "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10112690925598145 secondsTime to load fused_adam op: 0.10111641883850098 seconds\n", + "Time to load fused_adam op: 0.10111737251281738 seconds\n", + "\n", + "Time to load fused_adam op: 0.1011362075805664 seconds\n", + "Time to load fused_adam op: 0.10121989250183105 seconds\n", + "Time to load fused_adam op: 0.10100865364074707 seconds\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "Time to load fused_adam op: 0.10133147239685059 secondsLoading extension module fused_adam...\n", - "\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "Loading `train_dataloader` to estimate number of stepping batches.\n", - "Time to load fused_adam op: 0.10153722763061523 seconds\n", - "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10162758827209473 seconds\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
    "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
    "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
    "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
    "Loading extension module fused_adam...\n",
    "Time to load fused_adam op: 0.10171246528625488 seconds\n",
    "/root/miniconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1702400430266/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
    "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
    "\n",
@@ -462,30 +454,7 @@
    "0  Non-trainable params\n",
    "7.5 B     Total params\n",
    "30,072.177Total estimated model params size (MB)\n",
    "Epoch 0:   0%|          | 0/1611 [00:00
Date: Fri, 2 Feb 2024 23:54:10 +0000
Subject: [PATCH 2/2] updated examples

---
 notebook/finetune-example/Eagle-x-ALMA-prompt-completion.yaml | 4 ++--
 notebook/finetune-example/Eagle-x-capybara-chat.yaml          | 4 ++--
 notebook/finetune-example/Eagle-x-openhermes1-instruct.yaml   | 4 ++--
 notebook/finetune-example/Eagle-x-textbooks.yaml              | 4 ++--
 notebook/finetune-example/Eagle-x-zMultipack.yaml             | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/notebook/finetune-example/Eagle-x-ALMA-prompt-completion.yaml b/notebook/finetune-example/Eagle-x-ALMA-prompt-completion.yaml
index 9776a73c..1639e81c 100644
--- a/notebook/finetune-example/Eagle-x-ALMA-prompt-completion.yaml
+++ b/notebook/finetune-example/Eagle-x-ALMA-prompt-completion.yaml
@@ -74,8 +74,8 @@ model:
   load_model: ../model/L6-D512-neox-init.pth
 
   # Starting and ending learning rate
-  lr_init: 5e-5
-  lr_final: 5e-5
+  lr_init: 1e-5
+  lr_final: 1e-5
 
   # Training context length, note that the dataset can be
   # larger than the context size, in which the trainer
diff --git a/notebook/finetune-example/Eagle-x-capybara-chat.yaml b/notebook/finetune-example/Eagle-x-capybara-chat.yaml
index 1b944640..56e3e971 100644
--- a/notebook/finetune-example/Eagle-x-capybara-chat.yaml
+++ b/notebook/finetune-example/Eagle-x-capybara-chat.yaml
@@ -79,8 +79,8 @@ model:
   load_model: ../model/L6-D512-neox-init.pth
 
   # Starting and ending learning rate
-  lr_init: 5e-5
-  lr_final: 5e-5
+  lr_init: 1e-5
+  lr_final: 1e-5
 
   # Training context length, note that the dataset can be
   # larger than the context size, in which the trainer
diff --git a/notebook/finetune-example/Eagle-x-openhermes1-instruct.yaml b/notebook/finetune-example/Eagle-x-openhermes1-instruct.yaml
index e5910fd6..b038c324 100644
--- a/notebook/finetune-example/Eagle-x-openhermes1-instruct.yaml
+++ b/notebook/finetune-example/Eagle-x-openhermes1-instruct.yaml
@@ -74,8 +74,8 @@ model:
   load_model: ../model/L6-D512-neox-init.pth
 
   # Starting and ending learning rate
-  lr_init: 5e-5
-  lr_final: 5e-5
+  lr_init: 1e-5
+  lr_final: 1e-5
 
   # Training context length, note that the dataset can be
   # larger than the context size, in which the trainer
diff --git a/notebook/finetune-example/Eagle-x-textbooks.yaml b/notebook/finetune-example/Eagle-x-textbooks.yaml
index 2a361db3..0e49d34c 100644
--- a/notebook/finetune-example/Eagle-x-textbooks.yaml
+++ b/notebook/finetune-example/Eagle-x-textbooks.yaml
@@ -79,8 +79,8 @@ model:
   load_model: ../model/L6-D512-neox-init.pth
 
   # Starting and ending learning rate
-  lr_init: 5e-5
-  lr_final: 5e-5
+  lr_init: 1e-5
+  lr_final: 1e-5
 
   # Training context length, note that the dataset can be
   # larger than the context size, in which the trainer
diff --git a/notebook/finetune-example/Eagle-x-zMultipack.yaml b/notebook/finetune-example/Eagle-x-zMultipack.yaml
index 8be7b2a4..beac4131 100644
--- a/notebook/finetune-example/Eagle-x-zMultipack.yaml
+++ b/notebook/finetune-example/Eagle-x-zMultipack.yaml
@@ -11,7 +11,7 @@ trainer:
 
   # Reasonable batch size, for a more realistic it/s rate
   # this is currently overwritten in the notebook
-  target_batch_size: 512
+  target_batch_size: 256
 
   # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section
   # ---
@@ -72,7 +72,7 @@ model:
   load_model: ../model/L6-D512-neox-init.pth
 
   # Starting and ending learning rate
-  lr_init: 5e-5
+  lr_init: 1e-5
   lr_final: 4e-5
   lr_period: 2
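
Note on the captured run in patch 1: the notebook drives the trainer through LightningCLI flag overrides rather than by editing the YAML directly. Consolidated as YAML — a sketch reconstructed from the command line captured in the log above, not an actual file from the repo — the effective settings for that run were roughly:

# Sketch only: key names mirror the CLI flags in the captured log
# (--trainer.strategy, --trainer.target_batch_size, --trainer.microbatch_size,
#  --model.ctx_len, --model.load_model); values are taken from that log.
trainer:
  strategy: deepspeed_stage_2
  devices: auto            # resolved to 8 GPUs in this run
  target_batch_size: 512   # overrides the value in Eagle-x-zMultipack.yaml
  microbatch_size: 8
model:
  load_model: ../model/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth
  ctx_len: 4096

With 8 devices, accumulate_grad_batches = target_batch_size / (devices x microbatch_size) = 512 / (8 x 8) = 8, which matches the "accumulate_grad_batches: 8" and "effective_batch_size: 512" lines printed by the trainer banner above.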
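
On patch 2: the five example configs drop the flat learning rate from 5e-5 to 1e-5. A minimal sketch of the resulting model section, with the surrounding keys unchanged from the diffs above:

model:
  load_model: ../model/L6-D512-neox-init.pth
  # Starting and ending learning rate (lowered from 5e-5 by this patch)
  lr_init: 1e-5
  lr_final: 1e-5

Eagle-x-zMultipack.yaml is the exception: only lr_init drops to 1e-5 while lr_final stays at 4e-5 with lr_period: 2, so — assuming the trainer interpolates from lr_init to lr_final over lr_period — its schedule now anneals upward from 1e-5 toward 4e-5 rather than staying flat.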