reset validation file
PicoCreator committed Sep 14, 2023
1 parent dea5cc8 commit b93866f
Showing 1 changed file with 8 additions and 225 deletions.
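
The diff below nulls every code cell's "execution_count" and empties its "outputs" list, which is the usual way to strip run results from a notebook before committing it. The commit does not record what tool performed the reset; as a hypothetical sketch, the same effect can be reproduced with the nbformat library (the path below is this repository's notebook):

    import nbformat

    # Hypothetical reset script -- the commit does not say how the outputs
    # were actually cleared; this simply reproduces the diff's effect.
    NOTEBOOK = "notebook/trainer-v5-validation/test-sort-offset-length.ipynb"

    nb = nbformat.read(NOTEBOOK, as_version=4)
    for cell in nb.cells:
        if cell.cell_type == "code":
            cell.outputs = []            # serializes as "outputs": []
            cell.execution_count = None  # serializes as "execution_count": null
    nbformat.write(nb, NOTEBOOK)

A command-line equivalent is "jupyter nbconvert --clear-output --inplace <notebook>".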
233 changes: 8 additions & 225 deletions notebook/trainer-v5-validation/test-sort-offset-length.ipynb
@@ -11,21 +11,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ENABLE_WANDB: True\n",
-      "GPU_DEVICES: auto\n",
-      "NOTEBOOK_DIR: /home/ubuntu/picocreator-memory-experiment/notebook/trainer-v5-validation\n",
-      "TRAINER_DIR: /home/ubuntu/picocreator-memory-experiment/RWKV-v5\n",
-      "PROJECT_DIR: /home/ubuntu/picocreator-memory-experiment\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "GPU_DEVICES=\"auto\"\n",
     "ENABLE_WANDB=True\n",
@@ -52,27 +40,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2023-09-14 07:12:02,378] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
-      "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1'\n",
-      "---- Initializing model ----\n",
-      "No of layers: 6\n",
-      "Embedding size: 2048\n",
-      "Output model path: ../model/L6-D2048-neox-v5base-init.pth\n",
-      "Vocab size: 50277\n",
-      "Emb scale: 0.0001\n",
-      "Note: this process takes a significant time (and ram) for large models\n",
-      "---- ----- ----\n",
-      "Model exists, skipping init_model\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Init the model\n",
     "!cd \"{TRAINER_DIR}\" && \\\n",
@@ -84,18 +54,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Saving the dataset (1/1 shards): 100%|█| 989/989 [00:00<00:00, 56411.55 examples\n",
-      "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 10393.78 examples\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Lets preload the requried dataset \n",
     "!cd \"{TRAINER_DIR}\" && \\\n",
@@ -104,187 +65,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2023-09-14 07:12:15,982] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
-      "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1'\n",
-      "/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/ubuntu/picocreator-memory-experiment/notebook/trainer-v5-validation/config/test-sort-offset-length.yaml', '--trainer.logger.init_args.name=infctx-v5-sort-offset-test (deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto'], args=['fit', '-c', '/home/ubuntu/picocreator-memory-experiment/notebook/trainer-v5-validation/config/test-sort-offset-length.yaml', '--trainer.logger.init_args.name=infctx-v5-sort-offset-test (deepspeed_stage_2_offload)', '--trainer.strategy=deepspeed_stage_2_offload', '--trainer.devices=auto'].\n",
-      "  rank_zero_warn(\n",
-      "Global seed set to 3941088705\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
-      "cat: /sys/module/amdgpu/initstate: No such file or directory\n",
-      "ERROR:root:Driver not initialized (amdgpu not found in modules)\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.15.10 is available!  To upgrade, please run:\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m:  $ pip install wandb --upgrade\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.8\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20230914_071218-bvjeosux\u001b[0m\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33minfctx-v5-sort-offset-test (deepspeed_stage_2_offload)\u001b[0m\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-validation\u001b[0m\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-validation/runs/bvjeosux\u001b[0m\n",
-      "GPU available: True (cuda), used: True\n",
-      "TPU available: False, using: 0 TPU cores\n",
-      "IPU available: False, using: 0 IPUs\n",
-      "HPU available: False, using: 0 HPUs\n",
-      "\n",
-      "\n",
-      "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n",
-      "   - target_batch_size:       16\n",
-      "   - num_nodes:               1\n",
-      "   - num_devices:             1\n",
-      "   - accumulate_grad_batches: 16\n",
-      "   - effective_batch_size:    16\n",
-      "\n",
-      "Saving the dataset (1/1 shards): 100%|█| 989/989 [00:00<00:00, 52275.51 examples\n",
-      "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 9794.51 examples/\n",
-      "[rank: 0] Global seed set to 3941088705\n",
-      "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n",
-      "[2023-09-14 07:12:27,935] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented\n",
-      "Enabling DeepSpeed BF16.\n",
-      "/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:615: UserWarning: Checkpoint directory /home/ubuntu/picocreator-memory-experiment/checkpoint/trainer-validaiton/infctx-v5-sort-offset exists and is not empty.\n",
-      "  rank_zero_warn(f\"Checkpoint directory {dirpath} exists and is not empty.\")\n",
-      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
-      "#\n",
-      "# RWKV lighting_trainer.py important notes \n",
-      "# https://github.com/RWKV/RWKV-infctx-trainer \n",
-      "#\n",
-      "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n",
-      "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n",
-      "# - When resuming from checkpoint, the estimated time is inaccurate\n",
-      "#\n",
-      "\n",
-      "[RWKV.model] Configuring optimizer with\n",
-      "    - lr_init:  6.000e-04 (0.0006)\n",
-      "    - lr_final: 4.000e-04 (0.0004)\n",
-      "\n",
-      "Using /home/ubuntu/.cache/torch_extensions/py311_cu118 as PyTorch extensions root...\n",
-      "Detected CUDA files, patching ldflags\n",
-      "Emitting ninja build file /home/ubuntu/.cache/torch_extensions/py311_cu118/cpu_adam/build.ninja...\n",
-      "Building extension module cpu_adam...\n",
-      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
-      "ninja: no work to do.\n",
-      "Loading extension module cpu_adam...\n",
-      "Time to load cpu_adam op: 2.4343717098236084 seconds\n",
-      "Loading `train_dataloader` to estimate number of stepping batches.\n",
-      "Rank: 0 partition count [1, 1] and sizes[(533245952, False), (384, False)] \n",
-      "\n",
-      "  | Name   | Type       | Params\n",
-      "--------------------------------------\n",
-      "0 | emb    | Embedding  | 102 M \n",
-      "1 | blocks | ModuleList | 327 M \n",
-      "2 | ln_out | LayerNorm  | 4.1 K \n",
-      "3 | head   | Linear     | 102 M \n",
-      "--------------------------------------\n",
-      "533 M     Trainable params\n",
-      "0         Non-trainable params\n",
-      "533 M     Total params\n",
-      "2,132.985 Total estimated model params size (MB)\n",
-      "Epoch 0:   0%|          | 0/989 [00:00<?, ?it/s]Traceback (most recent call last):\n",
-      "  File \"/home/ubuntu/picocreator-memory-experiment/RWKV-v5/lightning_trainer.py\", line 258, in <module>\n",
-      "    cli_main()\n",
-      "  File \"/home/ubuntu/picocreator-memory-experiment/RWKV-v5/lightning_trainer.py\", line 233, in cli_main\n",
-      "    LightningCLI(\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 353, in __init__\n",
-      "    self._run_subcommand(self.subcommand)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\n",
-      "    fn(**fn_kwargs)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\n",
-      "    call._call_and_handle_interrupt(\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\n",
-      "    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\n",
-      "    return function(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\n",
-      "    self._run(model, ckpt_path=ckpt_path)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 973, in _run\n",
-      "    results = self._run_stage()\n",
-      "              ^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 1016, in _run_stage\n",
-      "    self.fit_loop.run()\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 201, in run\n",
-      "    self.advance()\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 354, in advance\n",
-      "    self.epoch_loop.run(self._data_fetcher)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 133, in run\n",
-      "    self.advance(data_fetcher)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 218, in advance\n",
-      "    batch_output = self.automatic_optimization.run(trainer.optimizers[0], kwargs)\n",
-      "                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 185, in run\n",
-      "    self._optimizer_step(kwargs.get(\"batch_idx\", 0), closure)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 260, in _optimizer_step\n",
-      "    call._call_lightning_module_hook(\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 144, in _call_lightning_module_hook\n",
-      "    output = fn(*args, **kwargs)\n",
-      "             ^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/module.py\", line 1256, in optimizer_step\n",
-      "    optimizer.step(closure=optimizer_closure)\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py\", line 155, in step\n",
-      "    step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\n",
-      "                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py\", line 256, in optimizer_step\n",
-      "    optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\n",
-      "                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 225, in optimizer_step\n",
-      "    return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 92, in optimizer_step\n",
-      "    closure_result = closure()\n",
-      "                     ^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 140, in __call__\n",
-      "    self._result = self.closure(*args, **kwargs)\n",
-      "                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 126, in closure\n",
-      "    step_output = self._step_fn()\n",
-      "                  ^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 307, in _training_step\n",
-      "    training_step_output = call._call_strategy_hook(trainer, \"training_step\", *kwargs.values())\n",
-      "                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 291, in _call_strategy_hook\n",
-      "    output = fn(*args, **kwargs)\n",
-      "             ^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py\", line 328, in training_step\n",
-      "    return self.model(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1501, in _call_impl\n",
-      "    return forward_call(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n",
-      "    ret_val = func(*args, **kwargs)\n",
-      "              ^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1769, in forward\n",
-      "    loss = self.module(*inputs, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1501, in _call_impl\n",
-      "    return forward_call(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/overrides/base.py\", line 90, in forward\n",
-      "    output = self._forward_module.training_step(*inputs, **kwargs)\n",
-      "             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/picocreator-memory-experiment/RWKV-v5/src/model.py\", line 1354, in training_step\n",
-      "    total_loss = self.compute_loss(batch, batch_idx, True)\n",
-      "                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/picocreator-memory-experiment/RWKV-v5/src/model.py\", line 996, in compute_loss\n",
-      "    if self._counting_tokens is None or batch_idx == 0:\n",
-      "       ^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/home/ubuntu/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1614, in __getattr__\n",
-      "    raise AttributeError(\"'{}' object has no attribute '{}'\".format(\n",
-      "AttributeError: 'RWKV' object has no attribute '_counting_tokens'\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33minfctx-v5-sort-offset-test (deepspeed_stage_2_offload)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-validation/runs/bvjeosux\u001b[0m\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-validation/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk3MDM0MTA4/version_details/v6\u001b[0m\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\n",
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230914_071218-bvjeosux/logs\u001b[0m\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "!cd \"{TRAINER_DIR}\" && \\\n",
     " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
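The deleted trainer log also records how accumulate_grad_batches is derived: with a target_batch_size of 16 spread over 1 node and 1 device, each device must accumulate 16 microbatches before the optimizer steps. A worked sketch of that arithmetic (variable names are illustrative, not taken from the trainer source):

    # Illustrative reconstruction of the batch-size arithmetic in the log above.
    target_batch_size = 16
    num_nodes = 1
    num_devices = 1
    accumulate_grad_batches = target_batch_size // (num_nodes * num_devices)
    effective_batch_size = accumulate_grad_batches * num_nodes * num_devices
    assert accumulate_grad_batches == 16 and effective_batch_size == 16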

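The traceback preserved in the deleted output explains why the run failed: torch.nn.Module.__getattr__ only resolves registered parameters, buffers, and submodules, so reading a plain attribute that was never assigned raises AttributeError instead of returning None. The check "if self._counting_tokens is None" at src/model.py line 996 therefore crashes on the first training step. A minimal sketch of the failure and the usual guard (the class and attribute names mirror the traceback; the actual fix applied in the repository is not part of this commit):

    import torch.nn as nn

    class RWKV(nn.Module):
        # _counting_tokens is deliberately never set here, mirroring the
        # state the traceback reports.

        def compute_loss_buggy(self, batch_idx):
            # Raises AttributeError: nn.Module does not fall back to None
            # for ordinary attributes that were never assigned.
            if self._counting_tokens is None or batch_idx == 0:
                self._counting_tokens = 0

        def compute_loss_fixed(self, batch_idx):
            # Guarded lookup treats "never assigned" the same as None.
            if getattr(self, "_counting_tokens", None) is None or batch_idx == 0:
                self._counting_tokens = 0

    RWKV().compute_loss_fixed(batch_idx=5)   # ok: attribute initialised to 0
    RWKV().compute_loss_buggy(batch_idx=5)   # AttributeError, as in the log above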