diff --git a/notebook/experiment/infctx-math-and-name/config-mem-template.yaml b/notebook/experiment/infctx-math-and-name/config-mem-template.yaml
new file mode 100644
index 00000000..36a88aba
--- /dev/null
+++ b/notebook/experiment/infctx-math-and-name/config-mem-template.yaml
@@ -0,0 +1,197 @@
+# lightning.pytorch==2.0.2
+seed_everything: true
+trainer:
+  # Configure the number of GPUs available on your machine
+  accelerator: gpu
+  devices: auto
+  num_nodes: 1
+
+  # Configure the deepspeed strategy
+  strategy: deepspeed_stage_1
+
+  # Floating point precision for the model; because RWKV is built FOR bf16
+  # you should pretty much never change this setting
+  precision: bf16
+
+  # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section
+  # ---
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      name: 'Memory Instruct (bs=256, train-ctx=512)'
+      project: 'RWKV-X-Experiments'
+      tags: ['RWKV', 'RWKV-X']
+
+  # Checkpoint settings for the training process
+  callbacks:
+    class_path: lightning.pytorch.callbacks.ModelCheckpoint
+    init_args:
+      # Configure this to the path you want to save your checkpoints to
+      # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
+      #
+      # to convert a checkpoint to a model, you can use the
+      # `python3 export_checkpoint.py` script,
+      # which will create a `rwkv_model.pth` in the checkpoint directory.
+      #
+      # Do not use the `zero_to_fp32.py` script as that will have export format issues
+      dirpath: ../checkpoint/V5-Base-1B5-mem-instruct/
+      filename: null
+
+      # Save the top/last K checkpoints
+      save_top_k: 2
+      # Choose by the most recent checkpoints (time based)
+      monitor: 'step'
+      mode: max
+
+      # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
+      # useful to simplify checkpoint resume scripts, at the price of some disk performance
+      save_last: true
+
+      # DO NOT set this to true, as the exported model weights will have format issues
+      # export as a checkpoint, and use the `export_checkpoint.py` script to convert it to a model instead
+      save_weights_only: false
+
+      # How frequently you want to save a checkpoint, in training steps.
+      # A save happens every X data samples, where X = every_n_train_steps * accumulate_grad_batches
+      #
+      # In general you will want to avoid a low number (especially if accumulate_grad_batches <= 100),
+      # as the checkpoint process pauses all GPU training for some time, slowing down the overall run.
+      # However, you also do not want too high a number, or you will lose too much progress if the training crashes.
+      every_n_train_steps: 25
+      every_n_epochs: null
+      save_on_train_epoch_end: true
+      train_time_interval: null
+
+      # Other settings, you can probably leave alone
+      verbose: false
+      auto_insert_metric_name: true
+
+  ########################################
+  ## Training run parameter settings
+  ########################################
+
+  # Generally what you want to configure is the maximum number of epochs
+  # Leave it as -1, and it will keep going forever till interrupted
+  # Or set it as a number, and it will stop after that number of epochs
+  max_epochs: 1
+  min_epochs: null
+
+  # Number of data samples to train on in each step; a data sample is logged
+  # as a "substep" in the wandb logs, and a "step" is tracked as "trainer/global_step"
+  #
+  # This decides the number of data samples to learn from together, before backpropagating
+  # any weight changes at the end of the batch.
+  #
+  # Recommended to be a big enough number (like 128/256) to prevent the training
+  # loss from fluctuating in the process. But not so big a number that the increased
+  # GPU vRAM / offloaded RAM usage will cause the training to crash.
+  #
+  # You are also recommended to configure this to a large enough number to fully utilize
+  # your GPU processing time %, and avoid idle time for the GPU between batches
+  target_batch_size: 256
+
+########################################
+## Training model settings
+########################################
+model:
+  # Model to start the finetune/training process from
+  load_model: ../model/EWR-1B5-enwiki-instruct.pth
+
+  # Context length to use for the training process
+  # the larger the number (and batch size) the larger the vram usage
+  #
+  # Note that if the datasample context length is larger than the ctx_len,
+  # its training process will be split into ctx_len sized chunks.
+  #
+  # This allows the training of extremely large context lengths (eg. 100k),
+  # without eating up too much vram, by keeping the training context length
+  # to a reasonable number suitable for the current GPU setup
+  ctx_len: 512
+
+  # Learning rate of the training process
+  # ---
+  # Initial learning rate of the process
+  lr_init: 8e-4
+  # Final learning rate after the learning rate period
+  # learning rate will stay at the final value from then onwards
+  #
+  # NOTE: lr_final / lr_period does not work with warmup_steps
+  # and will be ignored (or replaced) with the warmup_steps logic instead
+  lr_final: 5e-4
+
+  # Number of epochs to reduce the learning rate from lr_init to lr_final
+  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
+  # 0 means lr_final will apply immediately
+  # -1 means we take the current max_step / max_epoch as the period
+  lr_period: 1
+  # lr_period type if it is set, defaults to epoch
+  lr_period_type: epoch
+
+  # We limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
+  # (by skipping some synchronization code). Additionally, BPTT learning should not be triggered
+  # anyway, as the data sample should be within the ctx size 99% of the time
+  bptt_learning: true
+  bptt_learning_range: 1
+
+data:
+  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
+  #
+  # Use this if you have built your own dataset and saved it with `save_to_disk()`
+  # with source left as null. Otherwise, configure this to a directory in which the
+  # dataset will be built and tokenized by the huggingface dataset process.
+  data_path: ../datapath/picocreator/experiment/rwkv-x-exp/memory/
+
+  # Otherwise, provide the source path, which is used as the huggingface dataset path;
+  # this will be used to populate the dataset_path
+  #
+  # Use either of the following
+  # - hugging face dataset
+  # - Directory path to a directory containing dataset files
+  # - Path to a single dataset file
+  # - hugging face dataset mode (ie: text, csv, etc - use data_dir to configure the path then)
+  # - null
+  #
+  # If source is disabled, all other params, except data_path, are ignored
+  source: json
+  # source: text
+  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
+
+  # Use data_dir, if you are using source=text/json/etc
+  # this should be relative to the trainer script path
+  source_data_dir: ../notebook/experiment/infctx-math-and-name/dataset/
+
+  # After loading the dataset, split out test data used for validation
+  # This process is skipped if the dataset includes a test split
+  # This process is skipped if set to zero
+  test_split: 0.001
+  test_split_shuffle: true
+
+  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
+  # If using a custom tokenizer, provide the tokenizer file path
+  # ---
+  tokenizer: neox
+
+  # Minimum / Maximum token size of the dataset to use
+  # useful for filtering out small noisy data samples from large datasets
+  # (eg. removal of small articles of fewer than 512 tokens from wikipedia)
+  #
+  # This is ignored, if set to -1
+  min_token_size: -1
+  max_token_size: 512
+
+  # Multi-column merging process; the default setting is used to support and merge
+  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
+  #
+  # A minimum of 2 columns, with non-empty data, is required for the merge to occur
+  # If no match is found, this will fall back to the default prompt/completion or text column,
+  # or throw an error if the default fallback is not found
+  # ---
+  multi_column_keys: ['input_prefix', 'input', 'output_prefix', 'output', 'closing']
+  multi_column_prefix: ['', '', '', '', '']
+  multi_column_train_mask: [true, false, true, true, true]
+  multi_column_separator: ''
+
+  # If processing prompt/completion jsonl pairs, the prompt is masked by default
+  # use this flag to disable this default behaviour
+  # ---
+  # disable_prompt_mask: false
diff --git a/notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py b/notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py
new file mode 100644
index 00000000..7c167503
--- /dev/null
+++ b/notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py
@@ -0,0 +1,76 @@
+import json
+import numpy as np
+import argparse
+
+prompt_template = f"""
+You are an AI assistant who will be given some tasks to complete.
+First, you will be given a name to remember. Then, you will have to sum up a series of numbers.
+You will then be asked to answer some questions about the document.
+
+Example 1:
+Name: John
+1
+-2
+3
+-4
+
+### Question:
+What is the total sum?
+
+### Answer:
+-2
+
+### Question:
+What is the name given at the start of the document?
+
+### Answer:
+John
+
+Now you will be tasked to remember the name and sum up the following series of numbers.
+
+"""
+
+task_templates = [
+    "\n### Question:\nWhat is the name given at the start of the document?\n\n### Answer:\n",
+    "\n### Question:\nWhat is the sum of the numbers given?\n\n### Answer:\n"
+]
+
+completion_templates = [
+    "\n{name}\n",
+    "\n{sum_of_numbers}\n",
+]
+
+def load_names(file_path):
+    with open(file_path) as word_file:
+        valid_names = list(word_file.read().split())
+    return valid_names
+
+names = load_names("infctx-math-and-name/names.txt")
+
+def get_random_prompt_completion_pair(max_numbers):
+    document = ""
+    numbers = np.random.randint(-200, 200, size=(max_numbers))
+    total_sum = np.sum(numbers)
+    for number in numbers:
+        document += str(number) + "\n"
+
+    template_index = np.random.randint(0, len(task_templates))
+    task = task_templates[template_index]
+    name = names[np.random.randint(0, len(names))]
+
+    prompt = prompt_template + f"Name: {name}\n" + document + task
+    completion = completion_templates[template_index].format(sum_of_numbers=total_sum, name=name)
+    return {'prompt': prompt, 'completion': completion}
+
+def generate_jsonl(output_file_path, max_numbers, num_samples):
+    with open(output_file_path, 'w') as output_file:
+        for _ in range(num_samples):
+            pair = get_random_prompt_completion_pair(max_numbers)
+            # Write each pair as a proper JSON object (str(pair) would emit a
+            # single-quoted Python repr, which is not valid JSON)
+            output_file.write(json.dumps(pair) + "\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out-file", type=str, default="questions.jsonl")
+    parser.add_argument("--max-numbers", type=int, default=100)
+    parser.add_argument("--num-samples", type=int, default=10)
+    args = parser.parse_args()
+    generate_jsonl(args.out_file, args.max_numbers, args.num_samples)
\ No newline at end of file
diff --git a/notebook/experiment/infctx-math-and-name/run.ipynb b/notebook/experiment/infctx-math-and-name/run.ipynb
index a6100842..a42ede5c 100644
--- a/notebook/experiment/infctx-math-and-name/run.ipynb
+++ b/notebook/experiment/infctx-math-and-name/run.ipynb
@@ -1,5 +1,53 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "## Generating math and name dataset ##\n",
+      "## Done ##\n",
+      "total 15M\n",
+      "drwx------  2 christopherchou u-christopherchou  3 Sep 16 17:12 .\n",
+      "drwx------ 13 christopherchou u-christopherchou 14 Sep 16 17:01 ..\n",
+      "-rw------- 1 christopherchou u-christopherchou 55M Sep 16 17:12 questions_numbers_1024.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%script bash\n",
+    "\n",
+    "########################################\n",
+    "# Generate the required jsonl dataset\n",
+    "########################################\n",
+    "\n",
+    "# Go to config dir\n",
+    "cd \"../\"\n",
+    "\n",
+    "# Reset the dataset dir\n",
+    "mkdir -p ../dataset\n",
+    "rm -rf ../dataset/*.jsonl\n",
+    "\n",
+    "# Generate the various datasets\n",
+    "echo \"## Generating math and name dataset ##\"\n",
+    "\n",
+    "#\n",
+    "# We reduce the training set for lower word count - and shift the focus upwards\n",
+    "#\n",
+    "# do\n",
+    "python3 infctx-math-and-name/generate_math_and_name_dataset.py --out-file ../dataset/questions_numbers_1024.jsonl --max-numbers 1024 --num-samples 10000\n",
+    "# done\n",
+    "\n",
+    "wait\n",
+    "echo \"## Done ##\"\n",
+    "\n",
+    "ls -alh ../dataset/"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -9,8 +57,22 @@
  }
 ],
 "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
  "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" }, "orig_nbformat": 4 }, diff --git a/notebook/experiment/infctx-math-and-name/stage2.ipynb b/notebook/experiment/infctx-math-and-name/stage2.ipynb new file mode 100644 index 00000000..bff1251b --- /dev/null +++ b/notebook/experiment/infctx-math-and-name/stage2.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RWKV v5\n", + "\n", + "Simple memory training for a small model\n", + "\n", + "**Note:** This project assumes you have the rwkv-infctx conda env setup" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Basic Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# First lets setup the various directories, and init the model\n", + "!ls ../../../../../\n", + "!mkdir -p ../../../../../models/\n", + "!mkdir -p ../../../../../datapath/\n", + "!mkdir -p ../../../../../checkpoint/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Additional dependencies for eval stuff\n", + "!pip3 install -q aiocsv aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEEPSPEED_STRAT: deepspeed_stage_1\n", + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "DIR_NAME: infctx-math-and-name\n", + "NOTEBOOK_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer/notebook/experiment/infctx-math-and-name\n", + "INFERENCE_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5\n", + "TRAINER_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /data/chris/rwkv-fork/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "\n", + "# Layer count and embed dim to start with\n", + "LAYER_COUNT=6\n", + "EMBED_DIM=2048\n", + "\n", + "EMBED_SCALE=0.1\n", + "EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(\".\", \"_\")\n", + "\n", + "WANDB_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE}\"\n", + "FILENAME_PREFIX=f\"v5r3-L{LAYER_COUNT}-D{EMBED_DIM}-E{EMBED_SCALE_LABEL}\"\n", + "\n", + "print(\"DEEPSPEED_STRAT:\", DEEPSPEED_STRAT)\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "CONFIG_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"./\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(CONFIG_DIR, \"../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "# Get the notebook dir name\n", + "DIR_NAME=os.path.basename(NOTEBOOK_DIR)\n", + "\n", + "# Log names and dir\n", + "print(\"DIR_NAME:\", DIR_NAME)\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"INFERENCE_DIR:\", INFERENCE_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-09-16 17:59:33-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/v5r3-L6-D2048-E0_1-enwiki-instruct.pth\n", + "Resolving huggingface.co (huggingface.co)... 108.138.246.71, 108.138.246.85, 108.138.246.67, ...\n", + "Connecting to huggingface.co (huggingface.co)|108.138.246.71|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/0a83bdbbf6d686bfa77529fc9bbde3a91fc8d182e1dc33ce8d18f2a0abbe2576?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-instruct.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-instruct.pth%22%3B&Expires=1695171573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NTE3MTU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzBhODNiZGJiZjZkNjg2YmZhNzc1MjlmYzliYmRlM2E5MWZjOGQxODJlMWRjMzNjZThkMThmMmEwYWJiZTI1NzY%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=ydnz2x0eG1WBO%7ExW3sMHufYylEixqjKRuxzCPaRc0AdGMtoIsv1lnJ7DcU0TtY4RQZpUxvmtEdE43zQtOf7Bf80qf8U0mLnGOaEZLxuCrKXodOa8c8N58xr5c0Kl4XofpifWg%7EUeO2xAKAY%7EYgSyzqJDVFEzcifyu69bLA1fgZJwM7V5w4YmkJ2mmLp7wxicVMOh9y8f7evkoG9wNd2NjuTje7VhbptyFYio4KoMLfUwwXO1C5nXTYawFEIXN%7EZNpNgGeDZkPGt0RwdL4OVav8m6if%7E89QbaEnWlPjWulswil%7EkjC5893H9l7FvRJYVQAmuXOeeFcJoG64xDSjluGQ__&Key-Pair-Id=KVTP0A1DKRTAX [following]\n", + "--2023-09-16 17:59:33-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/0a83bdbbf6d686bfa77529fc9bbde3a91fc8d182e1dc33ce8d18f2a0abbe2576?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L6-D2048-E0_1-enwiki-instruct.pth%3B+filename%3D%22v5r3-L6-D2048-E0_1-enwiki-instruct.pth%22%3B&Expires=1695171573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NTE3MTU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkLzBhODNiZGJiZjZkNjg2YmZhNzc1MjlmYzliYmRlM2E5MWZjOGQxODJlMWRjMzNjZThkMThmMmEwYWJiZTI1NzY%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=ydnz2x0eG1WBO%7ExW3sMHufYylEixqjKRuxzCPaRc0AdGMtoIsv1lnJ7DcU0TtY4RQZpUxvmtEdE43zQtOf7Bf80qf8U0mLnGOaEZLxuCrKXodOa8c8N58xr5c0Kl4XofpifWg%7EUeO2xAKAY%7EYgSyzqJDVFEzcifyu69bLA1fgZJwM7V5w4YmkJ2mmLp7wxicVMOh9y8f7evkoG9wNd2NjuTje7VhbptyFYio4KoMLfUwwXO1C5nXTYawFEIXN%7EZNpNgGeDZkPGt0RwdL4OVav8m6if%7E89QbaEnWlPjWulswil%7EkjC5893H9l7FvRJYVQAmuXOeeFcJoG64xDSjluGQ__&Key-Pair-Id=KVTP0A1DKRTAX\n", + "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 18.238.192.105, 18.238.192.34, 18.238.192.50, ...\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.238.192.105|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 1066537777 (1017M) [binary/octet-stream]\n", + "Saving to: ‘v5r3-L6-D2048-E0_1-enwiki-instruct.pth’\n", + "\n", + "v5r3-L6-D2048-E0_1- 100%[===================>] 1017M 112MB/s in 9.1s \n", + "\n", + "2023-09-16 17:59:42 (112 MB/s) - ‘v5r3-L6-D2048-E0_1-enwiki-instruct.pth’ saved [1066537777/1066537777]\n", + "\n", + "total 703M\n", + "drwx------ 2 christopherchou u-christopherchou 3 Sep 16 17:59 .\n", + "drwx------ 17 christopherchou u-christopherchou 21 Sep 16 17:59 ..\n", + "-rw------- 1 christopherchou u-christopherchou 1018M Sep 13 13:28 v5r3-L6-D2048-E0_1-enwiki-instruct.pth\n" + ] + } + ], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../models/\" && \\\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L6-D2048-E1e-1-ctx4k/v5r3-L6-D2048-E0_1-enwiki-instruct.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../models/\" && \\\n", + " ls -alh ." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tune 2 : Context size (1024) " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Generating math and name dataset ##\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "## Done ##\n", + "total 25M\n", + "drwx------ 2 christopherchou u-christopherchou 3 Sep 16 17:59 .\n", + "drwx------ 13 christopherchou u-christopherchou 14 Sep 16 17:59 ..\n", + "-rw------- 1 christopherchou u-christopherchou 55M Sep 16 17:59 questions_numbers.jsonl\n" + ] + } + ], + "source": [ + "%%script bash\n", + "\n", + "########################################\n", + "# Generate the required jsonl dataset\n", + "########################################\n", + "\n", + "# Go to config dir\n", + "cd \"../\"\n", + "\n", + "# Reset the dataset dir\n", + "mkdir -p ../dataset\n", + "rm -rf ../dataset/*.jsonl\n", + "\n", + "# Generate the various datasets\n", + "echo \"## Generating math and name dataset ##\"\n", + "\n", + "#\n", + "# We reduce the training set for lower word count - and shift the focus upwards\n", + "#\n", + "# do\n", + "python3 infctx-math-and-name/generate_math_and_name_dataset.py --out-file ../dataset/questions_numbers.jsonl --max-numbers 1024 --num-samples 10000\n", + "# done\n", + "\n", + "wait\n", + "echo \"## Done ##\"\n", + "\n", + "ls -alh ../dataset/" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu117'\n", + "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m─────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5/\u001b[0m\u001b[1;33mlightning_trainer.py\u001b[0m:\u001b[94m278\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m275 \u001b[0m\u001b[2m│ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m276 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m277 \u001b[0m\u001b[94mif\u001b[0m 
\u001b[91m__name__\u001b[0m == \u001b[33m\"\u001b[0m\u001b[33m__main__\u001b[0m\u001b[33m\"\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m278 \u001b[2m│ \u001b[0mcli_main() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m279 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/rwkv-fork/RWKV-infctx-trainer/RWKV-v5/\u001b[0m\u001b[1;33mlightning_trainer.py\u001b[0m:\u001b[94m253\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92mcli_main\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m250 \u001b[0m\u001b[94mfrom\u001b[0m \u001b[4;96msrc\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrainer\u001b[0m \u001b[94mimport\u001b[0m RWKVLightningTrainer \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m251 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m252 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mcli_main\u001b[0m(): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m253 \u001b[2m│ \u001b[0mLightningCLI( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m254 \u001b[0m\u001b[2m│ │ \u001b[0mRWKV, RWKVDataModule, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m255 \u001b[0m\u001b[2m│ │ \u001b[0msave_config_kwargs={\u001b[33m\"\u001b[0m\u001b[33moverwrite\u001b[0m\u001b[33m\"\u001b[0m: \u001b[94mTrue\u001b[0m}, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m256 \u001b[0m\u001b[2m│ │ \u001b[0mtrainer_class=RWKVLightningTrainer, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m348\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m345 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.subclass_mode_data = (datamodule_class \u001b[95mis\u001b[0m \u001b[94mNone\u001b[0m) \u001b[95mor\u001b[0m subcla \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m346 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m347 \u001b[0m\u001b[2m│ │ \u001b[0mmain_kwargs, subparser_kwargs = \u001b[96mself\u001b[0m._setup_parser_kwargs(\u001b[96mself\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m348 \u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.setup_parser(run, main_kwargs, subparser_kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m349 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.parse_arguments(\u001b[96mself\u001b[0m.parser, args) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m350 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m351 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.subcommand = \u001b[96mself\u001b[0m.config[\u001b[33m\"\u001b[0m\u001b[33msubcommand\u001b[0m\u001b[33m\"\u001b[0m] \u001b[94mif\u001b[0m run \u001b[94melse\u001b[0m \u001b[94mNone\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m380\u001b[0m in \u001b[92msetup_parser\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m377 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m, add_subcommands: \u001b[96mbool\u001b[0m, main_kwargs: Dict[\u001b[96mstr\u001b[0m, Any], subp \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m378 \u001b[0m\u001b[2m│ \u001b[0m) -> \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m379 \u001b[0m\u001b[2;90m│ │ \u001b[0m\u001b[33m\"\"\"Initialize and setup the parser, subcommands, and arguments\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m380 \u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.parser = \u001b[96mself\u001b[0m.init_parser(**main_kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m381 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m add_subcommands: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m382 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._subcommand_method_arguments: Dict[\u001b[96mstr\u001b[0m, List[\u001b[96mstr\u001b[0m]] = \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m383 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._add_subcommands(\u001b[96mself\u001b[0m.parser, **subparser_kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m370\u001b[0m in \u001b[92minit_parser\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m367 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92minit_parser\u001b[0m(\u001b[96mself\u001b[0m, **kwargs: Any) -> LightningArgumentParser: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m368 \u001b[0m\u001b[2;90m│ │ \u001b[0m\u001b[33m\"\"\"Method that instantiates the argument parser.\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m369 \u001b[0m\u001b[2m│ │ \u001b[0mkwargs.setdefault(\u001b[33m\"\u001b[0m\u001b[33mdump_header\u001b[0m\u001b[33m\"\u001b[0m, [\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mlightning.pytorch==\u001b[0m\u001b[33m{\u001b[0mpl.__v \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m370 \u001b[2m│ │ \u001b[0mparser = LightningArgumentParser(**kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m371 \u001b[0m\u001b[2m│ │ \u001b[0mparser.add_argument( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m372 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33m-c\u001b[0m\u001b[33m\"\u001b[0m, \u001b[33m\"\u001b[0m\u001b[33m--config\u001b[0m\u001b[33m\"\u001b[0m, action=ActionConfigFile, help=\u001b[33m\"\u001b[0m\u001b[33mPath to a\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m373 \u001b[0m\u001b[2m│ │ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/chris/anaconda3/envs/fastchat-env/lib/python3.8/site-packages/lightnin\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33mg/pytorch/\u001b[0m\u001b[1;33mcli.py\u001b[0m:\u001b[94m94\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 91 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 92 \u001b[0m\u001b[2;33m│ │ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 93 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m _JSONARGPARSE_SIGNATURES_AVAILABLE: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 94 \u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mModuleNotFoundError\u001b[0m( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 95 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m{\u001b[0m_JSONARGPARSE_SIGNATURES_AVAILABLE\u001b[33m}\u001b[0m\u001b[33m. Try `pip insta\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 96 \u001b[0m\u001b[2m│ │ │ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 97 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96msuper\u001b[0m().\u001b[92m__init__\u001b[0m(*args, description=description, env_prefix=en \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mModuleNotFoundError: \u001b[0mDistributionNotFound: The \n", + "\u001b[32m'jsonargparse\u001b[0m\u001b[32m[\u001b[0m\u001b[32msignatures\u001b[0m\u001b[32m]\u001b[0m\u001b[32m>=4.17.0'\u001b[0m distribution was not found and is required by\n", + "the application. HINT: Try running `pip install -U \n", + "\u001b[32m'jsonargparse\u001b[0m\u001b[32m[\u001b[0m\u001b[32msignatures\u001b[0m\u001b[32m]\u001b[0m\u001b[32m>=4.17.0'\u001b[0m`. Try `pip install -U \n", + "\u001b[32m'jsonargparse\u001b[0m\u001b[32m[\u001b[0m\u001b[32msignatures\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m`.\n" + ] + } + ], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{CONFIG_DIR}/config-mem-template.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Tune ctx-1024 (train-ctx=1024, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1024/\" \\\n", + " --model.lr_init=5e-4 \\\n", + " --model.lr_final=4e-4 \\\n", + " --data.max_token_size=1024 \\\n", + " --model.ctx_len=1024 \\\n", + " --model.bptt_learning_range=1 \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-ctx-1024/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-ctx-1024.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../models/{FILENAME_PREFIX}-mem-ctx-1024.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets do a quick memory test\n", + "!python3 ../../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-1024.pth\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": 
{ + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
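Note on the generated data (not part of the diff above): generate_math_and_name_dataset.py writes one {"prompt": ..., "completion": ...} JSON object per line, and config-mem-template.yaml consumes that file via `source: json`. Below is a minimal sanity-check sketch for such a file, assuming the HuggingFace `datasets` package is installed; the path is illustrative and should point at whatever `--out-file` you actually generated.

import json

from datasets import load_dataset

# Illustrative path; matches the --out-file used in the notebook cells above
JSONL_PATH = "../dataset/questions_numbers_1024.jsonl"

# Each line should parse as standalone JSON with exactly the two fields
# emitted by the generator script.
with open(JSONL_PATH) as f:
    for line in f:
        sample = json.loads(line)
        assert set(sample) == {"prompt", "completion"}

# The trainer's `source: json` path relies on the HuggingFace json loader,
# so the same file should also load cleanly here.
ds = load_dataset("json", data_files=JSONL_PATH, split="train")
print(len(ds), "samples; first completion:", ds[0]["completion"].strip())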