Commit
Merge branch 'rwkv-x-playground' of https://github.com/RWKV/RWKV-infctx-trainer into rwkv-x-playground
Showing 4 changed files with 694 additions and 1 deletion.
197 changes: 197 additions & 0 deletions
notebook/experiment/infctx-math-and-name/config-mem-template.yaml

@@ -0,0 +1,197 @@
# lightning.pytorch==2.0.2
seed_everything: true
trainer:
  # Configure the number of GPUs available on your machine
  accelerator: gpu
  devices: auto
  num_nodes: 1

  # Configure the deepspeed strategy
  strategy: deepspeed_stage_1

  # Floating point precision for the model; because RWKV is built FOR bf16,
  # you should pretty much never change this setting
  precision: bf16

  # Logger setting for wandb; if you want to enable wandb, uncomment the whole logger section
  # ---
  logger:
    class_path: lightning.pytorch.loggers.WandbLogger
    init_args:
      name: 'Memory Instruct (bs=256, train-ctx=512)'
      project: 'RWKV-X-Experiments'
      tags: ['RWKV', 'RWKV-X']

  # Checkpoint settings for the training process
  callbacks:
    class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      # Configure this to the path you want to save your checkpoints to;
      # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
      #
      # To convert a checkpoint to a model, you can use the
      # `python3 export_checkpoint.py <checkpoint path>` script,
      # which will create a `rwkv_model.pth` in the checkpoint directory.
      #
      # Do not use the `zero_to_fp32.py` script, as that will have export format issues
      dirpath: ../checkpoint/V5-Base-1B5-mem-instruct/
      filename: null

      # Save the top/last K checkpoints
      save_top_k: 2
      # Choose by the most recent checkpoints (time based)
      monitor: 'step'
      mode: max

      # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt';
      # useful to simplify checkpoint resume scripts, at the price of disk performance
      save_last: true

      # DO NOT set this to true, as the exported model weights will have format issues;
      # export as a checkpoint, and use the `export_checkpoint.py` script to convert it to a model instead
      save_weights_only: false

      # How frequently you want to save a checkpoint, in steps.
      # A save will happen every X data samples, where X = every_n_train_steps * accumulate_grad_batches
      #
      # In general you will want to avoid too low a number (especially if accumulate_grad_batches <= 100),
      # as the checkpointing process will pause all GPU training for some time, slowing down the overall run.
      # However, you also do not want to configure too high a number, where you will lose too much progress if the training crashes.
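      #
      # As a rough worked example (my arithmetic, reading target_batch_size below as the
      # effective number of data samples per step): every_n_train_steps: 25 with 256
      # samples per step saves a checkpoint roughly every 25 * 256 = 6400 data samples.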
      every_n_train_steps: 25
      every_n_epochs: null
      save_on_train_epoch_end: true
      train_time_interval: null

      # Other settings, you can probably leave alone
      verbose: false
      auto_insert_metric_name: true

  ########################################
  ## Training run parameter settings
  ########################################

  # Generally what you want to configure is the maximum number of epochs.
  # Leave it as -1, and it will keep going forever till interrupted,
  # or set it to a number, and it will stop after that number of epochs.
  max_epochs: 1
  min_epochs: null

  # Number of data samples to train for each step; a data sample is considered
  # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step".
  #
  # This decides the number of data samples to learn from together, before
  # backpropagating any weight changes at the end of the batch.
  #
  # Recommended to be a big enough number (like 128/256) to prevent the training
  # loss from fluctuating in the process, but not so big that the increased
  # GPU vRAM / offloaded RAM usage will cause the training to crash.
  #
  # You are also recommended to configure this to a large enough number to fully
  # utilize your GPU processing time, and avoid idle time for the GPU between batches.
  target_batch_size: 256

########################################
## Training model settings
########################################
model:
  # Model to start the finetune/training process from
  load_model: ../model/EWR-1B5-enwiki-instruct.pth

  # Context length to use for the training process;
  # the larger the number (and batch size), the larger the vram usage.
  #
  # Note that if the data sample's context length is larger than the ctx_len,
  # its training process will be split into ctx_len sized chunks.
  #
  # This allows the training of extremely large context lengths (eg. 100k),
  # without eating up too much vram, by keeping the training context length
  # to a reasonable number suitable to the current GPU setup.
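  #
  # For instance (my arithmetic, not part of the original config): a single
  # 100k-token data sample trained with ctx_len: 512 would be processed as
  # roughly 100000 / 512 ≈ 196 sequential chunks.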
  ctx_len: 512

  # Learning rate of the training process
  # ---
  # Initial learning rate of the process
  lr_init: 8e-4
  # Final learning rate after the learning rate period;
  # the learning rate will stay at the final value from then onwards.
  #
  # NOTE: lr_final / lr_period does not work with warmup_steps,
  # and will be ignored (or replaced) by the warmup_steps logic instead.
  lr_final: 5e-4

  # Number of epochs to reduce the learning rate from lr_init to lr_final.
  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards),
  # 0 means lr_final will apply immediately,
  # -1 means we take the current max_step / max_epoch as the period.
  lr_period: 1
  # lr_period type, if set; defaults to epoch
  lr_period_type: epoch

  # We limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
  # (by skipping some synchronization code). Additionally, bptt learning should not be
  # triggered anyway, as the data sample should be within ctx size 99% of the time.
  bptt_learning: true
  bptt_learning_range: 1

data:
  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`.
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`,
  # with source left as null. Otherwise, configure this to a directory in which the
  # dataset will be built and tokenized by the huggingface dataset process.
  data_path: ../datapath/picocreator/experiment/rwkv-x-exp/memory/

  # Otherwise provide the source path, which is used as the huggingface dataset path;
  # this will be used to populate the dataset_path.
  #
  # Use either of the following:
  # - hugging face dataset
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - hugging face dataset mode (ie: text,csv,etc - use data_dir to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are ignored.
  source: json
  # source: text
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt

  # Use data_dir if you are using source=text/json/etc;
  # this should be relative to the trainer script path.
  source_data_dir: ../notebook/experiment/infctx-math-and-name/dataset/

  # After loading the dataset, split out test data used for validation.
  # This process is skipped if the dataset includes a test split.
  # This process is skipped if set to zero.
  test_split: 0.001
  test_split_shuffle: true

  # Tokenizer to use: use either the inbuilt 'neox' or 'world' tokenizer.
  # If using a custom tokenizer, provide the tokenizer file path.
  # ---
  tokenizer: neox

  # Minimum / Maximum token size of the dataset to use,
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 512 tokens from wikipedia).
  #
  # This is ignored if set to -1.
  min_token_size: -1
  max_token_size: 512

  # Multi Column merging process; the default setting is used to support and merge
  # "instruction", "input", "output" datasets. To disable, set multi_column_keys to [].
  #
  # A minimum of 2 columns, with non-empty data, is required for the merge to occur.
  # If no match is found, this will fall back to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found.
  # ---
  multi_column_keys: ['input_prefix', 'input', 'output_prefix', 'output', 'closing']
  multi_column_prefix: ['', '', '', '', '']
  multi_column_train_mask: [true, false, true, true, true]
  multi_column_separator: ''
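
  # A hypothetical illustration (my reading of the options above, not authoritative):
  # with these settings, a record such as
  #   {"input_prefix": "### Input:\n", "input": "1 2 3", "output_prefix": "### Answer:\n", "output": "6", "closing": "\n"}
  # would be merged, column by column, into a single training text, with
  # multi_column_train_mask deciding which merged segments contribute to the
  # training loss (here every segment except `input`).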

  # If processing prompt/completion jsonl pairs, the prompt is masked by default;
  # use this flag to disable that default behaviour.
  # ---
  # disable_prompt_mask: false
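
To actually launch a run with this config, the trainer entry point is not shown in this diff; as a rough sketch only, assuming a LightningCLI-style `lightning_trainer.py` script (an assumption on my part) and the paths used above:

# Assumed entry point and working directory; adjust to your checkout
python3 lightning_trainer.py fit -c notebook/experiment/infctx-math-and-name/config-mem-template.yaml

# Convert a saved checkpoint into a `rwkv_model.pth`, per the comments in the config
python3 export_checkpoint.py ../checkpoint/V5-Base-1B5-mem-instruct/last.ckpt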
76 changes: 76 additions & 0 deletions
notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py

@@ -0,0 +1,76 @@
import numpy as np
import argparse
import json

prompt_template = """
You are an AI assistant who will be given some tasks to complete.
First, you will be given a name to remember. Then, you will have to sum up a series of numbers.
You will then be asked to answer some questions about the document.
Example 1:
Name: John
1
-2
3
-4
### Question:
What is the total sum?
### Answer:
-2
### Question:
What is the name given at the start of the document?
### Answer:
John
Now you will be tasked to remember the name and sum up the following series of numbers.
"""

task_templates = [
    "\n### Question:\nWhat is the name given at the start of the document?\n\n### Answer:\n",
    "\n### Question:\nWhat is the sum of the numbers given?\n\n### Answer:\n"
]

completion_templates = [
    "\n{name}\n",
    "\n{sum_of_numbers}\n",
]

def load_names(file_path):
    # Load the list of candidate names, one whitespace-separated name per entry
    with open(file_path) as word_file:
        valid_names = word_file.read().split()
    return valid_names

names = load_names("infctx-math-and-name/names.txt")

def get_random_prompt_completion_pair(max_numbers):
    # Build the document body: a column of random integers in [-200, 200)
    document = ""
    numbers = np.random.randint(-200, 200, size=max_numbers)
    total_sum = np.sum(numbers)
    for number in numbers:
        document += str(number) + "\n"

    # Pick one of the two tasks (recall the name, or sum the numbers) at random
    template_index = np.random.randint(0, len(task_templates))
    task = task_templates[template_index]
    name = names[np.random.randint(0, len(names))]

    prompt = prompt_template + f"Name: {name}\n" + document + task
    completion = completion_templates[template_index].format(sum_of_numbers=total_sum, name=name)
    return {'prompt': prompt, 'completion': completion}

def generate_jsonl(output_file_path, max_numbers, num_samples):
    with open(output_file_path, 'w') as output_file:
        for _ in range(num_samples):
            pair = get_random_prompt_completion_pair(max_numbers)
            # Serialize with json.dumps (rather than str()) so each line is valid
            # JSON, as expected by the huggingface `json` dataset loader
            output_file.write(json.dumps(pair) + "\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--out-file", type=str, default="questions.jsonl")
    parser.add_argument("--max-numbers", type=int, default=100)
    parser.add_argument("--num-samples", type=int, default=10)
    args = parser.parse_args()
    generate_jsonl(args.out_file, args.max_numbers, args.num_samples)
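
As a usage sketch (the flags come from the argparse definition above; the output path is illustrative, chosen to match the `source_data_dir` in the config):

python3 generate_math_and_name_dataset.py --out-file dataset/questions.jsonl --max-numbers 100 --num-samples 10

Each output line is a JSON object with `prompt` and `completion` keys, which is the prompt/completion jsonl format the config above consumes via `source: json`.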