Commit

Merge branch 'rwkv-x-playground' of https://github.com/RWKV/RWKV-infctx-trainer into rwkv-x-playground
PicoCreator committed Sep 17, 2023
2 parents a8bbe67 + 136dc7b commit 792981d
Showing 4 changed files with 694 additions and 1 deletion.
197 changes: 197 additions & 0 deletions notebook/experiment/infctx-math-and-name/config-mem-template.yaml
@@ -0,0 +1,197 @@
# lightning.pytorch==2.0.2
seed_everything: true
trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
num_nodes: 1

# Configure the deepspeed strategy,
strategy: deepspeed_stage_1

# Floating point precision for the model. Because RWKV is built for bf16,
# you should pretty much never change this setting
precision: bf16

# Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section
# ---
logger:
class_path: lightning.pytorch.loggers.WandbLogger
init_args:
name: 'Memory Instruct (bs=256, train-ctx=512)'
project: 'RWKV-X-Experiments'
tags: ['RWKV', 'RWKV-X']

# Checkpoint settings for the training process
callbacks:
class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
# Configure this to the path you want to save your checkpoints to
# note that a subdir will be created with the name `epoch=x-step=y.ckpt`
#
# to convert a checkpoint to a model, you can use the
# `python3 export_checkpoint.py <checkpoint path>` script,
# which will create a `rwkv_model.pth` in the checkpoint directory.
#
# Do not use the `zero_to_fp32.py` script as that will have export format issues
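#
# For example (illustrative path, following the dirpath below and the `epoch=x-step=y.ckpt` naming above):
#   python3 export_checkpoint.py ../checkpoint/V5-Base-1B5-mem-instruct/epoch=0-step=25.ckpt/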
dirpath: ../checkpoint/V5-Base-1B5-mem-instruct/
filename: null

# Save the top/last K checkpoints
save_top_k: 2
# Choose by the most recent checkpoints (by training step)
monitor: 'step'
mode: max

# If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
# useful to simplify checkpoint resume scripts, at the price of disk performance
save_last: true

# DO NOT set this to true, as the exported model weights will have format issues
# export as a checkpoint instead, and use the `export_checkpoint.py` script to convert it to a model
save_weights_only: false

# How frequently you want to save a checkpoint, in training steps.
# A checkpoint is saved every X data samples, where X = every_n_train_steps * accumulate_grad_batches
#
# In general you will want to avoid setting this too low (especially if accumulate_grad_batches <= 100),
# as the checkpoint process will pause all GPU training for some time, slowing down the overall run.
# However you also do not want to set it too high, or you will lose too much progress if the training crashes
every_n_train_steps: 25
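# Illustrative arithmetic (assuming accumulate_grad_batches works out to the
# target_batch_size of 256 below): 25 * 256 = a checkpoint roughly every 6,400 data samples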
every_n_epochs: null
save_on_train_epoch_end: true
train_time_interval: null

# Other settings, you can probably leave alone
verbose: false
auto_insert_metric_name: true

########################################
## Training run parameter settings
########################################

# Generally what you want to configure is the maximum number of epochs
# Leave it as -1 and it will keep going until interrupted
# Or set it as a number, and it will stop after that number of epochs
max_epochs: 1
min_epochs: null

# Number of data samples to train on for each step; a data sample is considered
# a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
#
# This decides the number of data samples to learn from together, before backpropagating
# any weight changes at the end of the batch.
#
# Recommended to be a large enough number (like 128/256) to prevent the training
# loss from fluctuating in the process, but not so large that the increased
# GPU vRAM / offloaded RAM usage will cause the training to crash.
#
# You are also recommended to configure this to a large enough number to fully utilize
# your GPU processing time, and avoid idle time for the GPU between batches
target_batch_size: 256

########################################
## Training model settings
########################################
model:
# Model to start the finetune/training process from
load_model: ../model/EWR-1B5-enwiki-instruct.pth

# Context length to use for the training process
# the larger the number (and batch size) the larger the vram usage
#
# Note that if the data sample's context length is larger than the ctx_len,
# its training process will be split into ctx_len sized chunks.
#
# This allows the training of extremely large context lengths (eg. 100k),
# without eating up too much vram, by keeping the training context length
# to a reasonable number suitable for the current GPU setup
ctx_len: 512
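# Illustrative example: a 2048 token data sample trained with ctx_len 512
# would be processed as 4 chunks of 512 tokens each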

# Learning rate of the training process
# ---
# Initial learning rate of the process
lr_init: 8e-4
# Final learning rate after the learning rate period
# learning rate will stay at final value from then onwards
#
# NOTE: lr_final / lr_period does not work with warmup_steps
# and will be ignored (or replaced) with the warmup_steps logic instead
lr_final: 5e-4

# Number of epoch to reduce the learning rate from lr_init to lr_final
# 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
# 0 means lr_final will apply immediately
# -1 means we take the current max_step / max_epoch as the period
lr_period: 1
# lr_period type, if it is set; defaults to epoch
lr_period_type: epoch

# We limit bptt_learning_range to 1, to ensure high throughput within a multi-gpu setup
# (by skipping some synchronization code). Additionally, bptt learning should not be triggered
# anyway, as the data sample should be within ctx size 99% of the time
bptt_learning: true
bptt_learning_range: 1

data:
# dataset_path for the prebuilt dataset, using HF `load_from_disk()`
#
# Use this if you have built your own dataset and saved it with `save_to_disk()`
# with source left as null. Otherwise configure this to a directory in which the
# dataset will be built and tokenized by the huggingface dataset process.
data_path: ../datapath/picocreator/experiment/rwkv-x-exp/memory/

# Otherwise provide the source path, which is used as the huggingface dataset path;
# this will be used to populate the dataset_path
#
# Use either the following
# - hugging face dataset
# - Directory path to a directory containing dataset files
# - Path to a single dataset file
# - hugging face dataset mode (ie: text, csv, etc - use data_dir to configure the path in that case)
# - null
#
# If source is disabled (null), all other params except data_path are ignored
source: json
# source: text
# source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt

# Use data_dir if you are using source=text/json/etc;
# this should be relative to the trainer script path
source_data_dir: ../notebook/experiment/infctx-math-and-name/dataset/

# After loading the dataset, split out test data used for validation.
# This process is skipped if the dataset already includes a test split,
# or if this is set to zero
test_split: 0.001
test_split_shuffle: true
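# Illustrative: with the 10,000 samples generated by the notebook below,
# a 0.001 split holds out roughly 10 samples for validation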

# Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
# If using a custom tokenizer, provide the tokenizer file path
# ---
tokenizer: neox

# Minimum / Maximum token size of the dataset to use
# useful for filtering out small noisy data samples from large datasets
# (eg. removal of small articles of fewer than 512 tokens from wikipedia)
#
# This is ignored if set to -1
min_token_size: -1
max_token_size: 512

# Multi-column merging process; the default setting is used to support and merge
# "instruction", "input", "output" datasets. To disable, set multi_column_keys to []
#
# A minimum of 2 columns, with non-empty data, is required for the merge to occur.
# If no match is found, this will fall back to the default prompt/completion or text column,
# or throw an error if the default fallback is not found
# ---
multi_column_keys: ['input_prefix', 'input', 'output_prefix', 'output', 'closing']
multi_column_prefix: ['', '', '', '', '']
multi_column_train_mask: [true, false, true, true, true]
multi_column_separator: ''

# If processing prompt/completion jsonl pairs, the prompt is masked by default
# use this flag to disable this default behaviour
# ---
# disable_prompt_mask: false
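
As a rough, standalone illustration of what the `data:` block above amounts to (this is not the trainer's actual data module; the directory path is taken from the config, and the seed is an assumption for the sketch):

from datasets import load_dataset

# source: json + source_data_dir -> load every *.jsonl file in the dataset directory
raw = load_dataset("json", data_dir="../notebook/experiment/infctx-math-and-name/dataset/")

# test_split: 0.001 with test_split_shuffle: true -> hold out ~0.1% of samples for validation
splits = raw["train"].train_test_split(test_size=0.001, shuffle=True, seed=42)  # seed is illustrative
print(splits["train"].num_rows, splits["test"].num_rows)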
76 changes: 76 additions & 0 deletions notebook/experiment/infctx-math-and-name/generate_math_and_name_dataset.py
@@ -0,0 +1,76 @@
import numpy as np
import argparse
import json

prompt_template = f"""
You are an AI assistant who will be given some tasks to complete.
First, you will be given a name to remember. Then, you will have to sum up a series of numbers.
You will then be asked to answer some questions about the document.
Example 1:
Name: John
1
-2
3
-4
### Question:
What is the total sum?
### Answer:
-2
### Question:
What is the name given at the start of the document?
### Answer:
John
Now you will be tasked to remember the name and sum up the following series of numbers.
"""

task_templates = [
"\n### Question:\nWhat is the name given at the start of the document?\n\n### Answer:\n",
"\n### Question:\nWhat is the sum of the numbers given?\n\n### Answer:\n"
]

completion_templates = [
"\n{name}\n",
"\n{sum_of_numbers}\n",
]

def load_names(file_path):
    # Load the list of candidate names (one whitespace-separated token each)
    with open(file_path) as word_file:
        valid_names = list(word_file.read().split())
    return valid_names

names = load_names("infctx-math-and-name/names.txt")

def get_random_prompt_completion_pair(max_numbers):
    # Build the number list portion of the document
    document = ""
    numbers = np.random.randint(-200, 200, size=max_numbers)
    total_sum = np.sum(numbers)
    for number in numbers:
        document += str(number) + "\n"

    # Pick one of the two tasks (name recall, or sum of the numbers) and a random name
    template_index = np.random.randint(0, len(task_templates))
    task = task_templates[template_index]
    name = names[np.random.randint(0, len(names))]

    prompt = prompt_template + f"Name: {name}\n" + document + task
    completion = completion_templates[template_index].format(sum_of_numbers=total_sum, name=name)
    return {'prompt': prompt, 'completion': completion}

def generate_jsonl(output_file_path, max_numbers, num_samples):
    with open(output_file_path, 'w') as output_file:
        for _ in range(num_samples):
            pair = get_random_prompt_completion_pair(max_numbers)
            # Write valid JSON (not the Python dict repr) so the .jsonl can be parsed downstream
            output_file.write(json.dumps(pair) + "\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--out-file", type=str, default="questions.jsonl")
    parser.add_argument("--max-numbers", type=int, default=100)
    parser.add_argument("--num-samples", type=int, default=10)
    args = parser.parse_args()
    generate_jsonl(args.out_file, args.max_numbers, args.num_samples)
64 changes: 63 additions & 1 deletion notebook/experiment/infctx-math-and-name/run.ipynb
@@ -1,5 +1,53 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"## Generating math and name dataset ##\n",
"## Done ##\n",
"total 15M\n",
"drwx------ 2 christopherchou u-christopherchou 3 Sep 16 17:12 .\n",
"drwx------ 13 christopherchou u-christopherchou 14 Sep 16 17:01 ..\n",
"-rw------- 1 christopherchou u-christopherchou 55M Sep 16 17:12 questions_numbers_1024.jsonl\n"
]
}
],
"source": [
"%%script bash\n",
"\n",
"########################################\n",
"# Generate the required jsonl dataset\n",
"########################################\n",
"\n",
"# Go to config dir\n",
"cd \"../\"\n",
"\n",
"# Reset the dataset dir\n",
"mkdir -p ../dataset\n",
"rm -rf ../dataset/*.jsonl\n",
"\n",
"# Generate the various datasets\n",
"echo \"## Generating math and name dataset ##\"\n",
"\n",
"#\n",
"# We reduce the training set for lower word count - and shift the focus upwards\n",
"#\n",
"# do\n",
"python3 infctx-math-and-name/generate_math_and_name_dataset.py --out-file ../dataset/questions_numbers_1024.jsonl --max-numbers 1024 --num-samples 10000\n",
"# done\n",
"\n",
"wait\n",
"echo \"## Done ##\"\n",
"\n",
"ls -alh ../dataset/"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -9,8 +57,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4
},