From cb223bcda6f416593e688f8d875b7764193b6269 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 05:46:53 +0000 Subject: [PATCH 01/33] feat: multiple turn conversation format support --- RWKV-v5/config-example.yaml | 34 ++++++++++-- RWKV-v5/src/data.py | 107 ++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 4 deletions(-) diff --git a/RWKV-v5/config-example.yaml b/RWKV-v5/config-example.yaml index 9b191aec..97f89919 100644 --- a/RWKV-v5/config-example.yaml +++ b/RWKV-v5/config-example.yaml @@ -427,11 +427,37 @@ data: # # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34 # --- - # multi_column_keys: ['instruction', 'input', 'output'] - # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] - # multi_column_suffix: ['\n\n', '\n\n', '\n\n'] + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] # multi_column_train_mask: [true, false, true] - # multi_column_separator: '\n\n' + # multi_column_separator: "\n\n" + + # Conversation merging process + # useful for merging full conversational datasets, into single documents + # default is off, (or set conversation_key to []) + # conversation_formatting supports "iopairs" or "sender" for now. + # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + + # Iopairs config + # This means that every object in the conversation object is a pair of input output. + # In future it will also support a format where one of the keys dictates the format style + # if conversation_key is set to null, it will use the root object as the conversation object + # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: ""} + # conversation_input_key_mask: {'input': false, 'output': true} + + # Sender config + # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # conversation_format: 'sender' + # conversation_input_key_map: {'message': "\n\n{sender}: "", 'context': ''} - processed in order on each turn + # + # conversation_sender_key: 'sender' + # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'assistant': true, 'system': false} + # If processing prompt/completion jsonl pairs, the prompt is masked by default # use this flag to disable this default behaviour diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 8ba9cae1..dda3bfa7 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -46,6 +46,7 @@ def prepare_data_static(**kargs): # Special handling for binidx #-------------------------------- + # TODO: verify this works, i have a suspicion this just creates a new "document" for each token. 
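# --- Illustrative note (not part of the patch) -----------------------------
# The two conversation formats documented in config-example.yaml above expect
# records shaped roughly as follows. The field names ('conversation', 'input',
# 'output', 'sender', 'message') mirror the example config and are assumptions;
# the actual schema depends on the dataset being used.
iopairs_record_example = {
    "conversation": [
        {"input": "What is RWKV?", "output": "RWKV is an RNN with transformer-level performance."},
        {"input": "Can it be fine-tuned?", "output": "Yes, for example with this trainer."},
    ]
}
sender_record_example = {
    "conversation": [
        {"sender": "user", "message": "What is RWKV?"},
        {"sender": "assistant", "message": "RWKV is an RNN with transformer-level performance."},
    ]
}
# With the example prefix maps, the first 'iopairs' turn would be rendered as
#   "\n\nUser: What is RWKV?\n\nAssistant: RWKV is an RNN with ..."
# and only the 'output' half would be trained on (attention_mask = 1), since
# conversation_input_key_mask marks 'input' as false.
# ----------------------------------------------------------------------------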
if kargs["tokenizer"] == "binidx": from .dataflow.binidx import MMapIndexedDataset @@ -236,6 +237,25 @@ def encodeTokens(x): if multi_column_separator is not None and len(multi_column_separator) > 0: multi_column_separator_encodings = encodeTokens(multi_column_separator) + conversation_prefix_encoding_map = {} + conversation_enabled = False + if 'conversation_format' in kargs and kargs["conversation_format"] is not None: + if kargs["conversation_format"] == "iopairs": + # preencode all prefixes (keyed by the input key) + for key, prefix in kargs['conversation_input_key_prefix_map'].items(): + conversation_prefix_encoding_map[key] = encodeTokens(prefix) + conversation_enabled = True + elif kargs["conversation_format"] == "sender": + # preencode all prefixes (keyed by the sender value) + for key, relabel in kargs['conversation_sender_value_map'].items(): + for input_key, value in kargs['conversation_input_key_map'].items(): + if input_key not in conversation_prefix_encoding_map: + conversation_prefix_encoding_map[input_key] = {} + conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) + # example conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') + + conversation_enabled = True + # Maps the dataset record to the tokenized result # handles a wide variety of format according to the data configuration # @@ -253,6 +273,79 @@ def map_tokenizer(x): if kargs["custom_text_key"] in x: return encodeTokens(x[kargs["custom_text_key"]]) + if conversation_enabled: + conv_key = kargs['conversation_key'] if 'conversation_key' in kargs else None + conversation = x[conv_key] if conv_key is not None else x + + # Array of output values we will return + input_ids = [] + token_type_ids = [] + attention_mask = [] + + if kargs['conversation_format'] == 'iopairs': + # lets loop through each io pair + for i in range(len(conversation)): + # lets loop through each key in the io pair + for key, value in conversation[i].items(): + # lets get the prefix for this key + prefix = conversation_prefix_encoding_map[key] + + # Add the prefix + if prefix is not None: + input_ids += prefix['input_ids'] + token_type_ids += prefix['token_type_ids'] + attention_mask += prefix['attention_mask'] + + # Tokenize the column + column_encodings = encodeTokens(value) + + # Add the column + input_ids += column_encodings['input_ids'] + token_type_ids += column_encodings['token_type_ids'] + + if key not in kargs["conversation_input_key_mask"] or kargs["conversation_input_key_mask"][key]: + # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data + attention_mask += ([1] * len(column_encodings['input_ids'])) + else: # kargs["conversation_input_key_mask"][key] is False + # This means it is false, lets not pay attention to it + attention_mask += ([0] * len(column_encodings['input_ids'])) + + elif kargs['conversation_format'] == 'sender': + for i in range(len(conversation)): + turn = conversation[i] + sender = turn[kargs['conversation_sender_key']] + + for key, value in kargs['conversation_input_key_map'].items(): + if key in turn: + # lets get the prefix for this key + prefix = conversation_prefix_encoding_map[key][sender] + + # Add the prefix + if prefix is not None: + input_ids += prefix['input_ids'] + token_type_ids += prefix['token_type_ids'] + attention_mask += prefix['attention_mask'] + + # Tokenize the column + column_encodings = encodeTokens(turn[key]) + + # Add the column + input_ids += column_encodings['input_ids'] + 
token_type_ids += column_encodings['token_type_ids'] + + if sender not in kargs["conversation_sender_mask"] or kargs["conversation_sender_mask"][sender]: + # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data + attention_mask += ([1] * len(column_encodings['input_ids'])) + else: # kargs["conversation_input_key_mask"][key] is False + # This means it is false, lets not pay attention to it + attention_mask += ([0] * len(column_encodings['input_ids'])) + + return { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask + } + # Multi column merging support if multi_column_enabled: # Lets count the number of columns we have @@ -600,6 +693,20 @@ def __init__( multi_column_suffix: list = None, multi_column_train_mask: list = None, multi_column_separator: str = None, + # Conversation format support + conversation_format: str = None, + conversation_key: str = None, + + # conversation_format == 'iopairs' + conversation_input_key_prefix_map: dict = None, + conversation_input_key_mask: dict = None, + + # conversation_format == 'sender' + conversation_sender_key: str = None, + conversation_sender_value_map: dict = None, + conversation_input_key_map: dict = None, + conversation_sender_mask: dict = None, + # prompt/completion format masking support disable_prompt_completion_mask: bool = False, # Skip database setup checks if datapath exists, ignored if using preload_datapath.py From 0c71c37639d4ced877cfad201c03d1f9b522a21a Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 06:36:33 +0000 Subject: [PATCH 02/33] updated datasets --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c4d6b930..e0f4df48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torchaudio torchvision lightning==2.0.5 deepspeed==0.10.0 -datasets==2.13.1 +datasets==2.15.0 transformers==4.30.2 ninja==1.11.1 numexpr==2.8.4 From ee6398afb3fcc28d17359f86538757a177a6cfd0 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 07:03:37 +0000 Subject: [PATCH 03/33] fix masking --- RWKV-v5/src/data.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index dda3bfa7..37b69e53 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -294,7 +294,6 @@ def map_tokenizer(x): if prefix is not None: input_ids += prefix['input_ids'] token_type_ids += prefix['token_type_ids'] - attention_mask += prefix['attention_mask'] # Tokenize the column column_encodings = encodeTokens(value) @@ -305,10 +304,10 @@ def map_tokenizer(x): if key not in kargs["conversation_input_key_mask"] or kargs["conversation_input_key_mask"][key]: # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data - attention_mask += ([1] * len(column_encodings['input_ids'])) + attention_mask += ([1] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) else: # kargs["conversation_input_key_mask"][key] is False # This means it is false, lets not pay attention to it - attention_mask += ([0] * len(column_encodings['input_ids'])) + attention_mask += ([0] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) elif kargs['conversation_format'] == 'sender': for i in range(len(conversation)): @@ -324,7 +323,6 @@ def map_tokenizer(x): if prefix is not None: input_ids += prefix['input_ids'] 
token_type_ids += prefix['token_type_ids'] - attention_mask += prefix['attention_mask'] # Tokenize the column column_encodings = encodeTokens(turn[key]) @@ -335,10 +333,16 @@ def map_tokenizer(x): if sender not in kargs["conversation_sender_mask"] or kargs["conversation_sender_mask"][sender]: # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data - attention_mask += ([1] * len(column_encodings['input_ids'])) + attention_mask += ([1] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) else: # kargs["conversation_input_key_mask"][key] is False # This means it is false, lets not pay attention to it - attention_mask += ([0] * len(column_encodings['input_ids'])) + attention_mask += ([0] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) + + print({ + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask + }) return { 'input_ids': input_ids, From f803ac8aa1046602813b3c00e55e7de0f482835a Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 07:10:37 +0000 Subject: [PATCH 04/33] fix: remove debug print; --- RWKV-v5/src/data.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 37b69e53..fb2c238f 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -338,12 +338,6 @@ def map_tokenizer(x): # This means it is false, lets not pay attention to it attention_mask += ([0] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) - print({ - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask - }) - return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, From ca25bd03c0957e738b87f0928599e55df1f05e64 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 08:22:51 +0000 Subject: [PATCH 05/33] fix --- RWKV-v5/src/model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 199480d1..86cb7025 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -1210,6 +1210,10 @@ def training_step(self, batch, batch_idx): gc.collect() torch.cuda.empty_cache() + # if loss not a number return None + if torch.isnan(total_loss): + return None + return total_loss @TCompileBaseline From fe42709582f3fb1567157d6ac2edca250d664445 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 12:36:54 +0000 Subject: [PATCH 06/33] fix config --- RWKV-v5/config-example.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RWKV-v5/config-example.yaml b/RWKV-v5/config-example.yaml index 97f89919..727e3cc7 100644 --- a/RWKV-v5/config-example.yaml +++ b/RWKV-v5/config-example.yaml @@ -446,7 +446,7 @@ data: # In future it will also support a format where one of the keys dictates the format style # if conversation_key is set to null, it will use the root object as the conversation object # --- - # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: ""} + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} # conversation_input_key_mask: {'input': false, 'output': true} # Sender config From cc674770083dc823f50ff38af291b860cf1ee4e2 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 4 Dec 2023 14:31:13 +0000 Subject: [PATCH 07/33] fix: masking issue --- RWKV-v5/src/data.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index fb2c238f..dda3bfa7 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -294,6 +294,7 @@ def map_tokenizer(x): if prefix is not None: input_ids += prefix['input_ids'] token_type_ids += prefix['token_type_ids'] + attention_mask += prefix['attention_mask'] # Tokenize the column column_encodings = encodeTokens(value) @@ -304,10 +305,10 @@ def map_tokenizer(x): if key not in kargs["conversation_input_key_mask"] or kargs["conversation_input_key_mask"][key]: # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data - attention_mask += ([1] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) + attention_mask += ([1] * len(column_encodings['input_ids'])) else: # kargs["conversation_input_key_mask"][key] is False # This means it is false, lets not pay attention to it - attention_mask += ([0] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) + attention_mask += ([0] * len(column_encodings['input_ids'])) elif kargs['conversation_format'] == 'sender': for i in range(len(conversation)): @@ -323,6 +324,7 @@ def map_tokenizer(x): if prefix is not None: input_ids += prefix['input_ids'] token_type_ids += prefix['token_type_ids'] + attention_mask += prefix['attention_mask'] # Tokenize the column column_encodings = encodeTokens(turn[key]) @@ -333,10 +335,10 @@ def map_tokenizer(x): if sender not in kargs["conversation_sender_mask"] or kargs["conversation_sender_mask"][sender]: # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data - attention_mask += ([1] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) + attention_mask += ([1] * len(column_encodings['input_ids'])) else: # kargs["conversation_input_key_mask"][key] is False # This means it is false, lets not pay attention to it - attention_mask += ([0] * (len(column_encodings['input_ids']) + (len(prefix['input_ids']) if prefix is not None else 0))) + attention_mask += ([0] * len(column_encodings['input_ids'])) return { 'input_ids': input_ids, From 926e83030daae550bbae01aa1132524d7ddd1ab2 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 5 Dec 2023 09:38:46 +0000 Subject: [PATCH 08/33] add multi turn suffix --- RWKV-v5/src/data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index dda3bfa7..f26202b0 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -238,6 +238,7 @@ def encodeTokens(x): multi_column_separator_encodings = encodeTokens(multi_column_separator) conversation_prefix_encoding_map = {} + conversation_suffix_encoding_map = {} conversation_enabled = False if 'conversation_format' in kargs and kargs["conversation_format"] is not None: if kargs["conversation_format"] == "iopairs": @@ -252,6 +253,9 @@ def encodeTokens(x): if input_key not in conversation_prefix_encoding_map: conversation_prefix_encoding_map[input_key] = {} conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) + + for key, suffix in kargs['conversation_sender_suffix'].items(): + conversation_suffix_encoding_map[key] = encodeTokens(suffix) # example conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') conversation_enabled = True @@ -340,6 +344,13 @@ def map_tokenizer(x): # This means it is false, lets not pay attention to it attention_mask += ([0] * 
len(column_encodings['input_ids'])) + suffix = conversation_suffix_encoding_map[sender] + + if suffix is not None: + input_ids += suffix['input_ids'] + token_type_ids += suffix['token_type_ids'] + attention_mask += suffix['attention_mask'] + return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, @@ -705,6 +716,7 @@ def __init__( conversation_sender_key: str = None, conversation_sender_value_map: dict = None, conversation_input_key_map: dict = None, + conversation_sender_suffix: dict = None, conversation_sender_mask: dict = None, # prompt/completion format masking support From 414bdb1f90665a63ef57059ab253adadbcaa97e0 Mon Sep 17 00:00:00 2001 From: root Date: Sat, 16 Dec 2023 21:52:23 +0000 Subject: [PATCH 09/33] conversational data final --- RWKV-v5/config-example.yaml | 13 +++++++------ RWKV-v5/src/data.py | 26 +++++++++++++++++++++----- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/RWKV-v5/config-example.yaml b/RWKV-v5/config-example.yaml index 727e3cc7..49f15037 100644 --- a/RWKV-v5/config-example.yaml +++ b/RWKV-v5/config-example.yaml @@ -440,24 +440,25 @@ data: # --- # conversation_format: 'iopairs' # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser:" - # Iopairs config + # Iopairs specific config # This means that every object in the conversation object is a pair of input output. # In future it will also support a format where one of the keys dictates the format style # if conversation_key is set to null, it will use the root object as the conversation object # --- # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} # conversation_input_key_mask: {'input': false, 'output': true} + # conversation_sender_suffix: {'input': "", 'output': ""} - # Sender config + # Sender specific config # This means that every object in the conversation object is a single message (with sender and message keys - or similar) - # conversation_format: 'sender' - # conversation_input_key_map: {'message': "\n\n{sender}: "", 'context': ''} - processed in order on each turn - # + # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''} # conversation_sender_key: 'sender' # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} # conversation_sender_mask: {'user': false, 'assistant': true, 'system': false} - + # conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""} # If processing prompt/completion jsonl pairs, the prompt is masked by default # use this flag to disable this default behaviour diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index f26202b0..bf62dd0d 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -239,6 +239,7 @@ def encodeTokens(x): conversation_prefix_encoding_map = {} conversation_suffix_encoding_map = {} + conversation_end_of_conversation_token = encodeTokens(kargs['conversation_end_of_conversation']) conversation_enabled = False if 'conversation_format' in kargs and kargs["conversation_format"] is not None: if kargs["conversation_format"] == "iopairs": @@ -254,9 +255,10 @@ def encodeTokens(x): conversation_prefix_encoding_map[input_key] = {} conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) - for key, suffix in kargs['conversation_sender_suffix'].items(): - conversation_suffix_encoding_map[key] = encodeTokens(suffix) - # example 
conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') + if kargs['conversation_sender_suffix'] is not None: + for key, suffix in kargs['conversation_sender_suffix'].items(): + conversation_suffix_encoding_map[key] = encodeTokens(suffix) + # example conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') conversation_enabled = True @@ -313,6 +315,14 @@ def map_tokenizer(x): else: # kargs["conversation_input_key_mask"][key] is False # This means it is false, lets not pay attention to it attention_mask += ([0] * len(column_encodings['input_ids'])) + + + suffix = conversation_suffix_encoding_map[key] if sender in conversation_suffix_encoding_map else None + + if suffix is not None: + input_ids += suffix['input_ids'] + token_type_ids += suffix['token_type_ids'] + attention_mask += suffix['attention_mask'] elif kargs['conversation_format'] == 'sender': for i in range(len(conversation)): @@ -322,7 +332,7 @@ def map_tokenizer(x): for key, value in kargs['conversation_input_key_map'].items(): if key in turn: # lets get the prefix for this key - prefix = conversation_prefix_encoding_map[key][sender] + prefix = conversation_prefix_encoding_map[key][sender] if sender in conversation_prefix_encoding_map[key] else None # Add the prefix if prefix is not None: @@ -344,13 +354,18 @@ def map_tokenizer(x): # This means it is false, lets not pay attention to it attention_mask += ([0] * len(column_encodings['input_ids'])) - suffix = conversation_suffix_encoding_map[sender] + suffix = conversation_suffix_encoding_map[sender] if sender in conversation_suffix_encoding_map else None if suffix is not None: input_ids += suffix['input_ids'] token_type_ids += suffix['token_type_ids'] attention_mask += suffix['attention_mask'] + if len(input_ids) > 0: + input_ids += conversation_end_of_conversation_token['input_ids'] + token_type_ids += conversation_end_of_conversation_token['token_type_ids'] + attention_mask += conversation_end_of_conversation_token['attention_mask'] + return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, @@ -718,6 +733,7 @@ def __init__( conversation_input_key_map: dict = None, conversation_sender_suffix: dict = None, conversation_sender_mask: dict = None, + conversation_end_of_conversation: str = None, # prompt/completion format masking support disable_prompt_completion_mask: bool = False, From 1db64af57e424493a4700aa77c3719137d2b9893 Mon Sep 17 00:00:00 2001 From: Nathan Wilce Date: Sat, 16 Dec 2023 21:54:35 +0000 Subject: [PATCH 10/33] add multi turn suffix --- RWKV-v5/src/data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index dda3bfa7..f26202b0 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -238,6 +238,7 @@ def encodeTokens(x): multi_column_separator_encodings = encodeTokens(multi_column_separator) conversation_prefix_encoding_map = {} + conversation_suffix_encoding_map = {} conversation_enabled = False if 'conversation_format' in kargs and kargs["conversation_format"] is not None: if kargs["conversation_format"] == "iopairs": @@ -252,6 +253,9 @@ def encodeTokens(x): if input_key not in conversation_prefix_encoding_map: conversation_prefix_encoding_map[input_key] = {} conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) + + for key, suffix in kargs['conversation_sender_suffix'].items(): + conversation_suffix_encoding_map[key] = encodeTokens(suffix) # example conversation_prefix_encoding_map['message']['user'] 
= encodeTokens('\n\nUser:') conversation_enabled = True @@ -340,6 +344,13 @@ def map_tokenizer(x): # This means it is false, lets not pay attention to it attention_mask += ([0] * len(column_encodings['input_ids'])) + suffix = conversation_suffix_encoding_map[sender] + + if suffix is not None: + input_ids += suffix['input_ids'] + token_type_ids += suffix['token_type_ids'] + attention_mask += suffix['attention_mask'] + return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, @@ -705,6 +716,7 @@ def __init__( conversation_sender_key: str = None, conversation_sender_value_map: dict = None, conversation_input_key_map: dict = None, + conversation_sender_suffix: dict = None, conversation_sender_mask: dict = None, # prompt/completion format masking support From 071e68334bceff48cc9b4837fbe0bfce239e4595 Mon Sep 17 00:00:00 2001 From: "@picocreator (Eugene Cheah)" Date: Sat, 16 Dec 2023 16:39:12 -0800 Subject: [PATCH 11/33] prototype dataset packing --- RWKV-v5/src/data.py | 180 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 169 insertions(+), 11 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 8ba9cae1..f8cf09de 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -4,6 +4,7 @@ from torch.utils.data import DataLoader from torch.utils.data import DistributedSampler +import math import wandb from datasets import load_from_disk, load_dataset, Dataset from transformers import PreTrainedTokenizerFast, AutoTokenizer @@ -418,6 +419,11 @@ def rechunk_text(x): 'attention_mask': out_attention_mask, } return ret + + # Get the kargs["processing_max_batch_size"], if not set, we will use the full dataset + processing_max_batch_size = kargs["processing_max_batch_size"] + if processing_max_batch_size <= 0: + processing_max_batch_size = len(src_dataset["train"]) # Remove empty datasets (it causes an error otherwise) # and perform min/max length filtering (if configured) @@ -435,14 +441,14 @@ def dataset_filter(x): # Perform rechunking if needed for "text" based datasets if kargs["source"] == "text" and kargs["text_rechunk_size"] > 0 and kargs["text_rechunk_auto"]: src_dataset = src_dataset.map(rechunk_text, batched=True, - batch_size=kargs["text_rechunk_size"]*10, + batch_size=processing_max_batch_size, num_proc=num_cpus) # Perform rechunking after filtering, if source is not a "text" based # dataset and text_rechunk_force is enabled if kargs["source"] != "text" and kargs["text_rechunk_size"] > 0 and kargs["text_rechunk_force"]: src_dataset = src_dataset.map(rechunk_text, batched=True, - batch_size=kargs["text_rechunk_size"]*2, + batch_size=processing_max_batch_size, num_proc=num_cpus) # Check if the dataset does not have a test split @@ -457,19 +463,134 @@ def dataset_filter(x): seed=42 #Fixed seed, to prevent train/test reshuffling between test runs ) - # Perform a sort by length, only after test split - if kargs["sort_by_length"]: - sort_asc = kargs["sort_asc"] - + # Compute the sample length, as requried for the sort by length feature + if kargs["sort_by_length"] and not kargs["packing_enabled"]: def add_length(example): - example["input_length"] = len(example['input_ids']) + example["sample_length"] = len(example['input_ids']) return example src_dataset['train'] = src_dataset['train'].map(add_length, batched=False, num_proc=num_cpus) - - # sort by length (not sorting the columns, just the rows) - src_dataset['train'] = src_dataset['train'].sort("input_length", reverse=not sort_asc) + # Does the actual sorting process (after test split!) 
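# --- Illustrative note (not part of the patch) -----------------------------
# Toy demonstration of the sample_length / sort_by_length behaviour, using
# only the Hugging Face `datasets` API; independent of the trainer code.
from datasets import Dataset

toy = Dataset.from_dict({"input_ids": [[1, 2, 3], [4], [5, 6]]})
toy = toy.map(lambda ex: {"sample_length": len(ex["input_ids"])})
toy = toy.sort("sample_length", reverse=True)  # equivalent of sort_asc: false
print(toy["sample_length"])                    # -> [3, 2, 1]
# ----------------------------------------------------------------------------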
+ # Skipped if dataset packing is enabled (as this woould be redundant) + if kargs["sort_by_length"] and not kargs["packing_enabled"]: + sort_asc = kargs["sort_asc"] + src_dataset['train'] = src_dataset['train'].sort("sample_length", reverse=not sort_asc) + + # Implement dataset packing, which merges the dataset row records, into "fixed sizes" + # this is done by merging multiple dataset samples, with a 0 token in between + # to form a single dataset sample of the desired size + # + # The longest dattaset sample (below the pack size) will be appended with the shortest + # dataset samples, until the desired pack size is reached. With the process + # repeated for all samples + # + # This however will mess up the "real_ctx_len" value, as it will be the length of the + # of the merged dataset samples, instead of the original dataset sample. + # --- + if kargs["packing_enabled"]: + + # def add_length(example): + # example["sample_length"] = len(example['input_ids']) + # return example + # src_dataset['train'] = src_dataset['train'].map(add_length, batched=False, num_proc=num_cpus) + + # The pack size + packing_batchsize = kargs["packing_batchsize"] + packing_chunksize = kargs["packing_chunksize"] + + # The pack function + def pack_dataset_in_sequence(x): + + # The return resulting arrays + id_arr = [] + type_arr = [] + mask_arr = [] + sample_len_arr = [] + + # batch set chunk counting + batchset_chunksize = [0] + + # The total length of the dataset + total_len = len(x["input_ids"]) + + # Lets prepare the basic first chunk + for i in range(packing_batchsize): + # Port the values to the return arrays + id_arr.append(x["input_ids"][i]) + type_arr.append(x["token_type_ids"][i]) + mask_arr.append(x["attention_mask"][i]) + sample_len_arr.append(x["sample_length"][i]) + + # Keep the chunk count in sync + batchset_chunksize[0] = math.max( math.ceil( x["sample_length"][i] / packing_chunksize ) * packing_chunksize, batchset_chunksize[0] ) + + # Given the datasample index, try to scan and merge into existing samples (if possible) + def merge_into_existing_samples(i): + # Get the sample length + sample_len = x["sample_length"][i] + + # Iterate and see if we can merge the sample + for j in range(len(batchset_chunksize)): + + # Get the current set chunk size + current_set_chunk_size = batchset_chunksize[j] + + # Iterate existing samples for the chunk + for k in range( j * packing_chunksize, math.min((j+1) * packing_chunksize, len(id_arr))): + # Get the existing record length + existing_record_len = len(id_arr[k]) + + # Check if the sample can be merged + if existing_record_len + 1 + sample_len < current_set_chunk_size: + # Merge the sample + id_arr[k] += endOfDoc_tokenSet["input_ids"][0] + x["input_ids"][i] + type_arr[k] += endOfDoc_tokenSet["token_type_ids"][0] + x["token_type_ids"][i] + mask_arr[k] += endOfDoc_tokenSet["attention_mask"][0] + x["attention_mask"][i] + + # We intentionally DO NOT update the sample length, as it used to accurately + # extract the loss involved for the first sample + # sample_len_arr[k] += sample_len + + # Return that a merge has been done + return True + + # Return that no merge has been done + return False + + # Lets iterate the rest of the dataset, and start packing + for i in range(packing_batchsize, total_len): + # Merge if possible + if merge_into_existing_samples(i): + continue + + # Ok merge failed, lets append and update the chunk size, of the affected batchset + id_arr.append(x["input_ids"][i]) + type_arr.append(x["token_type_ids"][i]) + 
mask_arr.append(x["attention_mask"][i]) + sample_len_arr.append(x["sample_length"][i]) + + # Update the chunk size + batchset_id = math.floor( i / packing_chunksize ) + batchset_chunksize[ batchset_id ] = math.max( math.ceil( x["sample_length"][i] / packing_chunksize ) * packing_chunksize, batchset_chunksize[ batchset_id ] ) + + # Prepare and return the output object + ret = { + 'input_ids': id_arr, + 'token_type_ids': type_arr, + 'attention_mask': mask_arr, + 'sample_length': sample_len_arr + } + return ret + + # Perform the dataset packing + if( kargs["packing_in_sequence"] ): + src_dataset['train'] = src_dataset['train'].map(pack_dataset_in_sequence, batched=True, + batch_size=processing_max_batch_size, + num_proc=num_cpus) + else: + raise NotImplementedError("Packing in random order is not implemented yet") + # If an int value is used, it is interprated as document count # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset if kargs["dataset_offset"] > 0 or kargs["dataset_length"] > 0: @@ -575,7 +696,10 @@ def __init__( # HF dataset conversion helpers # --- # Min / Max token size filtering - min_token_size: int = -1, + # + # default min token size of 1 is chosen, to filter out empty records + # which causes errors in the trainer + min_token_size: int = 1, max_token_size: int = -1, # Sort by length @@ -602,6 +726,40 @@ def __init__( multi_column_separator: str = None, # prompt/completion format masking support disable_prompt_completion_mask: bool = False, + + # ---------------------------- + # dataset packing support + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enabled: bool = False, + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. + packing_batchsize: int = 20160, + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. + packing_chunksize: int = 4096, + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length + packing_in_sequence: bool = False, + + # ---------------------------- + # System tweaks + # ---------------------------- + + # Batch size scanning range, used for deciding the max number of documents + # to process simultaneously at a time. This is used to prevent OOM errors + # while rearranging the dataset, etc. 
Used for both packing / sorting operations + # ( Defaults to all records ) + processing_max_batch_size: int = -1, + # Skip database setup checks if datapath exists, ignored if using preload_datapath.py skip_datapath_setup: bool = False ): From e1d27a2be31fb04744338cb9663383d3e23876a4 Mon Sep 17 00:00:00 2001 From: "@picocreator (Eugene Cheah)" Date: Sat, 16 Dec 2023 16:40:38 -0800 Subject: [PATCH 12/33] sample_length trimming --- RWKV-v5/src/data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index f8cf09de..6339296f 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -590,6 +590,9 @@ def merge_into_existing_samples(i): num_proc=num_cpus) else: raise NotImplementedError("Packing in random order is not implemented yet") + else: + # Remove the sample_length column, as it is no longer needed + src_dataset['train'] = src_dataset['train'].remove_columns(["sample_length"]) # If an int value is used, it is interprated as document count # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset From cfc8b206f302ef08f71b4159ab31a507cbbf6e11 Mon Sep 17 00:00:00 2001 From: "@picocreator (Eugene Cheah)" Date: Sun, 17 Dec 2023 10:30:49 -0800 Subject: [PATCH 13/33] enforce sample_length --- RWKV-v5/src/data.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 6339296f..3612d698 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -463,13 +463,11 @@ def dataset_filter(x): seed=42 #Fixed seed, to prevent train/test reshuffling between test runs ) - # Compute the sample length, as requried for the sort by length feature - if kargs["sort_by_length"] and not kargs["packing_enabled"]: - def add_length(example): - example["sample_length"] = len(example['input_ids']) - return example - - src_dataset['train'] = src_dataset['train'].map(add_length, batched=False, num_proc=num_cpus) + # Compute the sample length, as requried for the sort by length feature, and packing + def add_length(example): + example["sample_length"] = len(example['input_ids']) + return example + src_dataset['train'] = src_dataset['train'].map(add_length, batched=False, num_proc=num_cpus) # Does the actual sorting process (after test split!) 
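# --- Illustrative note (not part of the patch) -----------------------------
# Sanity check of the chunk-size rounding used by the packing code: the
# sample_length computed above is rounded up to the next multiple of
# packing_chunksize (and clamped to packing_min_ctx_len in the later patch).
import math

packing_chunksize = 4096
packing_min_ctx_len = 4096  # defaults to packing_chunksize when <= 0

def rounded_chunksize(sample_length: int) -> int:
    # round the (clamped) sample length up to a multiple of packing_chunksize
    return math.ceil(max(sample_length, packing_min_ctx_len) / packing_chunksize) * packing_chunksize

print(rounded_chunksize(1200))   # -> 4096
print(rounded_chunksize(5000))   # -> 8192
print(rounded_chunksize(12000))  # -> 12288
# ----------------------------------------------------------------------------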
# Skipped if dataset packing is enabled (as this woould be redundant) From ac1dab29d470089c17433e6b8268505145ff080c Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Mon, 18 Dec 2023 06:59:14 +0800 Subject: [PATCH 14/33] dataset packing --- RWKV-v5/src/data.py | 75 ++- .../config/enwiki_100k-world-4096.yaml | 265 +++++++++ .../config/enwiki_10k-world-4096.yaml | 2 +- .../config/test-dataset-repack-chunks.yaml | 265 +++++++++ .../config/test-dataset-repack.yaml | 295 ++++++++++ .../test-dataset-packing.ipynb | 535 ++++++++++++++++++ 6 files changed, 1409 insertions(+), 28 deletions(-) create mode 100644 notebook/trainer-v5-validation/config/enwiki_100k-world-4096.yaml create mode 100644 notebook/trainer-v5-validation/config/test-dataset-repack-chunks.yaml create mode 100644 notebook/trainer-v5-validation/config/test-dataset-repack.yaml create mode 100644 notebook/trainer-v5-validation/test-dataset-packing.ipynb diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 3612d698..af8da599 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -470,10 +470,13 @@ def add_length(example): src_dataset['train'] = src_dataset['train'].map(add_length, batched=False, num_proc=num_cpus) # Does the actual sorting process (after test split!) - # Skipped if dataset packing is enabled (as this woould be redundant) - if kargs["sort_by_length"] and not kargs["packing_enabled"]: - sort_asc = kargs["sort_asc"] - src_dataset['train'] = src_dataset['train'].sort("sample_length", reverse=not sort_asc) + if kargs["sort_by_length"]: + if kargs["packing_enable"] and not kargs["packing_in_sequence"]: + # Show warning if sort_by_length is enabled, with packing + print("Warning: sort_by_length=true, packing_enable=true, with packing_in_sequence=False - sort_by_length to be ignored") + else: + sort_asc = kargs["sort_asc"] + src_dataset['train'] = src_dataset['train'].sort("sample_length", reverse=not sort_asc) # Implement dataset packing, which merges the dataset row records, into "fixed sizes" # this is done by merging multiple dataset samples, with a 0 token in between @@ -486,7 +489,7 @@ def add_length(example): # This however will mess up the "real_ctx_len" value, as it will be the length of the # of the merged dataset samples, instead of the original dataset sample. 
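# --- Illustrative note (not part of the patch) -----------------------------
# Minimal sketch of the packing idea described above: greedily merge shorter
# samples (with a 0 separator token) into already-packed samples whose
# chunk-aligned budget still has room. This only illustrates the approach;
# the trainer's pack_dataset_in_sequence also carries token_type_ids,
# attention_mask and per-batch-set chunk sizes.
import math

def greedy_pack(samples, chunksize=8, sep_token=0):
    packs = []    # packed token lists
    budgets = []  # chunk-aligned budget of each pack
    for sample in samples:
        placed = False
        for i, pack in enumerate(packs):
            # merge if the sample (plus one separator token) still fits
            if len(pack) + 1 + len(sample) <= budgets[i]:
                pack.append(sep_token)
                pack.extend(sample)
                placed = True
                break
        if not placed:
            packs.append(list(sample))
            budgets.append(math.ceil(len(sample) / chunksize) * chunksize)
    return packs

print(greedy_pack([[1] * 6, [2] * 3, [3], [4] * 7]))
# -> [[1, 1, 1, 1, 1, 1, 0, 3], [2, 2, 2], [4, 4, 4, 4, 4, 4, 4]]
# ----------------------------------------------------------------------------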
# --- - if kargs["packing_enabled"]: + if kargs["packing_enable"]: # def add_length(example): # example["sample_length"] = len(example['input_ids']) @@ -494,8 +497,12 @@ def add_length(example): # src_dataset['train'] = src_dataset['train'].map(add_length, batched=False, num_proc=num_cpus) # The pack size - packing_batchsize = kargs["packing_batchsize"] - packing_chunksize = kargs["packing_chunksize"] + packing_batchsize = int(kargs["packing_batchsize"]) + packing_chunksize = int(kargs["packing_chunksize"]) + packing_min_ctx_len = int(kargs["packing_min_ctx_len"]) + + if packing_min_ctx_len <= 0: + packing_min_ctx_len = packing_chunksize # The pack function def pack_dataset_in_sequence(x): @@ -512,16 +519,22 @@ def pack_dataset_in_sequence(x): # The total length of the dataset total_len = len(x["input_ids"]) + # Preload size (we can use either packing_batchsize, or just 1) + preload_size = 1 # packing_batchsize + # Lets prepare the basic first chunk - for i in range(packing_batchsize): + for i in range(preload_size): # Port the values to the return arrays id_arr.append(x["input_ids"][i]) type_arr.append(x["token_type_ids"][i]) mask_arr.append(x["attention_mask"][i]) - sample_len_arr.append(x["sample_length"][i]) + sample_len_arr.append([x["sample_length"][i]]) # Keep the chunk count in sync - batchset_chunksize[0] = math.max( math.ceil( x["sample_length"][i] / packing_chunksize ) * packing_chunksize, batchset_chunksize[0] ) + batchset_chunksize[0] = max( + math.ceil( max(x["sample_length"][i],packing_min_ctx_len) / packing_chunksize ) * packing_chunksize, + batchset_chunksize[0] + ) # Given the datasample index, try to scan and merge into existing samples (if possible) def merge_into_existing_samples(i): @@ -535,7 +548,7 @@ def merge_into_existing_samples(i): current_set_chunk_size = batchset_chunksize[j] # Iterate existing samples for the chunk - for k in range( j * packing_chunksize, math.min((j+1) * packing_chunksize, len(id_arr))): + for k in range( j * packing_chunksize, min((j+1) * packing_chunksize, len(id_arr))): # Get the existing record length existing_record_len = len(id_arr[k]) @@ -545,11 +558,8 @@ def merge_into_existing_samples(i): id_arr[k] += endOfDoc_tokenSet["input_ids"][0] + x["input_ids"][i] type_arr[k] += endOfDoc_tokenSet["token_type_ids"][0] + x["token_type_ids"][i] mask_arr[k] += endOfDoc_tokenSet["attention_mask"][0] + x["attention_mask"][i] + sample_len_arr[k].append(sample_len) - # We intentionally DO NOT update the sample length, as it used to accurately - # extract the loss involved for the first sample - # sample_len_arr[k] += sample_len - # Return that a merge has been done return True @@ -557,7 +567,7 @@ def merge_into_existing_samples(i): return False # Lets iterate the rest of the dataset, and start packing - for i in range(packing_batchsize, total_len): + for i in range(preload_size, total_len): # Merge if possible if merge_into_existing_samples(i): continue @@ -566,11 +576,16 @@ def merge_into_existing_samples(i): id_arr.append(x["input_ids"][i]) type_arr.append(x["token_type_ids"][i]) mask_arr.append(x["attention_mask"][i]) - sample_len_arr.append(x["sample_length"][i]) + sample_len_arr.append([x["sample_length"][i]]) # Update the chunk size - batchset_id = math.floor( i / packing_chunksize ) - batchset_chunksize[ batchset_id ] = math.max( math.ceil( x["sample_length"][i] / packing_chunksize ) * packing_chunksize, batchset_chunksize[ batchset_id ] ) + batchset_id = math.floor( len(id_arr) / packing_chunksize ) + updated_chunksize = max( math.ceil( 
x["sample_length"][i] / packing_chunksize ) * packing_chunksize, packing_min_ctx_len ) + + if batchset_id >= len(batchset_chunksize): + batchset_chunksize.append(updated_chunksize) + else: + batchset_chunksize[batchset_id] = max(updated_chunksize, batchset_chunksize[batchset_id]) # Prepare and return the output object ret = { @@ -581,13 +596,14 @@ def merge_into_existing_samples(i): } return ret + # Shuffle the dataset if needed + if not kargs["packing_in_sequence"]: + src_dataset['train'] = src_dataset['train'].shuffle(seed=101) + # Perform the dataset packing - if( kargs["packing_in_sequence"] ): - src_dataset['train'] = src_dataset['train'].map(pack_dataset_in_sequence, batched=True, - batch_size=processing_max_batch_size, - num_proc=num_cpus) - else: - raise NotImplementedError("Packing in random order is not implemented yet") + src_dataset['train'] = src_dataset['train'].map(pack_dataset_in_sequence, batched=True, + batch_size=processing_max_batch_size, + num_proc=num_cpus) else: # Remove the sample_length column, as it is no longer needed src_dataset['train'] = src_dataset['train'].remove_columns(["sample_length"]) @@ -733,7 +749,7 @@ def __init__( # ---------------------------- # Boolean flag to enable / disable dataset packing - packing_enabled: bool = False, + packing_enable: bool = False, # Used to ensure all training samples wihin this batch size is the same length # Ideally this should align exactly with your real "batch size" @@ -747,8 +763,13 @@ def __init__( # the training context length used. packing_chunksize: int = 4096, + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: int = -1, + # Pack the data sequentially if possible, in accordance to the dataset sequence - # this can be used together with sort_by_length + # this can be used together with sort_by_length, otherwise a shuffle will be done + # prior to packing packing_in_sequence: bool = False, # ---------------------------- diff --git a/notebook/trainer-v5-validation/config/enwiki_100k-world-4096.yaml b/notebook/trainer-v5-validation/config/enwiki_100k-world-4096.yaml new file mode 100644 index 00000000..50a3b0f1 --- /dev/null +++ b/notebook/trainer-v5-validation/config/enwiki_100k-world-4096.yaml @@ -0,0 +1,265 @@ +# lightning.pytorch==2.0.2 +seed_everything: 3941088705 +trainer: + + # + # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload` + # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful + # for training LoRA on large models on a single GPU. 
+ # + # In general you would want to use the following: + # + # - deepspeed_stage_1 : Each of your GPU has too much vram, and you do not know what to do + # + # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpu each with sufficient vram + # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu + # + # - deepspeed_stage_3 : Split up the model across multiple gpu, useful for large models, at a performance cost + # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost + # + # For more details see: + # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2 + # + strategy: deepspeed_stage_2_offload + + # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'infctx-v5-unit-test-baseline (train-ctx=4096, data-ctx=4096)' + project: 'RWKV-infctx-unit-test' + tags: ['RWKV', 'infctx'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. + # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: ../checkpoint/trainer-validaiton/infctx-v5-enwiki-100k-4096 + filename: null + + # Save the top/last K checkpoints + save_top_k: 3 + # Choose by the most recent checkpoints (step based) + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: false + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. 
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 100 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other settings, you can probably leave alone + verbose: false + auto_insert_metric_name: true + + ######################################## + ## Training run parameter settings + ######################################## + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Number of datasamples to train for each step, a data sample is considered + # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step" + # + # This decides the number of datasample, to learn together from, before backproping + # any weight changes at the end of the batch. + # + # Recommended to be a big enough number (like 128/256) where it prevents the training + # loss from flucuating in the process. But not too big of a number where the increased + # GPU vRAM / offloaded RAM usage will cause the training to crash. + # + # You are also recommended to configure this to a large enough number to fully utilize + # your GPU processing time %, and avoid idle time for the GPU between batches + # + # This number is divided by the number of GPUs, and nodes configured + # So if you have 4 GPUs, and 2 nodes, and this is configured as 128 + # Each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches + target_batch_size: 16 + +######################################## +## Training model settings +######################################## +model: + # Model to start the finetune/training process from + load_model: ../model/L24-D2048-world-v5base-init.pth + + # Context length to use for the training process + # the larger the number (and batch size) the larger the vram usage + # + # Note that if the datasample context length is larger then the ctx_len + # its training process would be split into ctx_len sized chunks. + # + # This allows the training of extreamly large context length (eg. 100k), + # without eating up too much vram by keeping the training context length + # to a resonable number sutible to the current GPU setup + ctx_len: 4096 + + # Data samples would be cut down to the respective max ctx_len_cutoffs + # values if its larger then ctx_len. If the data sample is larger then + # the largest len_cutoff, the remaining data will be discarded + ctx_len_cutoffs: [] + # Experimental settings, number of tokens to skip in the data sample + # prefix, for the respective cutoff length. 
Used to speed up the process + ctx_len_warmup_steps: [] + + # Learning rate of the training process + # --- + + # Initia learning rate of the process + lr_init: 8e-4 + # Final learning rate after the learning rate period + # learning rate will stay at final value from then onwards + lr_final: 4e-4 + + # Number of epoch to reduce the learning rate from lr_init to lr_final + # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards) + # 0 means lr_final will apply immediately + # -1 means we take the current max_step / max_epoch as the period + lr_period: 1 + # lr_period type if its set, defaults to epoch + lr_period_type: epoch + + # Adam optimizer settings + # You probably want to leave this alone, unless you know what you are doing + beta1: 0.9 + beta2: 0.99 + adam_eps: 1.0e-08 + weight_decay: 0.01 + + # torch.set_float32_matmul_precision, used to optimize operations with tensor cores + # this should be set as null, for non cuda core GPUs + torch_set_float32_matmul_precision: 'high' + # torch_set_float32_matmul_precision: null + + # Segmented based learning, used to work around training of large context length + # beyond what can be supported by the current GPU vram architecture + # + # This is not 1:1 equivalent to the same training process with required vram + # as the training process is split into multiple segments, part by part. + # with limited learnings from the previous segment. + bptt_learning: true + + # Segmented range to performing backprop learning on + # 1 means to apply only for the last segment + # -1 means to apply for all segments + bptt_learning_range: -1 + +data: + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. + data_path: ../datapath/enwiki_100k-world-4096/ + + # Other wise provide the source path, which is used as huggingface dataset path + # this will be used to populate the dataset_path + # + # Use either the following + # - hugging face dataset + # - Directory path to a directory containing dataset files + # - Path to a single dataset file + # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then) + # - null + # + # If source is disabled, all other params, except data_path, is ignored + source: "teven/enwiki_100k" + # source: text + # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt + + # Use data_dir, if you are using source=text/json/etc + # this should be relative to the trainer script path + source_data_dir: null + + # After loading the dataset, split out test data used for unit-test, + # This process is skipped if the dataset includes a test split + # This process is skipped if set to zero + test_split: 0.01 + test_split_shuffle: false + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the tokenizer file path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. 
removal of small articles of less then 512 tokens from wikipedia) + # + # This is ignored, if set to -1 + min_token_size: 1024 + max_token_size: -1 + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 4096 + # + # This is ignored, if source is not set as text + # This is ignored, if set to zero + # --- + text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: true + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + # multi_column_keys: ['instruction', 'input', 'output'] + # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + +# Path to the current checkpoint to continue training from +# Enable this to the last checkpoint after the first run +# (if it crash and you want to resume) +# ckpt_path: ../checkpoint/trainer-validaiton/infctx-unit-test-baseline/epoch=0-step=20.ckpt +ckpt_path: null diff --git a/notebook/trainer-v5-validation/config/enwiki_10k-world-4096.yaml b/notebook/trainer-v5-validation/config/enwiki_10k-world-4096.yaml index f5192812..d96ce79e 100644 --- a/notebook/trainer-v5-validation/config/enwiki_10k-world-4096.yaml +++ b/notebook/trainer-v5-validation/config/enwiki_10k-world-4096.yaml @@ -111,7 +111,7 @@ trainer: ######################################## model: # Model to start the finetune/training process from - load_model: ../model/L24-D2048-neox-v5base-init.pth + load_model: ../model/L24-D2048-world-v5base-init.pth # Context length to use for the training process # the larger the number (and batch size) the larger the vram usage diff --git a/notebook/trainer-v5-validation/config/test-dataset-repack-chunks.yaml b/notebook/trainer-v5-validation/config/test-dataset-repack-chunks.yaml new file mode 100644 index 00000000..655d771f --- /dev/null +++ b/notebook/trainer-v5-validation/config/test-dataset-repack-chunks.yaml @@ -0,0 +1,265 @@ +# lightning.pytorch==2.0.2 +seed_everything: 3941088705 +trainer: + + # + # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload` + # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful + # for training LoRA on large models on a single GPU. 
+ # + # In general you would want to use the following: + # + # - deepspeed_stage_1 : Each of your GPU has too much vram, and you do not know what to do + # + # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpu each with sufficient vram + # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu + # + # - deepspeed_stage_3 : Split up the model across multiple gpu, useful for large models, at a performance cost + # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost + # + # For more details see: + # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2 + # + strategy: deepspeed_stage_2_offload + + # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'infctx-v5-unit-test-baseline (train-ctx=4096, data-ctx=4096)' + project: 'RWKV-infctx-unit-test' + tags: ['RWKV', 'infctx'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. + # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: ../checkpoint/trainer-validaiton/infctx-v5-enwiki-10k-4096 + filename: null + + # Save the top/last K checkpoints + save_top_k: 3 + # Choose by the most recent checkpoints (step based) + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: false + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. 
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 100 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other settings, you can probably leave alone + verbose: false + auto_insert_metric_name: true + + ######################################## + ## Training run parameter settings + ######################################## + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Number of datasamples to train for each step, a data sample is considered + # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step" + # + # This decides the number of datasample, to learn together from, before backproping + # any weight changes at the end of the batch. + # + # Recommended to be a big enough number (like 128/256) where it prevents the training + # loss from flucuating in the process. But not too big of a number where the increased + # GPU vRAM / offloaded RAM usage will cause the training to crash. + # + # You are also recommended to configure this to a large enough number to fully utilize + # your GPU processing time %, and avoid idle time for the GPU between batches + # + # This number is divided by the number of GPUs, and nodes configured + # So if you have 4 GPUs, and 2 nodes, and this is configured as 128 + # Each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches + target_batch_size: 8 + +######################################## +## Training model settings +######################################## +model: + # Model to start the finetune/training process from + load_model: ../model/L24-D2048-world-v5base-init.pth + + # Context length to use for the training process + # the larger the number (and batch size) the larger the vram usage + # + # Note that if the datasample context length is larger then the ctx_len + # its training process would be split into ctx_len sized chunks. + # + # This allows the training of extreamly large context length (eg. 100k), + # without eating up too much vram by keeping the training context length + # to a resonable number sutible to the current GPU setup + ctx_len: 4096 + + # Data samples would be cut down to the respective max ctx_len_cutoffs + # values if its larger then ctx_len. If the data sample is larger then + # the largest len_cutoff, the remaining data will be discarded + ctx_len_cutoffs: [] + # Experimental settings, number of tokens to skip in the data sample + # prefix, for the respective cutoff length. 
Used to speed up the process + ctx_len_warmup_steps: [] + + # Learning rate of the training process + # --- + + # Initia learning rate of the process + lr_init: 8e-4 + # Final learning rate after the learning rate period + # learning rate will stay at final value from then onwards + lr_final: 4e-4 + + # Number of epoch to reduce the learning rate from lr_init to lr_final + # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards) + # 0 means lr_final will apply immediately + # -1 means we take the current max_step / max_epoch as the period + lr_period: 1 + # lr_period type if its set, defaults to epoch + lr_period_type: epoch + + # Adam optimizer settings + # You probably want to leave this alone, unless you know what you are doing + beta1: 0.9 + beta2: 0.99 + adam_eps: 1.0e-08 + weight_decay: 0.01 + + # torch.set_float32_matmul_precision, used to optimize operations with tensor cores + # this should be set as null, for non cuda core GPUs + torch_set_float32_matmul_precision: 'high' + # torch_set_float32_matmul_precision: null + + # Segmented based learning, used to work around training of large context length + # beyond what can be supported by the current GPU vram architecture + # + # This is not 1:1 equivalent to the same training process with required vram + # as the training process is split into multiple segments, part by part. + # with limited learnings from the previous segment. + bptt_learning: true + + # Segmented range to performing backprop learning on + # 1 means to apply only for the last segment + # -1 means to apply for all segments + bptt_learning_range: -1 + +data: + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. + data_path: ../datapath/enwiki_10k-world-4096-repacked-chunks/ + + # Other wise provide the source path, which is used as huggingface dataset path + # this will be used to populate the dataset_path + # + # Use either the following + # - hugging face dataset + # - Directory path to a directory containing dataset files + # - Path to a single dataset file + # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then) + # - null + # + # If source is disabled, all other params, except data_path, is ignored + source: "teven/enwiki_10k" + # source: text + # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt + + # Use data_dir, if you are using source=text/json/etc + # this should be relative to the trainer script path + source_data_dir: null + + # After loading the dataset, split out test data used for unit-test, + # This process is skipped if the dataset includes a test split + # This process is skipped if set to zero + test_split: 0.01 + test_split_shuffle: false + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the tokenizer file path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. 
removal of small articles of less then 512 tokens from wikipedia) + # + # This is ignored, if set to -1 + min_token_size: 128 + max_token_size: -1 + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 4096 + # + # This is ignored, if source is not set as text + # This is ignored, if set to zero + # --- + text_rechunk_size: 16384 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: true + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + # multi_column_keys: ['instruction', 'input', 'output'] + # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + +# Path to the current checkpoint to continue training from +# Enable this to the last checkpoint after the first run +# (if it crash and you want to resume) +# ckpt_path: ../checkpoint/trainer-validaiton/infctx-unit-test-baseline/epoch=0-step=20.ckpt +ckpt_path: null diff --git a/notebook/trainer-v5-validation/config/test-dataset-repack.yaml b/notebook/trainer-v5-validation/config/test-dataset-repack.yaml new file mode 100644 index 00000000..e1ae0ab8 --- /dev/null +++ b/notebook/trainer-v5-validation/config/test-dataset-repack.yaml @@ -0,0 +1,295 @@ +# lightning.pytorch==2.0.2 +seed_everything: 3941088705 +trainer: + + # + # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload` + # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful + # for training LoRA on large models on a single GPU. 
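For the `text_rechunk_size` / `text_rechunk_force` options used by these test configs, the effect is to concatenate the tokenized samples and re-cut them into fixed-size chunks; a minimal, hedged sketch of that idea (not the exact `rechunk_text` implementation in data.py):

    # Hedged sketch of text rechunking: flatten the token stream, then re-cut it
    # into chunks of text_rechunk_size tokens.
    def rechunk(token_lists, rechunk_size=16384):
        stream = [tok for sample in token_lists for tok in sample]
        return [stream[i:i + rechunk_size] for i in range(0, len(stream), rechunk_size)]

    chunks = rechunk([[1] * 9000, [2] * 9000, [3] * 7000], rechunk_size=16384)
    print([len(c) for c in chunks])  # [16384, 8616]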
+ # + # In general you would want to use the following: + # + # - deepspeed_stage_1 : Each of your GPU has too much vram, and you do not know what to do + # + # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpu each with sufficient vram + # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu + # + # - deepspeed_stage_3 : Split up the model across multiple gpu, useful for large models, at a performance cost + # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost + # + # For more details see: + # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2 + # + strategy: deepspeed_stage_2_offload + + # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'infctx-v5-dataset-packing (train-ctx=4096, data-ctx=4096)' + project: 'RWKV-infctx-unit-test' + tags: ['RWKV', 'infctx'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. + # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: ../checkpoint/trainer-validaiton/infctx-v5-dataset-packing + filename: null + + # Save the top/last K checkpoints + save_top_k: 3 + # Choose by the most recent checkpoints (step based) + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: false + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. 
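The `ctx_len` and `bptt_learning` options in the model section of these configs handle over-long samples by processing them as consecutive `ctx_len`-sized segments, with limited state carried from one segment to the next; a hedged sketch of just the splitting step (illustrative, not the trainer's code):

    def split_into_segments(input_ids, ctx_len=4096):
        # An over-long tokenized sample becomes a list of ctx_len-sized segments.
        return [input_ids[i:i + ctx_len] for i in range(0, len(input_ids), ctx_len)]

    segments = split_into_segments(list(range(10000)), ctx_len=4096)
    print([len(s) for s in segments])  # [4096, 4096, 1808]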
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 100 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other settings, you can probably leave alone + verbose: false + auto_insert_metric_name: true + + ######################################## + ## Training run parameter settings + ######################################## + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Number of datasamples to train for each step, a data sample is considered + # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step" + # + # This decides the number of datasample, to learn together from, before backproping + # any weight changes at the end of the batch. + # + # Recommended to be a big enough number (like 128/256) where it prevents the training + # loss from flucuating in the process. But not too big of a number where the increased + # GPU vRAM / offloaded RAM usage will cause the training to crash. + # + # You are also recommended to configure this to a large enough number to fully utilize + # your GPU processing time %, and avoid idle time for the GPU between batches + # + # This number is divided by the number of GPUs, and nodes configured + # So if you have 4 GPUs, and 2 nodes, and this is configured as 128 + # Each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches + target_batch_size: 16 + +######################################## +## Training model settings +######################################## +model: + # Model to start the finetune/training process from + load_model: ../model/L6-D2048-world-v5base-init.pth + + # Context length to use for the training process + # the larger the number (and batch size) the larger the vram usage + # + # Note that if the datasample context length is larger then the ctx_len + # its training process would be split into ctx_len sized chunks. + # + # This allows the training of extreamly large context length (eg. 10k), + # without eating up too much vram by keeping the training context length + # to a resonable number sutible to the current GPU setup + ctx_len: 4096 + + # Data samples would be cut down to the respective max ctx_len_cutoffs + # values if its larger then ctx_len. If the data sample is larger then + # the largest len_cutoff, the remaining data will be discarded + ctx_len_cutoffs: [] + # Experimental settings, number of tokens to skip in the data sample + # prefix, for the respective cutoff length. 
Used to speed up the process + ctx_len_warmup_steps: [] + + # Learning rate of the training process + # --- + + # Initia learning rate of the process + lr_init: 8e-4 + # Final learning rate after the learning rate period + # learning rate will stay at final value from then onwards + lr_final: 4e-4 + + # Number of epoch to reduce the learning rate from lr_init to lr_final + # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards) + # 0 means lr_final will apply immediately + # -1 means we take the current max_step / max_epoch as the period + lr_period: 1 + # lr_period type if its set, defaults to epoch + lr_period_type: epoch + + # Adam optimizer settings + # You probably want to leave this alone, unless you know what you are doing + beta1: 0.9 + beta2: 0.99 + adam_eps: 1.0e-08 + weight_decay: 0.01 + + # torch.set_float32_matmul_precision, used to optimize operations with tensor cores + # this should be set as null, for non cuda core GPUs + torch_set_float32_matmul_precision: 'high' + # torch_set_float32_matmul_precision: null + + # Segmented based learning, used to work around training of large context length + # beyond what can be supported by the current GPU vram architecture + # + # This is not 1:1 equivalent to the same training process with required vram + # as the training process is split into multiple segments, part by part. + # with limited learnings from the previous segment. + bptt_learning: true + + # Segmented range to performing backprop learning on + # 1 means to apply only for the last segment + # -1 means to apply for all segments + bptt_learning_range: -1 + +data: + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. + data_path: ../datapath/enwiki_10k-world-4096-repacked/ + + # Other wise provide the source path, which is used as huggingface dataset path + # this will be used to populate the dataset_path + # + # Use either the following + # - hugging face dataset + # - Directory path to a directory containing dataset files + # - Path to a single dataset file + # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then) + # - null + # + # If source is disabled, all other params, except data_path, is ignored + source: "teven/enwiki_10k" + # source: text + # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt + + # Use data_dir, if you are using source=text/json/etc + # this should be relative to the trainer script path + source_data_dir: null + + # After loading the dataset, split out test data used for unit-test, + # This process is skipped if the dataset includes a test split + # This process is skipped if set to zero + test_split: 0.01 + test_split_shuffle: false + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the tokenizer file path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. 
removal of small articles of less then 512 tokens from wikipedia) + # + # This is ignored, if set to -1 + min_token_size: 128 + max_token_size: -1 + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 4096 + # + # This is ignored, if source is not set as text + # This is ignored, if set to zero + # --- + # text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: false + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + # multi_column_keys: ['instruction', 'input', 'output'] + # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. + packing_batchsize: 8 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. 
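  # For example (illustrative): this config uses ctx_len: 4096 in the model section,
  # so packing_chunksize stays at 4096 and packing_min_ctx_len at 16384 (4 x 4096),
  # keeping every packed sample aligned to whole training segments.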
+ packing_chunksize: 4096 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: 16384 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length + packing_in_sequence: False + + +# Path to the current checkpoint to continue training from +# Enable this to the last checkpoint after the first run +# (if it crash and you want to resume) +# ckpt_path: ../checkpoint/trainer-validaiton/infctx-unit-test-baseline/epoch=0-step=20.ckpt +ckpt_path: null diff --git a/notebook/trainer-v5-validation/test-dataset-packing.ipynb b/notebook/trainer-v5-validation/test-dataset-packing.ipynb new file mode 100644 index 00000000..d2c194a8 --- /dev/null +++ b/notebook/trainer-v5-validation/test-dataset-packing.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset repacking implementation\n", + "\n", + "Advance dataset operations, of sorting, offset, and length support" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /home/picocreator/rwkv-proj/infctx-dev/notebook/trainer-v5-validation\n", + "TRAINER_DIR: /home/picocreator/rwkv-proj/infctx-dev/RWKV-v5\n", + "PROJECT_DIR: /home/picocreator/rwkv-proj/infctx-dev\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "WANDB_PREFIX=\"infctx-v5-dataset\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-12-18 04:54:01,470] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "---- Initializing model ----\n", + "No of layers: 6\n", + "Embedding size: 2048\n", + "Output model path: ../model/L6-D2048-world-v5base-init.pth\n", + "Vocab size: 65536\n", + "Emb scale: 0.0001\n", + "Note: this process takes a significant time (and ram) for large models\n", + "---- ----- ----\n", + "Model exists, skipping init_model\n" + ] + } + ], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer 6 --n_embd 2048 \\\n", + " --vocab_size world --skip-if-exists \\\n", + " \"../model/L6-D2048-world-v5base-init.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Without dataset packing" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Map (num_proc=16): 100%|█████████| 10000/10000 
[00:01<00:00, 9777.04 examples/s]\n", + "Filter (num_proc=16): 100%|█████| 10000/10000 [00:00<00:00, 12552.92 examples/s]\n", + "Map (num_proc=16): 100%|███████████| 6474/6474 [00:00<00:00, 6823.61 examples/s]\n", + "Map (num_proc=16): 100%|██████████████| 310/310 [00:00<00:00, 798.15 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 310/310 [00:00<00:00, 6597.03 examples/\n", + "Saving the dataset (1/1 shards): 100%|███| 4/4 [00:00<00:00, 1833.98 examples/s]\n" + ] + } + ], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config/test-dataset-repack-chunks.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-12-18 06:44:14,494] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/infctx-dev/notebook/trainer-v5-validation/config/test-dataset-repack-chunks.yaml', '--model.load_model=../model/L6-D2048-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-chunked/', '--trainer.logger.init_args.name=infctx-v5-dataset-packing - Chunking 4096 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=2', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/infctx-dev/notebook/trainer-v5-validation/config/test-dataset-repack-chunks.yaml', '--model.load_model=../model/L6-D2048-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-chunked/', '--trainer.logger.init_args.name=infctx-v5-dataset-packing - Chunking 4096 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=2', '--trainer.devices=auto'].\n", + "Seed set to 3941088705\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 8\n", + " - num_nodes: 1\n", + " - num_devices: 1\n", + " - microbatch_size: 2\n", + " - accumulate_grad_batches: 4\n", + " - effective_batch_size: 8\n", + "\n", + "Map (num_proc=16): 100%|█████████| 10000/10000 [00:01<00:00, 8824.09 examples/s]\n", + "Filter (num_proc=16): 100%|█████| 10000/10000 [00:00<00:00, 11669.38 examples/s]\n", + "Map (num_proc=16): 100%|███████████| 6474/6474 [00:01<00:00, 6306.40 examples/s]\n", + "Map (num_proc=16): 100%|██████████████| 310/310 [00:00<00:00, 655.81 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 310/310 [00:00<00:00, 6126.50 examples/\n", + "Saving the dataset (1/1 shards): 100%|███| 4/4 [00:00<00:00, 1839.81 examples/s]\n", + "[rank: 0] Seed set to 3941088705\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.1 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231218_064433-wzs0je77\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33minfctx-v5-dataset-packing - Chunking 4096 - (deepspeed_stage_1)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/wzs0je77\u001b[0m\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05170488357543945 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 134 M \n", + "1 | blocks | ModuleList | 327 M \n", + "2 | ln_out | LayerNorm | 4.1 K \n", + "3 | head | Linear | 134 M \n", + "--------------------------------------\n", + "595 M Trainable params\n", + "0 Non-trainable params\n", + "595 M Total params\n", + "2,383.086 Total estimated model params size (MB)\n", + "Epoch 0: 100%|██| 155/155 [03:27<00:00, 0.75it/s, v_num=je77, train/loss=6.660]\n", + "Validation: | | 0/? [00:00=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05209159851074219 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 134 M \n", + "1 | blocks | ModuleList | 327 M \n", + "2 | ln_out | LayerNorm | 4.1 K \n", + "3 | head | Linear | 134 M \n", + "--------------------------------------\n", + "595 M Trainable params\n", + "0 Non-trainable params\n", + "595 M Total params\n", + "2,383.086 Total estimated model params size (MB)\n", + "Epoch 0: 100%|██| 162/162 [03:40<00:00, 0.74it/s, v_num=sg1z, train/loss=7.340]\n", + "Validation: | | 0/? [00:00 Date: Mon, 18 Dec 2023 07:07:21 +0800 Subject: [PATCH 15/33] Enable packing by default - for speed boost! 
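In spirit, dataset packing concatenates short samples so each training sample approaches the configured chunk size instead of being padded out; the sketch below is a hedged, greedy illustration of that idea, not the exact algorithm in data.py.

    # Hedged illustration of dataset packing: short tokenized samples are merged
    # so each packed sample stays within chunk_size, reducing padding waste.
    from typing import List

    def pack_samples(samples: List[List[int]], chunk_size: int = 4096) -> List[List[int]]:
        packed, current = [], []
        for sample in samples:
            if current and len(current) + len(sample) > chunk_size:
                packed.append(current)
                current = []
            current = current + sample
        if current:
            packed.append(current)
        return packed

    # Four short documents become two packed samples at chunk_size=4096.
    docs = [[1] * 900, [2] * 2500, [3] * 1200, [4] * 2000]
    print([len(p) for p in pack_samples(docs)])  # [3400, 3200]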
--- RWKV-v5/config-example.yaml | 69 ++++++++++++++++++++++++++++--------- RWKV-v5/src/data.py | 13 +++++-- 2 files changed, 64 insertions(+), 18 deletions(-) diff --git a/RWKV-v5/config-example.yaml b/RWKV-v5/config-example.yaml index 3438ac9f..2741b884 100644 --- a/RWKV-v5/config-example.yaml +++ b/RWKV-v5/config-example.yaml @@ -382,7 +382,7 @@ data: # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer # If using a custom tokenizer, provide the HF tokenizer name/path # --- - tokenizer: neox + tokenizer: world # Minimum / Maximum token size of the dataset to use # useful for filtering out small noisy data samples from large datasets @@ -393,21 +393,6 @@ data: # min_token_size: 1024 # max_token_size: -1 - # Rechunking of text dataset, this is done only when source is set as 'text' - # and will merge the various sentencees, into larger chunks up to the target size - # - # Defaults to 2048 - # - # This is ignored, if source is not set as text (unless text_rechunk_force) - # This is ignored, if set to zero - # --- - # text_rechunk_size: 2048 - - # Apply text rechunk to the dataset, even if its not a 'text' source - # This is done only after dataset filtering, and if source is not 'text' - # --- - # text_rechunk_force: True - # Custom text column to use, useful for dataset with alternative training columns labels # This is checked before multi column merging, default is null (disabled) # eg: 'code' @@ -439,6 +424,58 @@ data: # --- # disable_prompt_completion_mask: false + # ---------------------------- + # Rechunking support + # ---------------------------- + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 2048 + # + # This is ignored, if source is not set as text (unless text_rechunk_force) + # This is ignored, if set to zero / -1 + # --- + text_rechunk_size: 2048 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: False + + # Used to disable the automated text rechunkin for text files, if set as false + # --- + text_rechunk_auto: True + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. + packing_batchsize: 20160 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. 
+ packing_chunksize: 4096 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: -1 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length, otherwise a shuffle will be done + packing_in_sequence: False + # Path to the current checkpoint to continue training from # this should be the directory path, and ends with `.ckpt/` ckpt_path: null diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index af8da599..028a8448 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -437,9 +437,13 @@ def dataset_filter(x): return False return True src_dataset = src_dataset.filter(dataset_filter, num_proc=num_cpus) + + # Rechunking happened + rechunking_happened = False # Perform rechunking if needed for "text" based datasets if kargs["source"] == "text" and kargs["text_rechunk_size"] > 0 and kargs["text_rechunk_auto"]: + rechunking_happened = True src_dataset = src_dataset.map(rechunk_text, batched=True, batch_size=processing_max_batch_size, num_proc=num_cpus) @@ -447,6 +451,7 @@ def dataset_filter(x): # Perform rechunking after filtering, if source is not a "text" based # dataset and text_rechunk_force is enabled if kargs["source"] != "text" and kargs["text_rechunk_size"] > 0 and kargs["text_rechunk_force"]: + rechunking_happened = True src_dataset = src_dataset.map(rechunk_text, batched=True, batch_size=processing_max_batch_size, num_proc=num_cpus) @@ -489,7 +494,12 @@ def add_length(example): # This however will mess up the "real_ctx_len" value, as it will be the length of the # of the merged dataset samples, instead of the original dataset sample. # --- - if kargs["packing_enable"]: + + if kargs["packing_enable"] and rechunking_happened: + # Show warning if packing_enable is enabled, with rechunking + print("Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false") + + if kargs["packing_enable"] and not rechunking_happened: # def add_length(example): # example["sample_length"] = len(example['input_ids']) @@ -769,7 +779,6 @@ def __init__( # Pack the data sequentially if possible, in accordance to the dataset sequence # this can be used together with sort_by_length, otherwise a shuffle will be done - # prior to packing packing_in_sequence: bool = False, # ---------------------------- From 492c41fb81ac50bdd25bed9dad89f54457dcfc19 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Tue, 19 Dec 2023 03:55:13 +0800 Subject: [PATCH 16/33] minipile validation - binidx support tweaks --- RWKV-v5/src/data.py | 757 +++++++++--------- .../config/minipile-world-512.yaml | 292 +++++++ .../minipile-validation.ipynb | 421 ++++++++++ 3 files changed, 1098 insertions(+), 372 deletions(-) create mode 100644 notebook/trainer-v5-validation/config/minipile-world-512.yaml create mode 100644 notebook/trainer-v5-validation/minipile-validation.ipynb diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 6b426a78..368e918a 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -44,6 +44,8 @@ def prepare_data_static(**kargs): if kargs["tokenizer"] is None: raise ValueError('Tokenizer must be specified if source is specified') + # ===================================================== + # Special handling for binidx #-------------------------------- @@ -68,271 +70,237 @@ def gen(): } # Load the huggingface dataset from the generator - src_dataset = 
Dataset.from_generator(gen) + raw_src_dataset = Dataset.from_generator(gen) + # Previous short cut save for binidx, disabled to support chunking/packing + # ---------------------- # Train/test split test_split = kargs["test_split"] # The minimum test size is 1, if not we will get errors in the trainer? if test_split <= 0 or test_split <= 0.0: test_split = 1 - split_dataset = src_dataset.train_test_split( + + # Force a split, to normlize the dataset format + src_dataset = raw_src_dataset.train_test_split( test_size=test_split,shuffle=kargs["test_split_shuffle"], seed=42 #Fixed seed, to prevent train/test reshuffling between test runs ) + + # # Save the dataset to disk + # split_dataset.save_to_disk(kargs["data_path"]) + # # Does nothing else (done) + # return - # Save the dataset to disk - split_dataset.save_to_disk(kargs["data_path"]) - # Does nothing else (done) - return - - # Reverting back to general purpose HF dataset / tokenizer handling - #-------------------------------- - - load_dataset_params = { - 'path': kargs["source"], - 'num_proc': num_cpus - } + else: + # Reverting back to general purpose HF dataset / tokenizer handling + #-------------------------------- + load_dataset_params = { + 'path': kargs["source"], + 'num_proc': num_cpus + } - # Handle advance params (if set) - if kargs["source_data_dir"] is not None: - load_dataset_params['data_dir'] = kargs["source_data_dir"] - if kargs["source_dataset_params"] is not None: - source_dataset_params = kargs["source_dataset_params"] - for k, v in source_dataset_params.items(): - load_dataset_params[k] = v - - # Load the dataset - src_dataset = load_dataset(**load_dataset_params) - - # If for some reason the dataset is a "test" only split, and missing a "train" split, we remap it as a "train" split - if "train" not in src_dataset.keys(): - if "test" in src_dataset.keys(): - src_dataset["train"] = src_dataset["test"] - del src_dataset["test"] + # Handle advance params (if set) + if kargs["source_data_dir"] is not None: + load_dataset_params['data_dir'] = kargs["source_data_dir"] + if kargs["source_dataset_params"] is not None: + source_dataset_params = kargs["source_dataset_params"] + for k, v in source_dataset_params.items(): + load_dataset_params[k] = v + + # Load the dataset + src_dataset = load_dataset(**load_dataset_params) + + # If for some reason the dataset is a "test" only split, and missing a "train" split, we remap it as a "train" split + if "train" not in src_dataset.keys(): + if "test" in src_dataset.keys(): + src_dataset["train"] = src_dataset["test"] + del src_dataset["test"] + else: + raise ValueError('Dataset must have a "train" split') + + # If an int value is used, it is interprated as document count + # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset + if kargs["dataset_offset"] > 0 or kargs["dataset_length"] > 0: + # src dataset length + train_length = len(src_dataset["train"]) + + # Compute the offset position + offset_val = kargs["dataset_offset"] + + # If offset is a float, we will use it as a percentage + if offset_val < 0: + offset_val = 0 + if offset_val > 0 and offset_val < 1.0: + offset_val = int(train_length * offset_val) # Rounded down value + + # Compute the length position + length_val = kargs["dataset_length"] + if length_val < 0: + length_val = train_length - offset_val + if length_val > 0 and length_val < 1.0: + length_val = int(train_length * length_val) + if length_val > (train_length - offset_val): + length_val = (train_length - offset_val) + + # Get the subset 
of the dataset + src_dataset["train"] = src_dataset["train"].select(range(offset_val, offset_val + length_val)) + + # Tokenizer vars + hf_tokenizer = None + world_tokenizer = None + + # Load the tokenizer according to either its predefined name or its path + # (defaults to neox) + if kargs["tokenizer"] == "neox": + tokenizer_file = os.path.join(SRC_DIR, "./dataflow/20B_tokenizer.json") + hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file) + elif kargs["tokenizer"] == "world": + # Setup the tokenizer + world_tokenizer = True else: - raise ValueError('Dataset must have a "train" split') - - # If an int value is used, it is interprated as document count - # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset - if kargs["dataset_offset"] > 0 or kargs["dataset_length"] > 0: - # src dataset length - train_length = len(src_dataset["train"]) - - # Compute the offset position - offset_val = kargs["dataset_offset"] - - # If offset is a float, we will use it as a percentage - if offset_val < 0: - offset_val = 0 - if offset_val > 0 and offset_val < 1.0: - offset_val = int(train_length * offset_val) # Rounded down value - - # Compute the length position - length_val = kargs["dataset_length"] - if length_val < 0: - length_val = train_length - offset_val - if length_val > 0 and length_val < 1.0: - length_val = int(train_length * length_val) - if length_val > (train_length - offset_val): - length_val = (train_length - offset_val) - - # Get the subset of the dataset - src_dataset["train"] = src_dataset["train"].select(range(offset_val, offset_val + length_val)) - - # Tokenizer vars - hf_tokenizer = None - world_tokenizer = None - - # Load the tokenizer according to either its predefined name or its path - # (defaults to neox) - if kargs["tokenizer"] == "neox": - tokenizer_file = os.path.join(SRC_DIR, "./dataflow/20B_tokenizer.json") - hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file) - elif kargs["tokenizer"] == "world": - # Setup the tokenizer - world_tokenizer = True - else: - # AutoTokenizer - tokenizerName = kargs["tokenizer"] - - # with custom args and props - tokenizerKWArgs = {} - tokenizerProps = {} - if kargs["autoTokenizer"] is not None: - if kargs["autoTokenizer"]["kwargs"] is not None: - tokenizerKWArgs = kargs["autoTokenizer"]["kwargs"] - if kargs["autoTokenizer"]["props"] is not None: - tokenizerProps = kargs["autoTokenizer"]["props"] - - # Intialize the tokenizer, with kwargs - hf_tokenizer = AutoTokenizer.from_pretrained(tokenizerName, **tokenizerKWArgs) - - # Configure the tokenizer properties - for k, v in tokenizerProps.items(): - setattr(hf_tokenizer, k, v) - - # Function used to tokenize the dataset as per HF tokenizer format - # if given the textual data, it will return the tokenized data - def encodeTokens(x): - if world_tokenizer is True: - # If x is an array of strings, we encode them seperately, and conslidate the result - if isinstance(x, list): - id_arr = [] - type_arr = [] - mask_arr = [] - for i in range(len(x)): - enc_str = world_tokenizer_encode(x[i], world_add_endoftext_token=world_add_endoftext_token) - id_arr.append(enc_str) - type_arr.append([0] * len(enc_str)) - mask_arr.append([1] * len(enc_str)) - - # Consolidate the result + # AutoTokenizer + tokenizerName = kargs["tokenizer"] + + # with custom args and props + tokenizerKWArgs = {} + tokenizerProps = {} + if kargs["autoTokenizer"] is not None: + if kargs["autoTokenizer"]["kwargs"] is not None: + tokenizerKWArgs = kargs["autoTokenizer"]["kwargs"] + 
if kargs["autoTokenizer"]["props"] is not None: + tokenizerProps = kargs["autoTokenizer"]["props"] + + # Intialize the tokenizer, with kwargs + hf_tokenizer = AutoTokenizer.from_pretrained(tokenizerName, **tokenizerKWArgs) + + # Configure the tokenizer properties + for k, v in tokenizerProps.items(): + setattr(hf_tokenizer, k, v) + + # Function used to tokenize the dataset as per HF tokenizer format + # if given the textual data, it will return the tokenized data + def encodeTokens(x): + if world_tokenizer is True: + # If x is an array of strings, we encode them seperately, and conslidate the result + if isinstance(x, list): + id_arr = [] + type_arr = [] + mask_arr = [] + for i in range(len(x)): + enc_str = world_tokenizer_encode(x[i], world_add_endoftext_token=world_add_endoftext_token) + id_arr.append(enc_str) + type_arr.append([0] * len(enc_str)) + mask_arr.append([1] * len(enc_str)) + + # Consolidate the result + return { + 'input_ids': id_arr, + 'token_type_ids': type_arr, + 'attention_mask': mask_arr + } + + # Else we encode the string and return it following the HF tokenizer format + enc_str = world_tokenizer_encode(x, world_add_endoftext_token=world_add_endoftext_token) return { - 'input_ids': id_arr, - 'token_type_ids': type_arr, - 'attention_mask': mask_arr + 'input_ids': enc_str, + 'token_type_ids': [0] * len(enc_str), + 'attention_mask': [1] * len(enc_str) } - - # Else we encode the string and return it following the HF tokenizer format - enc_str = world_tokenizer_encode(x, world_add_endoftext_token=world_add_endoftext_token) - return { - 'input_ids': enc_str, - 'token_type_ids': [0] * len(enc_str), - 'attention_mask': [1] * len(enc_str) - } - # We use the HF tokenizer as it is, and get the input_ids - return hf_tokenizer(x) - - # Multi column merging default values setup - if kargs["multi_column_keys"] is None: - multi_column_keys = ['instruction', 'input', 'output'] - multi_column_prefix = ['Instruction:\n', 'Input:\n', 'Output:\n'] - multi_column_suffix = ['', '', ''] - multi_column_train_mask = [True, False, True] - multi_column_separator = '\n\n' - else: - multi_column_keys = kargs["multi_column_keys"] - multi_column_prefix = kargs["multi_column_prefix"] - multi_column_suffix = kargs["multi_column_suffix"] - multi_column_train_mask = kargs["multi_column_train_mask"] - multi_column_separator = kargs["multi_column_separator"] - - # Tokenized encodings for multi column keys - multi_column_enabled = len(multi_column_keys) > 0 - multi_column_prefix_encodings = [] - multi_column_suffix_encodings = [] - multi_column_separator_encodings = None - - # Process the multi column settings - if multi_column_enabled: + # We use the HF tokenizer as it is, and get the input_ids + return hf_tokenizer(x) - # Tokenize the multi column strings - for i in range(len(multi_column_keys)): - if multi_column_prefix is not None and multi_column_prefix[i] is not None: - multi_column_prefix_encodings.append(encodeTokens(multi_column_prefix[i])) - if multi_column_suffix is not None and multi_column_suffix[i] is not None: - multi_column_suffix_encodings.append(encodeTokens(multi_column_suffix[i])) + # Multi column merging default values setup + if kargs["multi_column_keys"] is None: + multi_column_keys = ['instruction', 'input', 'output'] + multi_column_prefix = ['Instruction:\n', 'Input:\n', 'Output:\n'] + multi_column_suffix = ['', '', ''] + multi_column_train_mask = [True, False, True] + multi_column_separator = '\n\n' + else: + multi_column_keys = kargs["multi_column_keys"] + multi_column_prefix = 
kargs["multi_column_prefix"] + multi_column_suffix = kargs["multi_column_suffix"] + multi_column_train_mask = kargs["multi_column_train_mask"] + multi_column_separator = kargs["multi_column_separator"] - # Tokenize the multi column separator - if multi_column_separator is not None and len(multi_column_separator) > 0: - multi_column_separator_encodings = encodeTokens(multi_column_separator) - - conversation_prefix_encoding_map = {} - conversation_suffix_encoding_map = {} - conversation_end_of_conversation_token = encodeTokens(kargs["conversation_end_of_conversation"]) if kargs["conversation_end_of_conversation"] is not None else None - conversation_enabled = False - if 'conversation_format' in kargs and kargs["conversation_format"] is not None: - if kargs["conversation_format"] == "iopairs": - # preencode all prefixes (keyed by the input key) - for key, prefix in kargs['conversation_input_key_prefix_map'].items(): - conversation_prefix_encoding_map[key] = encodeTokens(prefix) - conversation_enabled = True - elif kargs["conversation_format"] == "sender": - # preencode all prefixes (keyed by the sender value) - for key, relabel in kargs['conversation_sender_value_map'].items(): - for input_key, value in kargs['conversation_input_key_map'].items(): - if input_key not in conversation_prefix_encoding_map: - conversation_prefix_encoding_map[input_key] = {} - conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) - - for key, suffix in kargs['conversation_sender_suffix'].items(): - conversation_suffix_encoding_map[key] = encodeTokens(suffix) - # example conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') - - conversation_enabled = True - - # Maps the dataset record to the tokenized result - # handles a wide variety of format according to the data configuration - # - # - custom text keys - # - multiple key columns merged - # - prompt/completion format - # - text column itself - # - # Throws an error, if it failed to process the record - # - # This is called for each row record in the dataset - def map_tokenizer(x): - # Custom text column support - if kargs["custom_text_key"] is not None: - if kargs["custom_text_key"] in x: - return encodeTokens(x[kargs["custom_text_key"]]) + # Tokenized encodings for multi column keys + multi_column_enabled = len(multi_column_keys) > 0 + multi_column_prefix_encodings = [] + multi_column_suffix_encodings = [] + multi_column_separator_encodings = None + + # Process the multi column settings + if multi_column_enabled: - if conversation_enabled: - conv_key = kargs['conversation_key'] if 'conversation_key' in kargs else None - conversation = x[conv_key] if conv_key is not None else x - - # Array of output values we will return - input_ids = [] - token_type_ids = [] - attention_mask = [] - - if kargs['conversation_format'] == 'iopairs': - # lets loop through each io pair - for i in range(len(conversation)): - # lets loop through each key in the io pair - for key, value in conversation[i].items(): - # lets get the prefix for this key - prefix = conversation_prefix_encoding_map[key] if sender in conversation_prefix_encoding_map[key] else None - - # Add the prefix - if prefix is not None: - input_ids += prefix['input_ids'] - token_type_ids += prefix['token_type_ids'] - attention_mask += prefix['attention_mask'] - - # Tokenize the column - column_encodings = encodeTokens(value) - - # Add the column - input_ids += column_encodings['input_ids'] - token_type_ids += column_encodings['token_type_ids'] - - if 
key not in kargs["conversation_input_key_mask"] or kargs["conversation_input_key_mask"][key]: - # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data - attention_mask += ([1] * len(column_encodings['input_ids'])) - else: # kargs["conversation_input_key_mask"][key] is False - # This means it is false, lets not pay attention to it - attention_mask += ([0] * len(column_encodings['input_ids'])) + # Tokenize the multi column strings + for i in range(len(multi_column_keys)): + if multi_column_prefix is not None and multi_column_prefix[i] is not None: + multi_column_prefix_encodings.append(encodeTokens(multi_column_prefix[i])) + if multi_column_suffix is not None and multi_column_suffix[i] is not None: + multi_column_suffix_encodings.append(encodeTokens(multi_column_suffix[i])) + + # Tokenize the multi column separator + if multi_column_separator is not None and len(multi_column_separator) > 0: + multi_column_separator_encodings = encodeTokens(multi_column_separator) + + conversation_prefix_encoding_map = {} + conversation_suffix_encoding_map = {} + conversation_end_of_conversation_token = encodeTokens(kargs["conversation_end_of_conversation"]) if kargs["conversation_end_of_conversation"] is not None else None + conversation_enabled = False + if 'conversation_format' in kargs and kargs["conversation_format"] is not None: + if kargs["conversation_format"] == "iopairs": + # preencode all prefixes (keyed by the input key) + for key, prefix in kargs['conversation_input_key_prefix_map'].items(): + conversation_prefix_encoding_map[key] = encodeTokens(prefix) + conversation_enabled = True + elif kargs["conversation_format"] == "sender": + # preencode all prefixes (keyed by the sender value) + for key, relabel in kargs['conversation_sender_value_map'].items(): + for input_key, value in kargs['conversation_input_key_map'].items(): + if input_key not in conversation_prefix_encoding_map: + conversation_prefix_encoding_map[input_key] = {} + conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) + + for key, suffix in kargs['conversation_sender_suffix'].items(): + conversation_suffix_encoding_map[key] = encodeTokens(suffix) + # example conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') + + conversation_enabled = True + + # Maps the dataset record to the tokenized result + # handles a wide variety of format according to the data configuration + # + # - custom text keys + # - multiple key columns merged + # - prompt/completion format + # - text column itself + # + # Throws an error, if it failed to process the record + # + # This is called for each row record in the dataset + def map_tokenizer(x): + # Custom text column support + if kargs["custom_text_key"] is not None: + if kargs["custom_text_key"] in x: + return encodeTokens(x[kargs["custom_text_key"]]) + + if conversation_enabled: + conv_key = kargs['conversation_key'] if 'conversation_key' in kargs else None + conversation = x[conv_key] if conv_key is not None else x - - suffix = conversation_suffix_encoding_map[key] if sender in conversation_suffix_encoding_map else None + # Array of output values we will return + input_ids = [] + token_type_ids = [] + attention_mask = [] - if suffix is not None: - input_ids += suffix['input_ids'] - token_type_ids += suffix['token_type_ids'] - attention_mask += suffix['attention_mask'] - - elif kargs['conversation_format'] == 'sender': - for i in range(len(conversation)): - turn = conversation[i] - 
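                    # Illustrative example of a hypothetical 'sender'-format record at this point:
                    #   turn = {"sender": "user", "message": "Hello"}
                    # Once the sender is read just below, the prefix looked up for the 'message'
                    # key is conversation_prefix_encoding_map['message']['user'], i.e. the
                    # encoding of '\n\nUser:' (see the prefix map construction above).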
sender = turn[kargs['conversation_sender_key']] - - for key, value in kargs['conversation_input_key_map'].items(): - if key in turn: + if kargs['conversation_format'] == 'iopairs': + # lets loop through each io pair + for i in range(len(conversation)): + # lets loop through each key in the io pair + for key, value in conversation[i].items(): # lets get the prefix for this key - prefix = conversation_prefix_encoding_map[key][sender] if sender in conversation_prefix_encoding_map[key] else None + prefix = conversation_prefix_encoding_map[key] if sender in conversation_prefix_encoding_map[key] else None # Add the prefix if prefix is not None: @@ -341,152 +309,196 @@ def map_tokenizer(x): attention_mask += prefix['attention_mask'] # Tokenize the column - column_encodings = encodeTokens(turn[key]) + column_encodings = encodeTokens(value) # Add the column input_ids += column_encodings['input_ids'] token_type_ids += column_encodings['token_type_ids'] - if sender not in kargs["conversation_sender_mask"] or kargs["conversation_sender_mask"][sender]: + if key not in kargs["conversation_input_key_mask"] or kargs["conversation_input_key_mask"][key]: # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data attention_mask += ([1] * len(column_encodings['input_ids'])) else: # kargs["conversation_input_key_mask"][key] is False # This means it is false, lets not pay attention to it attention_mask += ([0] * len(column_encodings['input_ids'])) - suffix = conversation_suffix_encoding_map[sender] if sender in conversation_suffix_encoding_map else None + + suffix = conversation_suffix_encoding_map[key] if sender in conversation_suffix_encoding_map else None if suffix is not None: input_ids += suffix['input_ids'] token_type_ids += suffix['token_type_ids'] attention_mask += suffix['attention_mask'] - - if len(input_ids) > 0 and conversation_end_of_conversation_token is not None: - input_ids += conversation_end_of_conversation_token['input_ids'] - token_type_ids += conversation_end_of_conversation_token['token_type_ids'] - attention_mask += conversation_end_of_conversation_token['attention_mask'] - - return { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask - } - # Multi column merging support - if multi_column_enabled: - # Lets count the number of columns we have - # that have data in them - num_columns = 0 - for i in range(len(multi_column_keys)): - if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(x[multi_column_keys[i]]) > 0: - num_columns += 1 - # If we have more than 1 column, we will have to merge them - if num_columns > 1: - # Array of output values we will return - input_ids = [] - token_type_ids = [] - attention_mask = [] - - # First item flag - is_first_item = True + elif kargs['conversation_format'] == 'sender': + for i in range(len(conversation)): + turn = conversation[i] + sender = turn[kargs['conversation_sender_key']] + + for key, value in kargs['conversation_input_key_map'].items(): + if key in turn: + # lets get the prefix for this key + prefix = conversation_prefix_encoding_map[key][sender] if sender in conversation_prefix_encoding_map[key] else None + + # Add the prefix + if prefix is not None: + input_ids += prefix['input_ids'] + token_type_ids += prefix['token_type_ids'] + attention_mask += prefix['attention_mask'] + + # Tokenize the column + column_encodings = encodeTokens(turn[key]) + + # Add the column + input_ids += column_encodings['input_ids'] + token_type_ids += 
column_encodings['token_type_ids'] + + if sender not in kargs["conversation_sender_mask"] or kargs["conversation_sender_mask"][sender]: + # If the corresponding `conversation_input_key_mask` is not set, we will assume as valid training data + attention_mask += ([1] * len(column_encodings['input_ids'])) + else: # kargs["conversation_input_key_mask"][key] is False + # This means it is false, lets not pay attention to it + attention_mask += ([0] * len(column_encodings['input_ids'])) + + suffix = conversation_suffix_encoding_map[sender] if sender in conversation_suffix_encoding_map else None + + if suffix is not None: + input_ids += suffix['input_ids'] + token_type_ids += suffix['token_type_ids'] + attention_mask += suffix['attention_mask'] + + if len(input_ids) > 0 and conversation_end_of_conversation_token is not None: + input_ids += conversation_end_of_conversation_token['input_ids'] + token_type_ids += conversation_end_of_conversation_token['token_type_ids'] + attention_mask += conversation_end_of_conversation_token['attention_mask'] - # Lets loop through each column + return { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask + } + + # Multi column merging support + if multi_column_enabled: + # Lets count the number of columns we have + # that have data in them + num_columns = 0 for i in range(len(multi_column_keys)): - # And process the column if it has data if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(x[multi_column_keys[i]]) > 0: - # Add the separator if this is not the first item - if not is_first_item and multi_column_separator_encodings is not None: - input_ids += multi_column_separator_encodings['input_ids'] - token_type_ids += multi_column_separator_encodings['token_type_ids'] - attention_mask += multi_column_separator_encodings['attention_mask'] - - # Add the prefix - if len(multi_column_prefix_encodings) > i and multi_column_prefix_encodings[i] is not None: - input_ids += multi_column_prefix_encodings[i]['input_ids'] - token_type_ids += multi_column_prefix_encodings[i]['token_type_ids'] - attention_mask += multi_column_prefix_encodings[i]['attention_mask'] - - # Tokenize the column - column_encodings = encodeTokens(x[multi_column_keys[i]]) - - # Add the column - input_ids += column_encodings['input_ids'] - token_type_ids += column_encodings['token_type_ids'] - - # Configure the attention masks accordingly - if i > len(multi_column_train_mask): - # If the corresponding `multi_column_train_mask` is not set, we will assume as valid training data - attention_mask += ([1] * len(column_encodings['input_ids'])) - elif multi_column_train_mask[i] is False: - # If the `multi_column_train_mask` is set, but configured as false, we should not pay attention to it - attention_mask += ([0] * len(column_encodings['input_ids'])) - else: # multi_column_train_mask[i] is True - # This means it is true, lets pay attention once again - attention_mask += ([1] * len(column_encodings['input_ids'])) + num_columns += 1 + # If we have more than 1 column, we will have to merge them + if num_columns > 1: + # Array of output values we will return + input_ids = [] + token_type_ids = [] + attention_mask = [] + + # First item flag + is_first_item = True + + # Lets loop through each column + for i in range(len(multi_column_keys)): + # And process the column if it has data + if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(x[multi_column_keys[i]]) > 0: + # Add the separator if this is not the first item + if 
not is_first_item and multi_column_separator_encodings is not None: + input_ids += multi_column_separator_encodings['input_ids'] + token_type_ids += multi_column_separator_encodings['token_type_ids'] + attention_mask += multi_column_separator_encodings['attention_mask'] - # Add the suffix - if len(multi_column_suffix_encodings) > i and multi_column_suffix_encodings[i] is not None: - input_ids += multi_column_suffix_encodings[i]['input_ids'] - token_type_ids += multi_column_suffix_encodings[i]['token_type_ids'] - attention_mask += multi_column_suffix_encodings[i]['attention_mask'] - - # Set the first item flag to false - is_first_item = False - - # Return the merged columns + # Add the prefix + if len(multi_column_prefix_encodings) > i and multi_column_prefix_encodings[i] is not None: + input_ids += multi_column_prefix_encodings[i]['input_ids'] + token_type_ids += multi_column_prefix_encodings[i]['token_type_ids'] + attention_mask += multi_column_prefix_encodings[i]['attention_mask'] + + # Tokenize the column + column_encodings = encodeTokens(x[multi_column_keys[i]]) + + # Add the column + input_ids += column_encodings['input_ids'] + token_type_ids += column_encodings['token_type_ids'] + + # Configure the attention masks accordingly + if i > len(multi_column_train_mask): + # If the corresponding `multi_column_train_mask` is not set, we will assume as valid training data + attention_mask += ([1] * len(column_encodings['input_ids'])) + elif multi_column_train_mask[i] is False: + # If the `multi_column_train_mask` is set, but configured as false, we should not pay attention to it + attention_mask += ([0] * len(column_encodings['input_ids'])) + else: # multi_column_train_mask[i] is True + # This means it is true, lets pay attention once again + attention_mask += ([1] * len(column_encodings['input_ids'])) + + # Add the suffix + if len(multi_column_suffix_encodings) > i and multi_column_suffix_encodings[i] is not None: + input_ids += multi_column_suffix_encodings[i]['input_ids'] + token_type_ids += multi_column_suffix_encodings[i]['token_type_ids'] + attention_mask += multi_column_suffix_encodings[i]['attention_mask'] + + # Set the first item flag to false + is_first_item = False + + # Return the merged columns + return { + 'input_ids': input_ids, + 'token_type_ids': token_type_ids, + 'attention_mask': attention_mask + } + + # Prompt completion support + if 'prompt' in x and 'completion' in x: + # Array of output values we will return + input_ids = None + token_type_ids = None + attention_mask = None + + # Tokenize both prompt and completion + # Note that the tokenizer will process and return the input_ids in batches + prompt_encodings = encodeTokens(x['prompt']) + completion_encodings = encodeTokens(x['completion']) + + # Join the two input_ids lists + input_ids = prompt_encodings['input_ids'] + completion_encodings['input_ids'] + # Join the two token_type_ids lists + token_type_ids = prompt_encodings['token_type_ids'] + completion_encodings['token_type_ids'] + # Setup the attention mask, 0 for prompt, 1 for completion, if masking is enabled + if kargs["disable_prompt_completion_mask"]: + attention_mask = ([1] * len(prompt_encodings['input_ids']) + [1] * len(completion_encodings['input_ids'])) + else: + attention_mask = ([0] * len(prompt_encodings['input_ids']) + [1] * len(completion_encodings['input_ids'])) + + # Prepare and return the output object return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask + 'attention_mask': attention_mask, } + + # 
Fallback to standard text tokenization + if 'text' in x: + return encodeTokens(x['text']) + + raise ValueError('Invalid dataset format, must contain either the configured "multi column" or prompt/completion or text') - # Prompt completion support - if 'prompt' in x and 'completion' in x: - # Array of output values we will return - input_ids = None - token_type_ids = None - attention_mask = None - - # Tokenize both prompt and completion - # Note that the tokenizer will process and return the input_ids in batches - prompt_encodings = encodeTokens(x['prompt']) - completion_encodings = encodeTokens(x['completion']) - - # Join the two input_ids lists - input_ids = prompt_encodings['input_ids'] + completion_encodings['input_ids'] - # Join the two token_type_ids lists - token_type_ids = prompt_encodings['token_type_ids'] + completion_encodings['token_type_ids'] - # Setup the attention mask, 0 for prompt, 1 for completion, if masking is enabled - if kargs["disable_prompt_completion_mask"]: - attention_mask = ([1] * len(prompt_encodings['input_ids']) + [1] * len(completion_encodings['input_ids'])) - else: - attention_mask = ([0] * len(prompt_encodings['input_ids']) + [1] * len(completion_encodings['input_ids'])) - - # Prepare and return the output object - return { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask, - } + # Map the dataset to the tokenizer, removing the old text column + src_dataset = src_dataset.map(map_tokenizer, batched=False, num_proc=num_cpus) - # Fallback to standard text tokenization - if 'text' in x: - return encodeTokens(x['text']) - - raise ValueError('Invalid dataset format, must contain either the configured "multi column" or prompt/completion or text') + # ===================================================== - # Map the dataset to the tokenizer, removing the old text column - src_dataset = src_dataset.map(map_tokenizer, batched=False, num_proc=num_cpus) - # Remove all features, except input_ids, token_type_ids and attention_mask # as the metadata/etc columns may cause problems down the line (when passed to the trainer) dataset_features = src_dataset["train"].features dataset_features_to_remove = {k: v for k, v in dataset_features.items() if k not in ["input_ids", "token_type_ids", "attention_mask"]} src_dataset = src_dataset.remove_columns(list(dataset_features_to_remove.keys())) - + # Get the newline token - endOfDoc_tokenSet = encodeTokens(["\n"]) - endOfDoc_tokenSet["input_ids"][0][0] = 0 + endOfDoc_tokenSet = { + 'input_ids': [[0]], + 'token_type_ids': [[0]], + 'attention_mask': [[1]], + } + # See if rechunking is needed, this is useful mostly for "text" based datasets # where we would need to split them into "digestable" context length sizes @@ -560,19 +572,21 @@ def dataset_filter(x): rechunking_happened = False # Perform rechunking if needed for "text" based datasets - if kargs["source"] == "text" and kargs["text_rechunk_size"] > 0 and kargs["text_rechunk_auto"]: - rechunking_happened = True - src_dataset = src_dataset.map(rechunk_text, batched=True, - batch_size=processing_max_batch_size, - num_proc=num_cpus) - - # Perform rechunking after filtering, if source is not a "text" based - # dataset and text_rechunk_force is enabled - if kargs["source"] != "text" and kargs["text_rechunk_size"] > 0 and kargs["text_rechunk_force"]: - rechunking_happened = True - src_dataset = src_dataset.map(rechunk_text, batched=True, - batch_size=processing_max_batch_size, - num_proc=num_cpus) + text_rechunk_size = 
int(kargs["text_rechunk_size"]) + if text_rechunk_size > 0: + if kargs["source"] == "text" and (kargs["text_rechunk_auto"] or kargs["text_rechunk_force"]): + rechunking_happened = True + src_dataset = src_dataset.map(rechunk_text, batched=True, + batch_size=min(text_rechunk_size*8, processing_max_batch_size), + num_proc=num_cpus) + + # Perform rechunking after filtering, if source is not a "text" based + # dataset and text_rechunk_force is enabled + if kargs["source"] != "text" and kargs["text_rechunk_force"]: + rechunking_happened = True + src_dataset = src_dataset.map(rechunk_text, batched=True, + batch_size=min(text_rechunk_size*8, processing_max_batch_size), + num_proc=num_cpus) # Check if the dataset does not have a test split # and if so, perform the split @@ -730,7 +744,7 @@ def merge_into_existing_samples(i): # Perform the dataset packing src_dataset['train'] = src_dataset['train'].map(pack_dataset_in_sequence, batched=True, - batch_size=processing_max_batch_size, + batch_size=min(packing_min_ctx_len*2*3*5, processing_max_batch_size), num_proc=num_cpus) else: # Remove the sample_length column, as it is no longer needed @@ -922,8 +936,7 @@ def __init__( # Batch size scanning range, used for deciding the max number of documents # to process simultaneously at a time. This is used to prevent OOM errors # while rearranging the dataset, etc. Used for both packing / sorting operations - # ( Defaults to all records ) - processing_max_batch_size: int = -1, + processing_max_batch_size: int = 100000, # Skip database setup checks if datapath exists, ignored if using preload_datapath.py skip_datapath_setup: bool = False diff --git a/notebook/trainer-v5-validation/config/minipile-world-512.yaml b/notebook/trainer-v5-validation/config/minipile-world-512.yaml new file mode 100644 index 00000000..e7801196 --- /dev/null +++ b/notebook/trainer-v5-validation/config/minipile-world-512.yaml @@ -0,0 +1,292 @@ +# lightning.pytorch==2.0.2 +seed_everything: 3941088705 +trainer: + + # + # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload` + # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful + # for training LoRA on large models on a single GPU. 
+ #
+ # In general you would want to use the following:
+ #
+ # - deepspeed_stage_1 : Each of your GPUs has too much vram, and you do not know what to do
+ #
+ # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpu each with sufficient vram
+ # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu
+ #
+ # - deepspeed_stage_3 : Split up the model across multiple gpu, useful for large models, at a performance cost
+ # - deepspeed_stage_3_offload : Additional offloading, for an even greater performance cost
+ #
+ # For more details see:
+ # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2
+ #
+ strategy: deepspeed_stage_2
+
+ # Logger setting for wandb; comment out the whole logger section if you want to disable wandb
+ # ---
+ logger:
+ class_path: lightning.pytorch.loggers.WandbLogger
+ init_args:
+ name: 'infctx-v5-validation minipile-512 (train-ctx=512, data-ctx=512)'
+ project: 'RWKV-infctx-unit-test'
+ tags: ['RWKV', 'infctx']
+
+ # Checkpoint settings for the training process
+ callbacks:
+ class_path: lightning.pytorch.callbacks.ModelCheckpoint
+ init_args:
+ # Configure this to the path you want to save your checkpoints to
+ # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
+ #
+ # To convert a checkpoint to a model, you can use the
+ # `python3 export_checkpoint.py <checkpoint path>` script,
+ # which will create a `rwkv_model.pth` in the checkpoint directory.
+ #
+ # Do not use the `zero_to_fp32.py` script as that will have export format issues
+ dirpath: ../checkpoint/trainer-validaiton/infctx-v5-minipile-512
+ filename: null
+
+ # Save the top/last K checkpoints
+ save_top_k: 3
+ # Choose by the most recent checkpoints (step based)
+ monitor: 'step'
+ mode: max
+
+ # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
+ # useful to simplify checkpoint resume scripts, at the price of disk performance
+ save_last: false
+
+ # DO NOT set this as true, as the model weight exported will have format issues
+ # export as a checkpoint, and use the `export_checkpoint.py` script to convert it to a model instead
+ save_weights_only: false
+
+ # How frequently you want to save a checkpoint, in training steps. 
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches
+ #
+ # In general you will want to avoid putting a low number (especially if accumulate_grad_batches <= 100)
+ # as the checkpoint process will pause all the gpu training for some time, slowing down the overall process
+ # However, you do not want to configure too high a number either, or you will lose too much progress if the training crashes
+ every_n_train_steps: 1000
+ every_n_epochs: null
+ save_on_train_epoch_end: true
+ train_time_interval: null
+
+ # Other settings, you can probably leave alone
+ verbose: false
+ auto_insert_metric_name: true
+
+ ########################################
+ ## Training run parameter settings
+ ########################################
+
+ # Generally what you want to configure is the maximum number of epochs
+ # Leave it as -1, and it will keep going forever till interrupted
+ # Or set it as a number, and it will stop after that number of epochs
+ max_epochs: 1
+ min_epochs: null
+ max_steps: -1
+ min_steps: null
+ max_time: null
+
+ # Number of datasamples to train for each step. A data sample is considered
+ # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
+ #
+ # This decides the number of datasamples to learn from together, before backpropagating
+ # any weight changes at the end of the batch.
+ #
+ # Recommended to be a big enough number (like 128/256) so that it prevents the training
+ # loss from fluctuating in the process, but not so big that the increased
+ # GPU vRAM / offloaded RAM usage will cause the training to crash.
+ #
+ # You are also recommended to configure this to a large enough number to fully utilize
+ # your GPU processing time %, and avoid idle time for the GPU between batches
+ #
+ # This number is divided by the number of GPUs and nodes configured
+ # So if you have 4 GPUs, and 2 nodes, and this is configured as 128
+ # each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches
+ target_batch_size: 16
+
+ # Microbatching: the chunks which we split our data by. This substantially increases vram usage
+ # for each GPU step, but substantially increases the throughput of the training process.
+ #
+ # So if you have 16 datasamples per batch per GPU, and a microbatch_size of 2, you have 8 substeps
+ #
+ # It is generally recommended to tune this to the highest value you can reasonably support
+ # on your GPU, as it has a direct impact on your overall tokens / second count.
+ #
+ # Typically you tune the microbatch_size first, before tuning the target_batch_size
+ microbatch_size: 16
+
+
+
+########################################
+## Training model settings
+########################################
+model:
+ # Model to start the finetune/training process from
+ load_model: ../model/L12-D768-world-init.pth
+
+ # Context length to use for the training process
+ # the larger the number (and batch size) the larger the vram usage
+ #
+ # Note that if the datasample context length is larger than the ctx_len
+ # its training process would be split into ctx_len sized chunks.
+ #
+ # This allows the training of extremely large context lengths (eg. 100k),
+ # without eating up too much vram, by keeping the training context length
+ # to a reasonable number suitable for the current GPU setup
+ ctx_len: 512
+
+ # Data samples would be cut down to the respective max ctx_len_cutoffs
+ # values if they are larger than ctx_len. 
If the data sample is larger than
+ # the largest len_cutoff, the remaining data will be discarded
+ ctx_len_cutoffs: []
+ # Experimental setting: the number of tokens to skip in the data sample
+ # prefix, for the respective cutoff length. Used to speed up the process
+ ctx_len_warmup_steps: []
+
+ # Learning rate of the training process
+ # ---
+
+ # Initial learning rate of the process
+ lr_init: 6e-4
+ # Final learning rate after the learning rate period
+ # learning rate will stay at the final value from then onwards
+ lr_final: 6e-5
+
+ # Number of epochs to reduce the learning rate from lr_init to lr_final
+ #  1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
+ #  0 means lr_final will apply immediately
+ # -1 means we take the current max_step / max_epoch as the period
+ lr_period: 1
+ # lr_period type if it is set, defaults to epoch
+ lr_period_type: epoch
+
+ # Adam optimizer settings
+ # You probably want to leave this alone, unless you know what you are doing
+ beta1: 0.9
+ beta2: 0.99
+ adam_eps: 1.0e-08
+ weight_decay: 0.001
+
+ # torch.set_float32_matmul_precision, used to optimize operations with tensor cores
+ # this should be set as null for GPUs without cuda / tensor cores
+ torch_set_float32_matmul_precision: 'high'
+ # torch_set_float32_matmul_precision: null
+
+ # Segment based learning, used to work around training of large context lengths
+ # beyond what can be supported by the current GPU vram
+ #
+ # This is not 1:1 equivalent to the same training process with the full required vram,
+ # as the training process is split into multiple segments, part by part,
+ # with limited learnings carried over from the previous segment.
+ bptt_learning: true
+
+ # Segment range to perform backprop learning on
+ #  1 means to apply it only to the last segment
+ # -1 means to apply it to all segments
+ bptt_learning_range: -1
+
+ # Various other pytorch lightning settings you probably should leave alone
+ # ---
+ # grad_cp: true
+ # warmup_steps: -1
+ # layerwise_lr: true
+ # dim_att: null
+ # dim_ffn: null
+
+ # Disable gradCP for SPEED
+ grad_cp: false
+data:
+ # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
+ #
+ # Use this if you have built your own dataset and saved it with `save_to_disk()`
+ # with source left as null. Otherwise, configure this to a directory in which the
+ # dataset will be built and tokenized by the huggingface dataset process. 
+ data_path: ../datapath/minipile-world-512/
+
+ # Otherwise, provide the source path, which is used as the huggingface dataset path;
+ # this will be used to populate the dataset_path
+ #
+ # Use any of the following
+ # - hugging face dataset
+ # - Directory path to a directory containing dataset files
+ # - Path to a single dataset file
+ # - hugging face dataset mode (ie: text, csv, etc - use data_dir to configure the path then)
+ # - null
+ #
+ # If source is disabled, all other params, except data_path, are ignored
+ source: "../dataset/minipile"
+ # source: text
+ # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
+
+ # Use data_dir if you are using source=text/json/etc;
+ # this should be relative to the trainer script path
+ # source_data_dir: null
+
+ # After loading the dataset, split out test data used for unit-testing
+ # This process is skipped if the dataset includes a test split
+ # This process is skipped if set to zero
+ test_split: 0
+ test_split_shuffle: false
+
+ # Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
+ # If using a custom tokenizer, provide the tokenizer file path
+ # ---
+ tokenizer: binidx
+
+ # Minimum / Maximum token size of the dataset to use
+ # useful for filtering out small noisy data samples from large datasets
+ # (eg. removal of small articles of less than 512 tokens from wikipedia)
+ #
+ # This is ignored, if set to -1
+ # ---
+ # min_token_size: 1
+ # max_token_size: -1
+
+ # Rechunking of the text dataset; this is done only when source is set as 'text'
+ # and will merge the various sentences into larger chunks, up to the target size
+ #
+ # Defaults to 4096
+ #
+ # This is ignored, if source is not set as text
+ # This is ignored, if set to zero
+ # ---
+ text_rechunk_size: 512
+
+ # Apply text rechunk to the dataset, even if it is not a 'text' source
+ # This is done only after dataset filtering, and if source is not 'text'
+ # ---
+ text_rechunk_force: true
+
+ # Custom text column to use, useful for datasets with alternative training column labels
+ # This is checked before multi column merging, default is null (disabled)
+ # eg: 'code'
+ # ---
+ # custom_text_key: 'code'
+
+ # Multi Column merging process, default setting is used to support and merge
+ # "instruction", "input", "output" datasets. 
To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + # multi_column_keys: ['instruction', 'input', 'output'] + # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # Skip database setup checks if datapath exists, ignored if using preload_datapath.py + skip_datapath_setup: True + +# Path to the current checkpoint to continue training from +# Enable this to the last checkpoint after the first run +# (if it crash and you want to resume) +# ckpt_path: ../checkpoint/trainer-validaiton/infctx-unit-test-baseline/epoch=0-step=20.ckpt +ckpt_path: null diff --git a/notebook/trainer-v5-validation/minipile-validation.ipynb b/notebook/trainer-v5-validation/minipile-validation.ipynb new file mode 100644 index 00000000..0810e624 --- /dev/null +++ b/notebook/trainer-v5-validation/minipile-validation.ipynb @@ -0,0 +1,421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RWKV v5 minipile validation\n", + "\n", + "**L12-D768 model**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the init model and test dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation\n", + "TRAINER_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"1\"\n", + "ENABLE_WANDB=True\n", + "WANDB_PREFIX=\"infctx-v5-validation - MiniPile\"\n", + "DEEPSPEED_STRAT=\"deepspeed_stage_1\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# First lets setup the various directories\n", + "!mkdir -p \"{PROJECT_DIR}/model/\"\n", + "!mkdir -p \"{PROJECT_DIR}/dataset/\"\n", + "!mkdir -p \"{PROJECT_DIR}/datapath/\"\n", + "!mkdir -p \"{PROJECT_DIR}/checkpoint/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-12-18 07:39:15-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", + "Resolving huggingface.co (huggingface.co)... 
13.33.33.55, 13.33.33.110, 13.33.33.102, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.\n", + "HTTP request sent, awaiting response... " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UycA%7Eo%7EEWgjN6kwZtAZSB6k5Nz7B5MQttQCeTVw5OD5T1lTLfhRIX3aFxwLTAyMDMOnWK0KGcnWfha6OcBl9%7EGTSfu408xpCk-PyW0E9W45m5fvR5FqLWgR41zakLePM0Ssu0Wb2syrSKCFElocrwluDvNykuHhUQgdhN9hutXENfd6qC8LZmn68eo-PlqIh6ka8sFyfJa-Bteb3mT1SAPmW19if1jiwcWmtFrB-HrdVtrxrGf033MkimToaxtDR310VEkdYmVnwaPSRcd4Hkfc2CR%7Emdd%7Eg-nzfMERz7Qh2CM%7EV6KBEOB%7EfX2fXXI8mTPVJNqxcIw23ZBEhArczmQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2023-12-18 07:39:15-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UycA%7Eo%7EEWgjN6kwZtAZSB6k5Nz7B5MQttQCeTVw5OD5T1lTLfhRIX3aFxwLTAyMDMOnWK0KGcnWfha6OcBl9%7EGTSfu408xpCk-PyW0E9W45m5fvR5FqLWgR41zakLePM0Ssu0Wb2syrSKCFElocrwluDvNykuHhUQgdhN9hutXENfd6qC8LZmn68eo-PlqIh6ka8sFyfJa-Bteb3mT1SAPmW19if1jiwcWmtFrB-HrdVtrxrGf033MkimToaxtDR310VEkdYmVnwaPSRcd4Hkfc2CR%7Emdd%7Eg-nzfMERz7Qh2CM%7EV6KBEOB%7EfX2fXXI8mTPVJNqxcIw23ZBEhArczmQ__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.33.88.54, 13.33.88.84, 13.33.88.7, ...\n", + "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.33.88.54|:443... connected.\n", + "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", + "\n", + " The file is already fully retrieved; nothing to do.\n", + "\n", + "--2023-12-18 07:39:15-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.20, 13.33.33.102, 13.33.33.110, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.20|:443... connected.\n", + "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=iDC3gWLKONw98DKGci%7ECza5tw-GGam9Yzp2u-tzqIr8SbJm%7EAWlT8QTLEiof9HrYmViwjTxt7ccXAk1m0Y0h4RchnE3xV1kCcAmCd0i%7EYAn4beKa7SvTgUKETCWGax382LNRM-pFC81TOmrbCPKbMsQKIiKIHCZ6aSjWd%7E-cqNSWs8VhL2Zs9ACnYFQXK%7E%7EOuTklP53PG0BpAfa7IGNxMyLYqQVr%7EzFd2UQAIgqpB2otxphl-e526oYIIun0jb6zcer8Qe93kG4S9O%7ETCKYBYwa2DNEYeeJZT0PAzKQrtbLDHn3LRm%7ES-uit6k-ReRDRJNEwwLsrXo9afWtn%7E9DjxA__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2023-12-18 07:39:15-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=iDC3gWLKONw98DKGci%7ECza5tw-GGam9Yzp2u-tzqIr8SbJm%7EAWlT8QTLEiof9HrYmViwjTxt7ccXAk1m0Y0h4RchnE3xV1kCcAmCd0i%7EYAn4beKa7SvTgUKETCWGax382LNRM-pFC81TOmrbCPKbMsQKIiKIHCZ6aSjWd%7E-cqNSWs8VhL2Zs9ACnYFQXK%7E%7EOuTklP53PG0BpAfa7IGNxMyLYqQVr%7EzFd2UQAIgqpB2otxphl-e526oYIIun0jb6zcer8Qe93kG4S9O%7ETCKYBYwa2DNEYeeJZT0PAzKQrtbLDHn3LRm%7ES-uit6k-ReRDRJNEwwLsrXo9afWtn%7E9DjxA__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.33.88.54, 13.33.88.84, 13.33.88.7, ...\n", + "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.33.88.54|:443... connected.\n", + "HTTP request sent, awaiting response... 
416 Requested Range Not Satisfiable\n", + "\n", + " The file is already fully retrieved; nothing to do.\n", + "\n" + ] + } + ], + "source": [ + "# Download the minipile files\n", + "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.idx https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", + "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.bin https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-12-18 07:32:33,143] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "---- Initializing model ----\n", + "No of layers: 12\n", + "Embedding size: 768\n", + "Output model path: ../model/L12-D768-world-init.pth\n", + "Vocab size: 65536\n", + "Emb scale: 0.0001\n", + "Note: this process takes a significant time (and ram) for large models\n", + "---- ----- ----\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "65536 768 -0.0001 emb.weight\n", + "768 768 1.0 blocks.0.att.receptance.weight\n", + "768 768 1.0 blocks.0.att.key.weight\n", + "768 768 1.0 blocks.0.att.value.weight\n", + "768 768 0 blocks.0.att.output.weight\n", + "768 768 1.0 blocks.0.att.gate.weight\n", + "2688 768 1.0 blocks.0.ffn.key.weight\n", + "768 768 0 blocks.0.ffn.receptance.weight\n", + "768 2688 0 blocks.0.ffn.value.weight\n", + "768 768 1.0 blocks.1.att.receptance.weight\n", + "768 768 1.0 blocks.1.att.key.weight\n", + "768 768 1.0 blocks.1.att.value.weight\n", + "768 768 0 blocks.1.att.output.weight\n", + "768 768 1.0 blocks.1.att.gate.weight\n", + "2688 768 1.0 blocks.1.ffn.key.weight\n", + "768 768 0 blocks.1.ffn.receptance.weight\n", + "768 2688 0 blocks.1.ffn.value.weight\n", + "768 768 1.0 blocks.2.att.receptance.weight\n", + "768 768 1.0 blocks.2.att.key.weight\n", + "768 768 1.0 blocks.2.att.value.weight\n", + "768 768 0 blocks.2.att.output.weight\n", + "768 768 1.0 blocks.2.att.gate.weight\n", + "2688 768 1.0 blocks.2.ffn.key.weight\n", + "768 768 0 blocks.2.ffn.receptance.weight\n", + "768 2688 0 blocks.2.ffn.value.weight\n", + "768 768 1.0 blocks.3.att.receptance.weight\n", + "768 768 1.0 blocks.3.att.key.weight\n", + "768 768 1.0 blocks.3.att.value.weight\n", + "768 768 0 blocks.3.att.output.weight\n", + "768 768 1.0 blocks.3.att.gate.weight\n", + "2688 768 1.0 blocks.3.ffn.key.weight\n", + "768 768 0 blocks.3.ffn.receptance.weight\n", + "768 2688 0 blocks.3.ffn.value.weight\n", + "768 768 1.0 blocks.4.att.receptance.weight\n", + "768 768 1.0 blocks.4.att.key.weight\n", + "768 768 1.0 blocks.4.att.value.weight\n", + "768 768 0 blocks.4.att.output.weight\n", + "768 768 1.0 blocks.4.att.gate.weight\n", + "2688 768 1.0 
blocks.4.ffn.key.weight\n", + "768 768 0 blocks.4.ffn.receptance.weight\n", + "768 2688 0 blocks.4.ffn.value.weight\n", + "768 768 1.0 blocks.5.att.receptance.weight\n", + "768 768 1.0 blocks.5.att.key.weight\n", + "768 768 1.0 blocks.5.att.value.weight\n", + "768 768 0 blocks.5.att.output.weight\n", + "768 768 1.0 blocks.5.att.gate.weight\n", + "2688 768 1.0 blocks.5.ffn.key.weight\n", + "768 768 0 blocks.5.ffn.receptance.weight\n", + "768 2688 0 blocks.5.ffn.value.weight\n", + "768 768 1.0 blocks.6.att.receptance.weight\n", + "768 768 1.0 blocks.6.att.key.weight\n", + "768 768 1.0 blocks.6.att.value.weight\n", + "768 768 0 blocks.6.att.output.weight\n", + "768 768 1.0 blocks.6.att.gate.weight\n", + "2688 768 1.0 blocks.6.ffn.key.weight\n", + "768 768 0 blocks.6.ffn.receptance.weight\n", + "768 2688 0 blocks.6.ffn.value.weight\n", + "768 768 1.0 blocks.7.att.receptance.weight\n", + "768 768 1.0 blocks.7.att.key.weight\n", + "768 768 1.0 blocks.7.att.value.weight\n", + "768 768 0 blocks.7.att.output.weight\n", + "768 768 1.0 blocks.7.att.gate.weight\n", + "2688 768 1.0 blocks.7.ffn.key.weight\n", + "768 768 0 blocks.7.ffn.receptance.weight\n", + "768 2688 0 blocks.7.ffn.value.weight\n", + "768 768 1.0 blocks.8.att.receptance.weight\n", + "768 768 1.0 blocks.8.att.key.weight\n", + "768 768 1.0 blocks.8.att.value.weight\n", + "768 768 0 blocks.8.att.output.weight\n", + "768 768 1.0 blocks.8.att.gate.weight\n", + "2688 768 1.0 blocks.8.ffn.key.weight\n", + "768 768 0 blocks.8.ffn.receptance.weight\n", + "768 2688 0 blocks.8.ffn.value.weight\n", + "768 768 1.0 blocks.9.att.receptance.weight\n", + "768 768 1.0 blocks.9.att.key.weight\n", + "768 768 1.0 blocks.9.att.value.weight\n", + "768 768 0 blocks.9.att.output.weight\n", + "768 768 1.0 blocks.9.att.gate.weight\n", + "2688 768 1.0 blocks.9.ffn.key.weight\n", + "768 768 0 blocks.9.ffn.receptance.weight\n", + "768 2688 0 blocks.9.ffn.value.weight\n", + "768 768 1.0 blocks.10.att.receptance.weight\n", + "768 768 1.0 blocks.10.att.key.weight\n", + "768 768 1.0 blocks.10.att.value.weight\n", + "768 768 0 blocks.10.att.output.weight\n", + "768 768 1.0 blocks.10.att.gate.weight\n", + "2688 768 1.0 blocks.10.ffn.key.weight\n", + "768 768 0 blocks.10.ffn.receptance.weight\n", + "768 2688 0 blocks.10.ffn.value.weight\n", + "768 768 1.0 blocks.11.att.receptance.weight\n", + "768 768 1.0 blocks.11.att.key.weight\n", + "768 768 1.0 blocks.11.att.value.weight\n", + "768 768 0 blocks.11.att.output.weight\n", + "768 768 1.0 blocks.11.att.gate.weight\n", + "2688 768 1.0 blocks.11.ffn.key.weight\n", + "768 768 0 blocks.11.ffn.receptance.weight\n", + "768 2688 0 blocks.11.ffn.value.weight\n", + "65536 768 0.5 head.weight\n" + ] + } + ], + "source": [ + "# Lets initialized the L6-D512 model with the init_model.py code\n", + "!cd \"{TRAINER_DIR}\" && python3 init_model.py \\\n", + " --n_layer 12 --n_embd 768 \\\n", + " --vocab_size world \\\n", + " --skip-if-exists --safe-init \\\n", + " ../model/L12-D768-world-init.pth" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.\n", + "Map (num_proc=16): 100%|█████| 1010499/1010499 [03:35<00:00, 4692.38 examples/s]\n", + "num_proc must be <= 1. 
Reducing num_proc to 1 for dataset of size 1.\n", + "Map: 100%|████████████████████████████████| 1/1 [00:00<00:00, 432.54 examples/s]\n", + "Map (num_proc=16): 100%|████| 2928070/2928070 [01:24<00:00, 34652.87 examples/s]\n", + "Saving the dataset (19/19 shards): 100%|█| 2928070/2928070 [00:19<00:00, 152003.\n", + "Saving the dataset (1/1 shards): : 0 examples [00:00, ? examples/s]\n" + ] + } + ], + "source": [ + "# Preload the dataset\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config/minipile-world-512.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-12-18 11:57:34,927] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--model.load_model=../model/L12-D768-world-init.pth'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--model.load_model=../model/L12-D768-world-init.pth'].\n", + "Seed set to 3941088705\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 16\n", + " - num_nodes: 1\n", + " - num_devices: 1\n", + " - microbatch_size: 16\n", + " - accumulate_grad_batches: 1\n", + " - effective_batch_size: 16\n", + "\n", + "[rank: 0] Seed set to 3941088705\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.1 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231218_115739-69qe82py\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33minfctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/69qe82py\u001b[0m\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/trainer-validaiton/infctx-v5-minipile-512 exists and is not empty.\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 6.000e-04 (0.0006)\n", + " - lr_final: 6.000e-05 (6e-05)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05427098274230957 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 50.3 M\n", + "1 | blocks | ModuleList | 92.1 M\n", + "2 | ln_out | LayerNorm | 1.5 K \n", + "3 | head | Linear | 50.3 M\n", + "--------------------------------------\n", + "192 M Trainable params\n", + "0 Non-trainable params\n", + "192 M Total params\n", + "771.232 Total estimated model params size (MB)\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:104: Total length of `DataLoader` across ranks is zero. Please make sure this was your intention.\n", + "Epoch 0: 1%| | 1000/183005 [02:20<7:07:18, 7.10it/s, v_num=82py, train/loss=5/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 40%|▍| 74044/183005 [2:55:32<4:18:19, 7.03it/s, v_num=82py, train/los" + ] + } + ], + "source": [ + "# Minipile training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config/minipile-world-512.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} (train-ctx=512, data-ctx=512, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --model.load_model=\"../model/L12-D768-world-init.pth\"\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rwkv-infctx", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9121adc9e7ac4c933ac2719a16fab4bee6b33dae Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 14:36:21 +0800 Subject: [PATCH 17/33] dropping the loss bias (tmp) --- RWKV-v5/src/model.py | 54 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 86cb7025..b090378f 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -803,34 +803,34 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): # should not be allowed num_devices = self.trainer.num_devices - ### --- - ### Positional loss bias handling - ### --- + # ### --- + # ### Positional loss bias handling + # ### 
--- - # Get the starting and ending loss bias - loss_bias_start = self.position_loss_bias - loss_bias_end = 2.0 - loss_bias_start - - # Skip loss bias calculation, if loss_bias_start is 1.0 - if loss_bias_start == 1.0 or (is_training_run == False and self.position_loss_bias_in_validation == False): - seq_mask = ori_seq_mask - else: - # Lets get the torch mask sum - total_mask_sum = torch.sum(ori_seq_mask) - - # Lets get a linear multiplier for the loss bias - # seq_mask_sum = torch.sum(ori_seq_mask) - bias_mask = torch.linspace(loss_bias_start, loss_bias_end, int(total_mask_sum.item()), device=ori_seq_mask.device) - - # Boolean flag of seq_mask > 0 - seq_mask_index = ori_seq_mask[0] > 0 - - # Apply the bias mask only to positive seq_mask values - final_mask = torch.zeros(ori_seq_mask.shape[1], device=ori_seq_mask.device) - final_mask[seq_mask_index] = ori_seq_mask[0][seq_mask_index] * bias_mask - - # And save it as seq_mask - seq_mask = final_mask.unsqueeze(0) + # # Get the starting and ending loss bias + # loss_bias_start = self.position_loss_bias + # loss_bias_end = 2.0 - loss_bias_start + + # # Skip loss bias calculation, if loss_bias_start is 1.0 + # if loss_bias_start == 1.0 or (is_training_run == False and self.position_loss_bias_in_validation == False): + # seq_mask = ori_seq_mask + # else: + # # Lets get the torch mask sum + # total_mask_sum = torch.sum(ori_seq_mask) + + # # Lets get a linear multiplier for the loss bias + # # seq_mask_sum = torch.sum(ori_seq_mask) + # bias_mask = torch.linspace(loss_bias_start, loss_bias_end, int(total_mask_sum.item()), device=ori_seq_mask.device) + + # # Boolean flag of seq_mask > 0 + # seq_mask_index = ori_seq_mask[0] > 0 + + # # Apply the bias mask only to positive seq_mask values + # final_mask = torch.zeros(ori_seq_mask.shape[1], device=ori_seq_mask.device) + # final_mask[seq_mask_index] = ori_seq_mask[0][seq_mask_index] * bias_mask + + # # And save it as seq_mask + # seq_mask = final_mask.unsqueeze(0) ### --- ### Training cutoff logic handling From 7e2278a6c4084e6ffc8815a692f379d33db2d669 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 15:27:35 +0800 Subject: [PATCH 18/33] wip loss calc tweak --- RWKV-v5/src/model.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index b090378f..de823d83 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -832,6 +832,9 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): # # And save it as seq_mask # seq_mask = final_mask.unsqueeze(0) + # Since we are no longer doing positional loss above, use seq_mask directly + seq_mask = ori_seq_mask + ### --- ### Training cutoff logic handling ### --- @@ -884,7 +887,7 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): return 0 # Checkpoint steps - def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, + def checkpointed_step(idx, targets, mask, last_shift_states, last_wkv_states, prev_steps): logits, new_shift_states, new_wkv_states = self( idx, last_shift_states, last_wkv_states) @@ -895,18 +898,26 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, targets = targets.contiguous() mask = mask.contiguous() - loss = F.cross_entropy(logits.view(-1, logits.size(-1)), + # Compute the token loss + token_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction="none") - - submask = mask.view(-1)[:loss.shape[0]] + submask = 
mask.view(-1)[:token_loss.shape[0]] submask_sum = torch.sum(submask) - loss = torch.sum(loss * submask) / total_mask_sum - loss = L2Wrap.apply(loss, logits, total_mask_sum, submask) + # The training loss to use + train_loss = torch.sum(token_loss * submask) / total_mask_sum + + # # Sample loss, without backprop + # sample_loss = torch.sum(token_loss * submask) / total_mask_sum + + + + segment_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) + + new_steps = prev_steps + submask_sum - new_loss = prev_loss + loss - return new_loss, new_shift_states, new_wkv_states, new_steps + return segment_loss, new_shift_states, new_wkv_states, new_steps total_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() steps = 0 @@ -1056,7 +1067,6 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, cur_idx, cur_tar, cur_msk, - torch.tensor(0, dtype=self.emb.weight.dtype, device=cur_device).requires_grad_(True), prv_shift_states, prv_wkv_states, steps, @@ -1067,7 +1077,7 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, # segment_loss_arr[i] = segment_loss # Perform the backward pass accordingly, for valid segments (besides the last segment) - # In this version, we do backward passes together the forward passes in the main segment loop + # In this version, we do backward passes together with the forward passes in the main segment loop # Instead of after all segment losses are computed if i >= start_learning_segment and i < start_learning_segment + backward_segment_count: # The learning loss, should be normalized against the accumulation steps @@ -1133,26 +1143,25 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, segment_size = self.ctx_len for i in range(segment_count): if i < segment_count-1 and is_training_run: - total_loss, new_shift_states, new_wkv_states, steps = deepspeed_checkpoint( + segment_loss, new_shift_states, new_wkv_states, steps = deepspeed_checkpoint( checkpointed_step, idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], - total_loss, states.shift_states, states.wkv_states, steps, ) else: - total_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( + segment_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], - total_loss, states.shift_states, states.wkv_states, steps, ) + total_loss = total_loss + segment_loss states = BlockStateList(new_shift_states, new_wkv_states) gc.collect() From 1239547492399b73ea17c8cdecd68c9682af328e Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 16:02:13 +0800 Subject: [PATCH 19/33] refactoring loss handling --- RWKV-v5/src/model.py | 121 ++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 65 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index de823d83..9a9676d5 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -888,7 +888,7 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): # Checkpoint steps def checkpointed_step(idx, targets, mask, last_shift_states, - last_wkv_states, prev_steps): + last_wkv_states): logits, new_shift_states, new_wkv_states = self( idx, last_shift_states, last_wkv_states) @@ -903,29 +903,33 @@ def checkpointed_step(idx, targets, mask, 
last_shift_states, targets.view(-1), reduction="none") submask = mask.view(-1)[:token_loss.shape[0]] - submask_sum = torch.sum(submask) # The training loss to use train_loss = torch.sum(token_loss * submask) / total_mask_sum + train_token_count = torch.sum(submask) - # # Sample loss, without backprop - # sample_loss = torch.sum(token_loss * submask) / total_mask_sum + # Sample loss, without backprop + sample_loss = torch.sum(token_loss * submask) / total_mask_sum + # L2Wrap for the backprop process + segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) + # Return the checkpoint values + return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count - segment_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) - - - new_steps = prev_steps + submask_sum - return segment_loss, new_shift_states, new_wkv_states, new_steps - - total_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() - steps = 0 + # Initialize the states, and compute the segment count states = BlockStateList.create(self.n_layer, B, C, self.n_head, self.head_size, seq.device, self.emb.weight.dtype) segment_count = math.ceil(T / self.ctx_len) + # Initialize the training loss, and the token count + training_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() + training_tokens = 0 + + # Raw sample loss (before selective token training) + sampling_loss = 0 + ### --- ### Learning process logic (BPTT or not) ### --- @@ -1063,13 +1067,12 @@ def checkpointed_step(idx, targets, mask, last_shift_states, cur_msk = dummy_2d_zero # Segmented learning, applies the forward/pass over each chunk seperately - segment_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( + segment_sample_loss, segment_train_loss, new_shift_states, new_wkv_states, segment_train_tokens = checkpointed_step( cur_idx, cur_tar, cur_msk, prv_shift_states, - prv_wkv_states, - steps, + prv_wkv_states ) states = BlockStateList(new_shift_states, new_wkv_states) @@ -1079,90 +1082,68 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # Perform the backward pass accordingly, for valid segments (besides the last segment) # In this version, we do backward passes together with the forward passes in the main segment loop # Instead of after all segment losses are computed + # + # In the past, we have implemented to do all forward, and all backwards. 
But this was found to be "slow" if i >= start_learning_segment and i < start_learning_segment + backward_segment_count: # The learning loss, should be normalized against the accumulation steps # as we are bypassing the pytorch lightning normalization # https://lightning.ai/docs/pytorch/2.0.4/common/lightning_module.html#backward - learning_loss = segment_loss / gradient_accumulation_steps + learning_loss = segment_train_loss / gradient_accumulation_steps # Perform the backward pass accordingly, for valid segments (besides the last segment) if i == start_learning_segment + backward_segment_count - 1: # This is the last backward pass, we let the default pytorch lightning handle the backward pass # and return the segment loss as part of the total loss - total_loss = total_loss + segment_loss + training_loss = training_loss + segment_train_loss else: # Undocumented multiple backward pass support # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 self.manual_backward(learning_loss, optimizer, retain_graph=True) # Accumulate without gradient, as we already did the backward pass - total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) + training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) else: # Even if its not the segments we use for backward pass, we still need to accumulate the loss - total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) + training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) + # Add token count and raw sampling loss + training_tokens = training_tokens + segment_train_tokens + sampling_loss = sampling_loss + segment_sample_loss + # GC collect unused memory # gc.collect() # torch.cuda.empty_cache() - - # # Lets backpass the respective segments, in reverse - # # (including dummy backpass) - # for i in range(forward_segment_count-1, -1, -1): - # # Get the segment loss - # segment_loss = segment_loss_arr[i] - # - # # Compute the backward pass for the segment - # if i >= start_learning_segment and i < start_learning_segment + backward_segment_count: - # # The learning loss, should be normalized against the accumulation steps - # # as we are bypassing the pytorch lightning normalization - # # https://lightning.ai/docs/pytorch/2.0.4/common/lightning_module.html#backward - # learning_loss = segment_loss / gradient_accumulation_steps - # - # # Perform the backward pass accordingly, for valid segments (besides the start_learning_segment) - # if i > start_learning_segment: - # # Undocumented multiple backward pass support - # # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 - # self.manual_backward(learning_loss, optimizer, retain_graph=True) - # - # # Accumulate without gradient, as we already did the backward pass - # total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) - # else: - # # This is the last backward pass, we let the default pytorch lightning handle the backward pass - # # and return the segment loss as part of the total loss - # total_loss = total_loss + segment_loss - # else: - # # Even if its not the segments we use for backward pass, we still need to accumulate the loss - # total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) - # - # # GC collect unused memory - # gc.collect() - # # torch.cuda.empty_cache() 
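
[Editor's illustration] The BPTT branch above interleaves one backward pass per learning segment with the forward passes, normalising each segment loss by the gradient-accumulation count before accumulating a detached copy for reporting. A minimal, framework-free sketch of that pattern follows; the model(...) call signature, the state handling, and the plain .backward() are illustrative stand-ins (not the trainer's actual API) for checkpointed_step and Lightning's manual_backward:

import torch

def bptt_segments(model, chunks, grad_accum_steps, learn_start, learn_count):
    # Illustrative only: `model` is assumed to return (loss, state) for one chunk.
    total_loss = torch.tensor(0.0)
    state = None
    for i, (x, y, mask) in enumerate(chunks):
        # Forward one segment, carrying the recurrent state across chunks
        loss, state = model(x, y, mask, state)
        if learn_start <= i < learn_start + learn_count:
            # Backward immediately; normalise manually, since the framework's
            # automatic gradient-accumulation scaling is being bypassed.
            # retain_graph keeps earlier segment graphs alive for the state chain.
            (loss / grad_accum_steps).backward(retain_graph=True)
        # Accumulate a detached copy either way, purely for loss reporting.
        total_loss = total_loss + loss.detach()
    return total_loss

The real implementation additionally wraps each segment in a DeepSpeed checkpoint and threads the shift/WKV states between chunks, which this sketch leaves out.
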
else: + # # Normal operations without BPTT + # segment_size = self.ctx_len for i in range(segment_count): if i < segment_count-1 and is_training_run: - segment_loss, new_shift_states, new_wkv_states, steps = deepspeed_checkpoint( + segment_sample_loss, segment_train_loss, new_shift_states, new_wkv_states, segment_train_tokens = deepspeed_checkpoint( checkpointed_step, idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], states.shift_states, - states.wkv_states, - steps, + states.wkv_states ) else: - segment_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( + segment_sample_loss, segment_train_loss, new_shift_states, new_wkv_states, segment_train_tokens = checkpointed_step( idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], states.shift_states, - states.wkv_states, - steps, + states.wkv_states ) - total_loss = total_loss + segment_loss + + # Add them up + training_loss = training_loss + segment_train_loss + training_tokens = training_tokens + segment_train_tokens + sampling_loss = sampling_loss + segment_sample_loss + # Update the states states = BlockStateList(new_shift_states, new_wkv_states) gc.collect() # torch.cuda.empty_cache() @@ -1171,24 +1152,34 @@ def checkpointed_step(idx, targets, mask, last_shift_states, if wandb.run is not None and is_training_run: global_rank = self.global_rank global_device_count = self.trainer.num_devices * self.trainer.num_nodes + microbatch_size = self.trainer.microbatch_size # Get the total dataset context length batch_ctx_len = 0 if "data_ctx_len" in batch: batch_ctx_len = torch.sum(batch["data_ctx_len"]).item() else: - batch_ctx_len = T * self.trainer.microbatch_size + batch_ctx_len = T * microbatch_size # Increment the counting tokens, and log it accordingly self._counting_tokens += batch_ctx_len # Log the line values wandb.log({ - 'global_rank': global_rank, - 'data_ctx_len': batch_ctx_len / self.trainer.microbatch_size, - 'train/loss': total_loss, + # The original loss and ctx_len (averaged by batch size) + 'train/loss': sampling_loss, + 'train/ctx_len': batch_ctx_len / microbatch_size, + + # The selective training tokens, and loss + 'train/tokens': training_tokens / microbatch_size, + 'train/sel_loss': training_loss, + + # Perf tracking f'perf/tokens_total.gpu.{global_rank}': self._counting_tokens, f'perf/tokens_per_sec.gpu.{global_rank}': self._counting_tokens / max(time.time() - self._counting_time_start, 1), + + # Step and trainer tracking + 'global_rank': global_rank, 'substep': (batch_idx * global_device_count + global_rank), 'trainer/global_step':self.global_step, 'trainer/learning_rate': self.trainer.optimizers[0].param_groups[0]['lr'], @@ -1196,8 +1187,8 @@ def checkpointed_step(idx, targets, mask, last_shift_states, }) # Throw if total loss is NaN - assert not torch.isnan(total_loss), "total_loss is NaN" - return total_loss + assert not torch.isnan(training_loss), "training_loss is NaN" + return training_loss # # Training and validation steps From d88c16aa933cc0b48620590fc75fd2fa8d96ed11 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 16:19:34 +0800 Subject: [PATCH 20/33] wip sel loss implementation --- RWKV-v5/src/model.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 
9a9676d5..bc93e907 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -904,13 +904,14 @@ def checkpointed_step(idx, targets, mask, last_shift_states, reduction="none") submask = mask.view(-1)[:token_loss.shape[0]] + # Sample loss, without backprop + with torch.no_grad(): + sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False) + # The training loss to use train_loss = torch.sum(token_loss * submask) / total_mask_sum train_token_count = torch.sum(submask) - # Sample loss, without backprop - sample_loss = torch.sum(token_loss * submask) / total_mask_sum - # L2Wrap for the backprop process segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) @@ -1090,18 +1091,13 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # https://lightning.ai/docs/pytorch/2.0.4/common/lightning_module.html#backward learning_loss = segment_train_loss / gradient_accumulation_steps - # Perform the backward pass accordingly, for valid segments (besides the last segment) - if i == start_learning_segment + backward_segment_count - 1: - # This is the last backward pass, we let the default pytorch lightning handle the backward pass - # and return the segment loss as part of the total loss - training_loss = training_loss + segment_train_loss - else: - # Undocumented multiple backward pass support - # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 - self.manual_backward(learning_loss, optimizer, retain_graph=True) - - # Accumulate without gradient, as we already did the backward pass - training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) + # Undocumented multiple backward pass support + # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 + self.manual_backward(learning_loss, optimizer, retain_graph=True) + + # Accumulate without gradient, as we already did the backward pass + # This does mean, that a single backward pass is "wasted" at the end + training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) else: # Even if its not the segments we use for backward pass, we still need to accumulate the loss training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) @@ -1167,12 +1163,12 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # Log the line values wandb.log({ # The original loss and ctx_len (averaged by batch size) - 'train/loss': sampling_loss, 'train/ctx_len': batch_ctx_len / microbatch_size, + 'train/data_loss': sampling_loss, # The selective training tokens, and loss 'train/tokens': training_tokens / microbatch_size, - 'train/sel_loss': training_loss, + 'train/loss': training_loss, # Perf tracking f'perf/tokens_total.gpu.{global_rank}': self._counting_tokens, From 5fcde26a1cfde53e687eabd1b41aea65241f6cd2 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 16:35:36 +0800 Subject: [PATCH 21/33] selective token loss threshold initial implementation --- RWKV-v5/src/model.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index bc93e907..7b1c2b0a 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -193,9 +193,14 @@ def __init__(self, adam_eps: float = 1.0e-08, weight_decay: 
float = 0.01, warmup_steps: int = -1, + # loss bias start position_loss_bias: float = 1.0, position_loss_bias_in_validation: bool = False, + + # Selective loss settings + selective_token_loss_threshold: float = 1.0, + # Backprop settings grad_cp: bool = True, bptt_learning: bool = True, @@ -289,9 +294,10 @@ def __init__(self, print("====================================================================") self.bptt_truncated_learning = True - # Save the position loss params + # Save the position loss params, and selective loss settings self.position_loss_bias = position_loss_bias self.position_loss_bias_in_validation = position_loss_bias_in_validation + self.selective_token_loss_threshold = selective_token_loss_threshold dim_att = dim_att or n_embd dim_ffn = dim_ffn or int((n_embd * 3.5) // 32 * 32) @@ -904,16 +910,29 @@ def checkpointed_step(idx, targets, mask, last_shift_states, reduction="none") submask = mask.view(-1)[:token_loss.shape[0]] - # Sample loss, without backprop - with torch.no_grad(): - sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False) - - # The training loss to use - train_loss = torch.sum(token_loss * submask) / total_mask_sum - train_token_count = torch.sum(submask) + # Selective token loss logic + if self.selective_token_loss_threshold > 0.0: + + # Sample loss, without backprop + with torch.no_grad(): + sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False) + + # Selective loss gating + above_threshold = token_loss > self.selective_token_loss_threshold + train_mask = submask * above_threshold + + # The training loss to use + train_loss = torch.sum(token_loss * train_mask) / total_mask_sum + train_token_count = torch.sum(train_mask) + + else: + train_loss = torch.sum(token_loss * submask) / total_mask_sum + sample_loss = train_loss.clone().detach().requires_grad_(False) + train_token_count = torch.sum(submask) + train_mask = submask # L2Wrap for the backprop process - segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) + segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, train_mask) # Return the checkpoint values return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count From 8aa0779f7a3a8da45c8d2dc4039f42c1a08ca0eb Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 17:29:39 +0800 Subject: [PATCH 22/33] WIP data prefix mask --- RWKV-v5/src/data.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 368e918a..b5a0a984 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -268,6 +268,15 @@ def encodeTokens(x): conversation_enabled = True + # Apply the data_prefix_skip_mask to the given mask + # where relevent, and disables the training mask for the first X tokens + data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None + def apply_data_prefix_skip_mask(mask): + if data_prefix_skip_mask_enabled > 0: + for i in range(data_prefix_skip_mask_enabled): + mask[i] = 0 + return mask + # Maps the dataset record to the tokenized result # handles a wide variety of format according to the data configuration # @@ -375,7 +384,7 @@ def map_tokenizer(x): return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask + 'attention_mask': apply_data_prefix_skip_mask(attention_mask) } # Multi column merging support @@ -443,7 
+452,7 @@ def map_tokenizer(x): return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask + 'attention_mask': apply_data_prefix_skip_mask(attention_mask) } # Prompt completion support @@ -472,12 +481,17 @@ def map_tokenizer(x): return { 'input_ids': input_ids, 'token_type_ids': token_type_ids, - 'attention_mask': attention_mask, + 'attention_mask': apply_data_prefix_skip_mask(attention_mask), } # Fallback to standard text tokenization if 'text' in x: - return encodeTokens(x['text']) + ret = encodeTokens(x['text']) + return { + 'input_ids': ret['input_ids'], + 'token_type_ids': ret['token_type_ids'], + 'attention_mask': apply_data_prefix_skip_mask(ret['attention_mask']), + } raise ValueError('Invalid dataset format, must contain either the configured "multi column" or prompt/completion or text') @@ -902,6 +916,18 @@ def __init__( # prompt/completion format masking support disable_prompt_completion_mask: bool = False, + # ---------------------------- + # Selective loss training + # ---------------------------- + + # Prefix token masking + # + # The rationale behind this, is that the first X tokens should not be "backpropped" + # for any new training record. As its unfair to expect the model (or a human) make + # any resonable guesses at that stage. As such this is used to "mask" the first X tokens + # from the loss calculation, and thus not backpropped. + data_prefix_skip_mask: int = 0, + # ---------------------------- # dataset packing support # ---------------------------- From 979f961e6f07baba03f0501639655a1de6960a1b Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 17:36:26 +0800 Subject: [PATCH 23/33] experimental factoring --- RWKV-v5/src/model.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 7b1c2b0a..fb075eba 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -126,7 +126,7 @@ def forward(self, x, last_state: BlockState): class L2Wrap(torch.autograd.Function): @staticmethod - def forward(ctx, loss, y, token_amount, currentMask): + def forward(ctx, loss, y, factor, currentMask): # Currently (8th July 2023), save_for_backward, causes an issue with # pytorch.compile (see: https://github.com/pytorch/pytorch/blob/e600505e3209eaf539e8bc99870ea55236cefbf5/torch/_dynamo/variables/higher_order_ops.py#L735) # @@ -135,15 +135,13 @@ def forward(ctx, loss, y, token_amount, currentMask): # # See also: # - checkpointed_step - ctx.save_for_backward(y, token_amount, currentMask) + ctx.save_for_backward(y, factor, currentMask) return loss @staticmethod def backward(ctx, grad_output): - y, token_amount, currentMask = ctx.saved_tensors + y, factor, currentMask = ctx.saved_tensors - # to encourage the logits to be close to 0 - factor = 1e-4 / token_amount maxx, ids = torch.max(y, -1, keepdim=True) gy = torch.zeros_like(y) gy.scatter_(-1, ids, maxx * factor) @@ -910,6 +908,10 @@ def checkpointed_step(idx, targets, mask, last_shift_states, reduction="none") submask = mask.view(-1)[:token_loss.shape[0]] + # to encourage the logits to be close to 0 + # factor_divisor is typically the total token count + L2Wrap_factor = 1e-4 / total_mask_sum + # Selective token loss logic if self.selective_token_loss_threshold > 0.0: @@ -925,6 +927,9 @@ def checkpointed_step(idx, targets, mask, last_shift_states, train_loss = torch.sum(token_loss * train_mask) / total_mask_sum train_token_count = torch.sum(train_mask) + # Adjust the factor accordingly 
+ L2Wrap_factor = L2Wrap_factor * (torch.sum(submask) / train_token_count) + else: train_loss = torch.sum(token_loss * submask) / total_mask_sum sample_loss = train_loss.clone().detach().requires_grad_(False) @@ -932,7 +937,7 @@ def checkpointed_step(idx, targets, mask, last_shift_states, train_mask = submask # L2Wrap for the backprop process - segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, train_mask) + segment_train_loss = L2Wrap.apply(train_loss, logits, L2Wrap_factor, train_mask) # Return the checkpoint values return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count From 370b06e5ca4b5464ea25d38e2b13c47c64e941ce Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 07:40:54 +0800 Subject: [PATCH 24/33] minor notebook reorg --- ...et-packing.ipynb => dataset-packing.ipynb} | 0 .../minipile-validation.ipynb | 199 ++++++------------ ...-length.ipynb => sort-offset-length.ipynb} | 0 3 files changed, 62 insertions(+), 137 deletions(-) rename notebook/trainer-v5-validation/{test-dataset-packing.ipynb => dataset-packing.ipynb} (100%) rename notebook/trainer-v5-validation/{test-sort-offset-length.ipynb => sort-offset-length.ipynb} (100%) diff --git a/notebook/trainer-v5-validation/test-dataset-packing.ipynb b/notebook/trainer-v5-validation/dataset-packing.ipynb similarity index 100% rename from notebook/trainer-v5-validation/test-dataset-packing.ipynb rename to notebook/trainer-v5-validation/dataset-packing.ipynb diff --git a/notebook/trainer-v5-validation/minipile-validation.ipynb b/notebook/trainer-v5-validation/minipile-validation.ipynb index 0810e624..07b1dc74 100644 --- a/notebook/trainer-v5-validation/minipile-validation.ipynb +++ b/notebook/trainer-v5-validation/minipile-validation.ipynb @@ -26,7 +26,7 @@ "output_type": "stream", "text": [ "ENABLE_WANDB: True\n", - "GPU_DEVICES: auto\n", + "GPU_DEVICES: 1\n", "NOTEBOOK_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation\n", "TRAINER_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer\n" @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -73,16 +73,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2023-12-18 07:39:15-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", - "Resolving huggingface.co (huggingface.co)... 13.33.33.55, 13.33.33.110, 13.33.33.102, ...\n", - "Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.\n", + "--2024-01-17 16:37:45-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.102, 13.33.33.20, 13.33.33.110, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.102|:443... connected.\n", "HTTP request sent, awaiting response... 
" ] }, @@ -91,21 +91,21 @@ "output_type": "stream", "text": [ "302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UycA%7Eo%7EEWgjN6kwZtAZSB6k5Nz7B5MQttQCeTVw5OD5T1lTLfhRIX3aFxwLTAyMDMOnWK0KGcnWfha6OcBl9%7EGTSfu408xpCk-PyW0E9W45m5fvR5FqLWgR41zakLePM0Ssu0Wb2syrSKCFElocrwluDvNykuHhUQgdhN9hutXENfd6qC8LZmn68eo-PlqIh6ka8sFyfJa-Bteb3mT1SAPmW19if1jiwcWmtFrB-HrdVtrxrGf033MkimToaxtDR310VEkdYmVnwaPSRcd4Hkfc2CR%7Emdd%7Eg-nzfMERz7Qh2CM%7EV6KBEOB%7EfX2fXXI8mTPVJNqxcIw23ZBEhArczmQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2023-12-18 07:39:15-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UycA%7Eo%7EEWgjN6kwZtAZSB6k5Nz7B5MQttQCeTVw5OD5T1lTLfhRIX3aFxwLTAyMDMOnWK0KGcnWfha6OcBl9%7EGTSfu408xpCk-PyW0E9W45m5fvR5FqLWgR41zakLePM0Ssu0Wb2syrSKCFElocrwluDvNykuHhUQgdhN9hutXENfd6qC8LZmn68eo-PlqIh6ka8sFyfJa-Bteb3mT1SAPmW19if1jiwcWmtFrB-HrdVtrxrGf033MkimToaxtDR310VEkdYmVnwaPSRcd4Hkfc2CR%7Emdd%7Eg-nzfMERz7Qh2CM%7EV6KBEOB%7EfX2fXXI8mTPVJNqxcIw23ZBEhArczmQ__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 
13.33.88.54, 13.33.88.84, 13.33.88.7, ...\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1705739865&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Ca7T7yGaEKb-yz%7EGD34kXCNxrNYrwXXHAs9RwlCecKC9pUblLUPsz2wa1B-tAwJPnf3mjI8aBvhOpqsfeCg4oqM0TBWgwpHRxj%7E1bn8vjZRjYABwsTElLV-Z3rwgtVFKFCxtNQW1WWnf4AZmMDW8mqWjep48Y2-Mw6OzyZ3dWz6pOgA9%7E1osoqHjnZewkRB5RocVgOioqHAZRBc1mrqBd6yy%7E0oBixxb8pXzVOzU-J7JflEZBfvt2vGpuVNzOaYiwcAP7FOiWiFCBHjjWzeGYzcESofs%7E9%7EgALGuLQHGR8NGOZRlA4TvorBZIsd-V2abC1oO05yq8IRo5JmlCot6VQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-17 16:37:45-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1705739865&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Ca7T7yGaEKb-yz%7EGD34kXCNxrNYrwXXHAs9RwlCecKC9pUblLUPsz2wa1B-tAwJPnf3mjI8aBvhOpqsfeCg4oqM0TBWgwpHRxj%7E1bn8vjZRjYABwsTElLV-Z3rwgtVFKFCxtNQW1WWnf4AZmMDW8mqWjep48Y2-Mw6OzyZ3dWz6pOgA9%7E1osoqHjnZewkRB5RocVgOioqHAZRBc1mrqBd6yy%7E0oBixxb8pXzVOzU-J7JflEZBfvt2vGpuVNzOaYiwcAP7FOiWiFCBHjjWzeGYzcESofs%7E9%7EgALGuLQHGR8NGOZRlA4TvorBZIsd-V2abC1oO05yq8IRo5JmlCot6VQ__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.33.88.54, 13.33.88.62, 13.33.88.7, ...\n", "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.33.88.54|:443... connected.\n", "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", "\n", " The file is already fully retrieved; nothing to do.\n", "\n", - "--2023-12-18 07:39:15-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", - "Resolving huggingface.co (huggingface.co)... 13.33.33.20, 13.33.33.102, 13.33.33.110, ...\n", - "Connecting to huggingface.co (huggingface.co)|13.33.33.20|:443... connected.\n", + "--2024-01-17 16:37:46-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.55, 13.33.33.110, 13.33.33.20, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.\n", "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=iDC3gWLKONw98DKGci%7ECza5tw-GGam9Yzp2u-tzqIr8SbJm%7EAWlT8QTLEiof9HrYmViwjTxt7ccXAk1m0Y0h4RchnE3xV1kCcAmCd0i%7EYAn4beKa7SvTgUKETCWGax382LNRM-pFC81TOmrbCPKbMsQKIiKIHCZ6aSjWd%7E-cqNSWs8VhL2Zs9ACnYFQXK%7E%7EOuTklP53PG0BpAfa7IGNxMyLYqQVr%7EzFd2UQAIgqpB2otxphl-e526oYIIun0jb6zcer8Qe93kG4S9O%7ETCKYBYwa2DNEYeeJZT0PAzKQrtbLDHn3LRm%7ES-uit6k-ReRDRJNEwwLsrXo9afWtn%7E9DjxA__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2023-12-18 07:39:15-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=iDC3gWLKONw98DKGci%7ECza5tw-GGam9Yzp2u-tzqIr8SbJm%7EAWlT8QTLEiof9HrYmViwjTxt7ccXAk1m0Y0h4RchnE3xV1kCcAmCd0i%7EYAn4beKa7SvTgUKETCWGax382LNRM-pFC81TOmrbCPKbMsQKIiKIHCZ6aSjWd%7E-cqNSWs8VhL2Zs9ACnYFQXK%7E%7EOuTklP53PG0BpAfa7IGNxMyLYqQVr%7EzFd2UQAIgqpB2otxphl-e526oYIIun0jb6zcer8Qe93kG4S9O%7ETCKYBYwa2DNEYeeJZT0PAzKQrtbLDHn3LRm%7ES-uit6k-ReRDRJNEwwLsrXo9afWtn%7E9DjxA__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 
13.33.88.54, 13.33.88.84, 13.33.88.7, ...\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1705739866&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=P90Ai0b76ySfWvt6dvtE2GVikpK-iG9tV1nPlNVKuj52n%7E2XxWBprGEOZ%7EUUK-WSakjaXqum1VlF8WfSB-HtsEbYLG4eWf5oIp2hFDtOZ1u5vxT6q1YaN3FTksCYAemZCYk3rAkyvucmjucOSmbt48eFgBovvQDKdazqtciuU6TQn0eQdxyo7YDY5VMXk8kDitYEjAZKrxxX28PuLV4h9hJxocQnWbDuSp4o7%7E1kih%7EIucA1cECAKfT4f8vUL3O9BGCh5FRb3xSdCyp5FnWrtnrj0eBk%7EyYgUSJziXXc-ZL9ExIdr2xFqVqzCrt3YIiR6uK8U5q6CD9GQbAnoVoquA__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-17 16:37:46-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1705739866&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=P90Ai0b76ySfWvt6dvtE2GVikpK-iG9tV1nPlNVKuj52n%7E2XxWBprGEOZ%7EUUK-WSakjaXqum1VlF8WfSB-HtsEbYLG4eWf5oIp2hFDtOZ1u5vxT6q1YaN3FTksCYAemZCYk3rAkyvucmjucOSmbt48eFgBovvQDKdazqtciuU6TQn0eQdxyo7YDY5VMXk8kDitYEjAZKrxxX28PuLV4h9hJxocQnWbDuSp4o7%7E1kih%7EIucA1cECAKfT4f8vUL3O9BGCh5FRb3xSdCyp5FnWrtnrj0eBk%7EyYgUSJziXXc-ZL9ExIdr2xFqVqzCrt3YIiR6uK8U5q6CD9GQbAnoVoquA__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.33.88.54, 13.33.88.62, 13.33.88.7, ...\n", "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.33.88.54|:443... connected.\n", "HTTP request sent, awaiting response... 
416 Requested Range Not Satisfiable\n", "\n", @@ -117,20 +117,19 @@ "source": [ "# Download the minipile files\n", "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.idx https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", - "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.bin https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", - "\n" + "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.bin https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2023-12-18 07:32:33,143] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-01-17 16:41:50,714] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", "---- Initializing model ----\n", "No of layers: 12\n", @@ -140,115 +139,7 @@ "Emb scale: 0.0001\n", "Note: this process takes a significant time (and ram) for large models\n", "---- ----- ----\n", - "---\n", - "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", - "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", - "Detected CUDA files, patching ldflags\n", - "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", - "Building extension module wkv5...\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "ninja: no work to do.\n", - "Loading extension module wkv5...\n", - "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", - "---\n", - "65536 768 -0.0001 emb.weight\n", - "768 768 1.0 blocks.0.att.receptance.weight\n", - "768 768 1.0 blocks.0.att.key.weight\n", - "768 768 1.0 blocks.0.att.value.weight\n", - "768 768 0 blocks.0.att.output.weight\n", - "768 768 1.0 blocks.0.att.gate.weight\n", - "2688 768 1.0 blocks.0.ffn.key.weight\n", - "768 768 0 blocks.0.ffn.receptance.weight\n", - "768 2688 0 blocks.0.ffn.value.weight\n", - "768 768 1.0 blocks.1.att.receptance.weight\n", - "768 768 1.0 blocks.1.att.key.weight\n", - "768 768 1.0 blocks.1.att.value.weight\n", - "768 768 0 blocks.1.att.output.weight\n", - "768 768 1.0 blocks.1.att.gate.weight\n", - "2688 768 1.0 blocks.1.ffn.key.weight\n", - "768 768 0 blocks.1.ffn.receptance.weight\n", - "768 2688 0 blocks.1.ffn.value.weight\n", - "768 768 1.0 blocks.2.att.receptance.weight\n", - "768 768 1.0 blocks.2.att.key.weight\n", - "768 768 1.0 blocks.2.att.value.weight\n", - "768 768 0 blocks.2.att.output.weight\n", - "768 768 1.0 blocks.2.att.gate.weight\n", - "2688 768 1.0 blocks.2.ffn.key.weight\n", - "768 768 0 blocks.2.ffn.receptance.weight\n", - "768 2688 0 blocks.2.ffn.value.weight\n", - "768 768 1.0 blocks.3.att.receptance.weight\n", - "768 768 1.0 blocks.3.att.key.weight\n", - "768 768 1.0 blocks.3.att.value.weight\n", - "768 768 0 blocks.3.att.output.weight\n", - "768 768 1.0 blocks.3.att.gate.weight\n", - "2688 768 1.0 blocks.3.ffn.key.weight\n", - "768 768 0 blocks.3.ffn.receptance.weight\n", - "768 2688 0 blocks.3.ffn.value.weight\n", - "768 768 1.0 blocks.4.att.receptance.weight\n", - "768 768 1.0 blocks.4.att.key.weight\n", - "768 768 1.0 
blocks.4.att.value.weight\n", - "768 768 0 blocks.4.att.output.weight\n", - "768 768 1.0 blocks.4.att.gate.weight\n", - "2688 768 1.0 blocks.4.ffn.key.weight\n", - "768 768 0 blocks.4.ffn.receptance.weight\n", - "768 2688 0 blocks.4.ffn.value.weight\n", - "768 768 1.0 blocks.5.att.receptance.weight\n", - "768 768 1.0 blocks.5.att.key.weight\n", - "768 768 1.0 blocks.5.att.value.weight\n", - "768 768 0 blocks.5.att.output.weight\n", - "768 768 1.0 blocks.5.att.gate.weight\n", - "2688 768 1.0 blocks.5.ffn.key.weight\n", - "768 768 0 blocks.5.ffn.receptance.weight\n", - "768 2688 0 blocks.5.ffn.value.weight\n", - "768 768 1.0 blocks.6.att.receptance.weight\n", - "768 768 1.0 blocks.6.att.key.weight\n", - "768 768 1.0 blocks.6.att.value.weight\n", - "768 768 0 blocks.6.att.output.weight\n", - "768 768 1.0 blocks.6.att.gate.weight\n", - "2688 768 1.0 blocks.6.ffn.key.weight\n", - "768 768 0 blocks.6.ffn.receptance.weight\n", - "768 2688 0 blocks.6.ffn.value.weight\n", - "768 768 1.0 blocks.7.att.receptance.weight\n", - "768 768 1.0 blocks.7.att.key.weight\n", - "768 768 1.0 blocks.7.att.value.weight\n", - "768 768 0 blocks.7.att.output.weight\n", - "768 768 1.0 blocks.7.att.gate.weight\n", - "2688 768 1.0 blocks.7.ffn.key.weight\n", - "768 768 0 blocks.7.ffn.receptance.weight\n", - "768 2688 0 blocks.7.ffn.value.weight\n", - "768 768 1.0 blocks.8.att.receptance.weight\n", - "768 768 1.0 blocks.8.att.key.weight\n", - "768 768 1.0 blocks.8.att.value.weight\n", - "768 768 0 blocks.8.att.output.weight\n", - "768 768 1.0 blocks.8.att.gate.weight\n", - "2688 768 1.0 blocks.8.ffn.key.weight\n", - "768 768 0 blocks.8.ffn.receptance.weight\n", - "768 2688 0 blocks.8.ffn.value.weight\n", - "768 768 1.0 blocks.9.att.receptance.weight\n", - "768 768 1.0 blocks.9.att.key.weight\n", - "768 768 1.0 blocks.9.att.value.weight\n", - "768 768 0 blocks.9.att.output.weight\n", - "768 768 1.0 blocks.9.att.gate.weight\n", - "2688 768 1.0 blocks.9.ffn.key.weight\n", - "768 768 0 blocks.9.ffn.receptance.weight\n", - "768 2688 0 blocks.9.ffn.value.weight\n", - "768 768 1.0 blocks.10.att.receptance.weight\n", - "768 768 1.0 blocks.10.att.key.weight\n", - "768 768 1.0 blocks.10.att.value.weight\n", - "768 768 0 blocks.10.att.output.weight\n", - "768 768 1.0 blocks.10.att.gate.weight\n", - "2688 768 1.0 blocks.10.ffn.key.weight\n", - "768 768 0 blocks.10.ffn.receptance.weight\n", - "768 2688 0 blocks.10.ffn.value.weight\n", - "768 768 1.0 blocks.11.att.receptance.weight\n", - "768 768 1.0 blocks.11.att.key.weight\n", - "768 768 1.0 blocks.11.att.value.weight\n", - "768 768 0 blocks.11.att.output.weight\n", - "768 768 1.0 blocks.11.att.gate.weight\n", - "2688 768 1.0 blocks.11.ffn.key.weight\n", - "768 768 0 blocks.11.ffn.receptance.weight\n", - "768 2688 0 blocks.11.ffn.value.weight\n", - "65536 768 0.5 head.weight\n" + "Model exists, skipping init_model\n" ] } ], @@ -288,16 +179,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2023-12-18 11:57:34,927] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-01-17 16:42:01,086] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", - "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from 
within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--model.load_model=../model/L12-D768-world-init.pth'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--model.load_model=../model/L12-D768-world-init.pth'].\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=1', '--model.load_model=../model/L12-D768-world-init.pth'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=1', '--model.load_model=../model/L12-D768-world-init.pth'].\n", "Seed set to 3941088705\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", @@ -330,14 +221,14 @@ "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.1 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.2 is available! 
To upgrade, please run:\n", "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.0\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231218_115739-69qe82py\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240117_164210-bvjhu7ex\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33minfctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test\u001b[0m\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/69qe82py\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/bvjhu7ex\u001b[0m\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/trainer-validaiton/infctx-v5-minipile-512 exists and is not empty.\n", "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", "#\n", @@ -360,7 +251,7 @@ "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", "ninja: no work to do.\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.05427098274230957 seconds\n", + "Time to load fused_adam op: 0.05117464065551758 seconds\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "Loading `train_dataloader` to estimate number of stepping batches.\n", @@ -377,9 +268,43 @@ "192 M Total params\n", "771.232 Total estimated model params size (MB)\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:104: Total length of `DataLoader` across ranks is zero. Please make sure this was your intention.\n", - "Epoch 0: 1%| | 1000/183005 [02:20<7:07:18, 7.10it/s, v_num=82py, train/loss=5/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + "Epoch 0: 1%| | 1000/183005 [02:24<7:19:45, 6.90it/s, v_num=u7ex, train/loss=5/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", - "Epoch 0: 40%|▍| 74044/183005 [2:55:32<4:18:19, 7.03it/s, v_num=82py, train/los" + "Epoch 0: 100%|█| 183005/183005 [7:26:54<00:00, 6.82it/s, v_num=u7ex, train/loss`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "Epoch 0: 100%|█| 183005/183005 [7:26:54<00:00, 6.82it/s, v_num=u7ex, train/loss\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: epoch ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 ▃▅▂▂▁▂▃▃▄▂▃▃▄▅▅▆▆▆▇▆▆▆▆▇▇▇▇▇▇▇▇█████████\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: substep ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/ctx_len ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/data_loss █▇▅▇▆▄▇▄▅▅▄▄▄▂▅▃▅▃▄▄▄▂▃▃▃▄▃▁▃▂▃▂▁▃▂▃▂▃▂▂\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss █▇▄▆▃▅▄▃▄▄▄▅▃▄▅▃▄▂▃▂▄▃▂▂▄▃▃▂▃▂▂▃▃▃▃▂▁▂▁▃\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/tokens █▇▅▇▆▄▇▄▆▅▄▃▄▃▅▄▆▄▆▄▅▃▃▄▄▄▄▁▃▃▃▃▁▅▃▄▂▃▃▄\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate ████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx 183004\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: epoch 0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank 0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 55909.00069\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 1499171840\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: substep 183004\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/ctx_len 192.0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/data_loss 2.9375\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 2.76562\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/tokens 127.375\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step 183004\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate 6e-05\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33minfctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/bvjhu7ex\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjExNjk3NDc5Mw==/version_details/v9\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240117_164210-bvjhu7ex/logs\u001b[0m\n" ] } ], diff --git a/notebook/trainer-v5-validation/test-sort-offset-length.ipynb b/notebook/trainer-v5-validation/sort-offset-length.ipynb similarity index 100% rename from notebook/trainer-v5-validation/test-sort-offset-length.ipynb 
rename to notebook/trainer-v5-validation/sort-offset-length.ipynb From 6ec9641369b5109517e8409296c6c18b1969aa4b Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 08:26:33 +0800 Subject: [PATCH 25/33] apply data prefix skip mask for rechunk text too --- RWKV-v5/src/data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index b5a0a984..ce30618e 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -272,8 +272,9 @@ def encodeTokens(x): # where relevent, and disables the training mask for the first X tokens data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None def apply_data_prefix_skip_mask(mask): - if data_prefix_skip_mask_enabled > 0: - for i in range(data_prefix_skip_mask_enabled): + mask_len = len(mask) + if data_prefix_skip_mask_enabled > 0 and mask_len: + for i in range(max(data_prefix_skip_mask_enabled, mask_len)): mask[i] = 0 return mask @@ -533,7 +534,7 @@ def rechunk_text(x): # with the newline token in between full_input_ids += x["input_ids"][i] + endOfDoc_tokenSet["input_ids"][0] full_token_type_ids += x["token_type_ids"][i] + endOfDoc_tokenSet["token_type_ids"][0] - full_attention_mask += x["attention_mask"][i] + endOfDoc_tokenSet["attention_mask"][0] + full_attention_mask += apply_data_prefix_skip_mask( x["attention_mask"][i] ) + endOfDoc_tokenSet["attention_mask"][0] # Total length, and sample count # note that thte "remainder" will be discarded @@ -554,7 +555,7 @@ def rechunk_text(x): # Push the sample to the output arrays out_input_ids.append(full_input_ids[start:end]) out_token_type_ids.append(full_token_type_ids[start:end]) - out_attention_mask.append(full_attention_mask[start:end]) + out_attention_mask.append(apply_data_prefix_skip_mask( full_attention_mask[start:end] )) # Prepare and return the output object ret = { From 8805a7372299f7cedfbf0a75878d2d8424055180 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 11:00:27 +0800 Subject: [PATCH 26/33] wip data masking tweak --- RWKV-v5/src/data.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index ce30618e..bd22786e 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -46,6 +46,19 @@ def prepare_data_static(**kargs): # ===================================================== + # Util functions + #-------------------------------- + + # Apply the data_prefix_skip_mask to the given mask + # where relevent, and disables the training mask for the first X tokens + data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None + def apply_data_prefix_skip_mask(mask): + mask_len = len(mask) + if data_prefix_skip_mask_enabled > 0 and mask_len: + for i in range(max(data_prefix_skip_mask_enabled, mask_len)): + mask[i] = 0 + return mask + # Special handling for binidx #-------------------------------- @@ -66,7 +79,7 @@ def gen(): yield { 'input_ids': tokens, 'token_type_ids': [0] * len(tokens), - 'attention_mask': [1] * len(tokens) + 'attention_mask': apply_data_prefix_skip_mask([1] * len(tokens)) } # Load the huggingface dataset from the generator @@ -268,16 +281,6 @@ def encodeTokens(x): conversation_enabled = True - # Apply the data_prefix_skip_mask to the given mask - # where relevent, and disables the training mask for the first X tokens - data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None - def apply_data_prefix_skip_mask(mask): - mask_len 
= len(mask) - if data_prefix_skip_mask_enabled > 0 and mask_len: - for i in range(max(data_prefix_skip_mask_enabled, mask_len)): - mask[i] = 0 - return mask - # Maps the dataset record to the tokenized result # handles a wide variety of format according to the data configuration # From cf43c2cf40d2f97cfc30d4b8b03a3199fbc3b9bd Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 11:15:09 +0800 Subject: [PATCH 27/33] fixing data masking --- RWKV-v5/src/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index bd22786e..9534b582 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -51,11 +51,11 @@ def prepare_data_static(**kargs): # Apply the data_prefix_skip_mask to the given mask # where relevent, and disables the training mask for the first X tokens - data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None + data_prefix_skip_mask_val = int(kargs["data_prefix_skip_mask"]) def apply_data_prefix_skip_mask(mask): mask_len = len(mask) - if data_prefix_skip_mask_enabled > 0 and mask_len: - for i in range(max(data_prefix_skip_mask_enabled, mask_len)): + if data_prefix_skip_mask_val > 0 and mask_len: + for i in range(max(data_prefix_skip_mask_val, mask_len)): mask[i] = 0 return mask @@ -1052,4 +1052,4 @@ def val_dataloader(self): batch_size=1, # Pinned in GPU memory pin_memory=True - ) + ) \ No newline at end of file From f5fdb89b56fa8ef8a87edbb225b3dcd039af3eea Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 11:19:06 +0800 Subject: [PATCH 28/33] skipped fully masked records --- RWKV-v5/src/data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 9534b582..b391a2df 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -583,6 +583,8 @@ def dataset_filter(x): return False if kargs["max_token_size"] > 0 and row_length > kargs["max_token_size"]: return False + if sum(x["attention_mask"]) <= 0: + return False return True src_dataset = src_dataset.filter(dataset_filter, num_proc=num_cpus) From d3b3f182581b71e7169987416ad16558b7746e47 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:16:55 +0800 Subject: [PATCH 29/33] Fixing microbatches --- RWKV-v5/src/model.py | 22 +- .../config/enwiki_10k-world-full.yaml | 265 ++++++++++ .../dataset-microbatch.ipynb | 469 ++++++++++++++++++ 3 files changed, 751 insertions(+), 5 deletions(-) create mode 100644 notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml create mode 100644 notebook/trainer-v5-validation/dataset-microbatch.ipynb diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index fb075eba..1d065584 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -911,9 +911,18 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # to encourage the logits to be close to 0 # factor_divisor is typically the total token count L2Wrap_factor = 1e-4 / total_mask_sum + + # Submask count + submask_count = torch.sum(submask) # Selective token loss logic - if self.selective_token_loss_threshold > 0.0: + if submask_count <= 0.0: + train_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() + sample_loss = train_loss.clone().detach().requires_grad_(False) + train_token_count = 0 + train_mask = submask + + elif self.selective_token_loss_threshold > 0.0: # Sample loss, without backprop with torch.no_grad(): @@ -928,16 +937,19 @@ def checkpointed_step(idx, targets, mask, 
last_shift_states, train_token_count = torch.sum(train_mask) # Adjust the factor accordingly - L2Wrap_factor = L2Wrap_factor * (torch.sum(submask) / train_token_count) + L2Wrap_factor = L2Wrap_factor * (submask_count / train_token_count) else: train_loss = torch.sum(token_loss * submask) / total_mask_sum sample_loss = train_loss.clone().detach().requires_grad_(False) - train_token_count = torch.sum(submask) + train_token_count = submask_count train_mask = submask - # L2Wrap for the backprop process - segment_train_loss = L2Wrap.apply(train_loss, logits, L2Wrap_factor, train_mask) + if train_loss <= 0.0: + segment_train_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() + else: + # L2Wrap for the backprop process + segment_train_loss = L2Wrap.apply(train_loss, logits, L2Wrap_factor, train_mask) # Return the checkpoint values return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count diff --git a/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml b/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml new file mode 100644 index 00000000..85b60f6a --- /dev/null +++ b/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml @@ -0,0 +1,265 @@ +# lightning.pytorch==2.0.2 +seed_everything: 3941088705 +trainer: + + # + # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload` + # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful + # for training LoRA on large models on a single GPU. + # + # In general you would want to use the following: + # + # - deepspeed_stage_1 : Each of your GPU has too much vram, and you do not know what to do + # + # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpu each with sufficient vram + # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu + # + # - deepspeed_stage_3 : Split up the model across multiple gpu, useful for large models, at a performance cost + # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost + # + # For more details see: + # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2 + # + strategy: deepspeed_stage_2_offload + + # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'infctx-v5-unit-test-baseline (train-ctx=4096, data-ctx=full)' + project: 'RWKV-infctx-unit-test' + tags: ['RWKV', 'infctx'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. 
+ # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: ../checkpoint/trainer-validaiton/infctx-v5-enwiki-10k-full + filename: null + + # Save the top/last K checkpoints + save_top_k: 3 + # Choose by the most recent checkpoints (step based) + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: false + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. + # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 100 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other settings, you can probably leave alone + verbose: false + auto_insert_metric_name: true + + ######################################## + ## Training run parameter settings + ######################################## + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Number of datasamples to train for each step, a data sample is considered + # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step" + # + # This decides the number of datasample, to learn together from, before backproping + # any weight changes at the end of the batch. + # + # Recommended to be a big enough number (like 128/256) where it prevents the training + # loss from flucuating in the process. But not too big of a number where the increased + # GPU vRAM / offloaded RAM usage will cause the training to crash. + # + # You are also recommended to configure this to a large enough number to fully utilize + # your GPU processing time %, and avoid idle time for the GPU between batches + # + # This number is divided by the number of GPUs, and nodes configured + # So if you have 4 GPUs, and 2 nodes, and this is configured as 128 + # Each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches + target_batch_size: 16 + +######################################## +## Training model settings +######################################## +model: + # Model to start the finetune/training process from + load_model: ../model/L24-D2048-world-v5base-init.pth + + # Context length to use for the training process + # the larger the number (and batch size) the larger the vram usage + # + # Note that if the datasample context length is larger then the ctx_len + # its training process would be split into ctx_len sized chunks. + # + # This allows the training of extreamly large context length (eg. 
100k), + # without eating up too much vram by keeping the training context length + # to a resonable number sutible to the current GPU setup + ctx_len: 4096 + + # Data samples would be cut down to the respective max ctx_len_cutoffs + # values if its larger then ctx_len. If the data sample is larger then + # the largest len_cutoff, the remaining data will be discarded + ctx_len_cutoffs: [] + # Experimental settings, number of tokens to skip in the data sample + # prefix, for the respective cutoff length. Used to speed up the process + ctx_len_warmup_steps: [] + + # Learning rate of the training process + # --- + + # Initia learning rate of the process + lr_init: 8e-4 + # Final learning rate after the learning rate period + # learning rate will stay at final value from then onwards + lr_final: 4e-4 + + # Number of epoch to reduce the learning rate from lr_init to lr_final + # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards) + # 0 means lr_final will apply immediately + # -1 means we take the current max_step / max_epoch as the period + lr_period: 1 + # lr_period type if its set, defaults to epoch + lr_period_type: epoch + + # Adam optimizer settings + # You probably want to leave this alone, unless you know what you are doing + beta1: 0.9 + beta2: 0.99 + adam_eps: 1.0e-08 + weight_decay: 0.01 + + # torch.set_float32_matmul_precision, used to optimize operations with tensor cores + # this should be set as null, for non cuda core GPUs + torch_set_float32_matmul_precision: 'high' + # torch_set_float32_matmul_precision: null + + # Segmented based learning, used to work around training of large context length + # beyond what can be supported by the current GPU vram architecture + # + # This is not 1:1 equivalent to the same training process with required vram + # as the training process is split into multiple segments, part by part. + # with limited learnings from the previous segment. + bptt_learning: true + + # Segmented range to performing backprop learning on + # 1 means to apply only for the last segment + # -1 means to apply for all segments + bptt_learning_range: -1 + +data: + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. 
+ data_path: ../datapath/enwiki_10k-world-4096/ + + # Other wise provide the source path, which is used as huggingface dataset path + # this will be used to populate the dataset_path + # + # Use either the following + # - hugging face dataset + # - Directory path to a directory containing dataset files + # - Path to a single dataset file + # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then) + # - null + # + # If source is disabled, all other params, except data_path, is ignored + source: "teven/enwiki_10k" + # source: text + # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt + + # Use data_dir, if you are using source=text/json/etc + # this should be relative to the trainer script path + source_data_dir: null + + # After loading the dataset, split out test data used for unit-test, + # This process is skipped if the dataset includes a test split + # This process is skipped if set to zero + test_split: 0.01 + test_split_shuffle: false + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the tokenizer file path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. removal of small articles of less then 512 tokens from wikipedia) + # + # This is ignored, if set to -1 + min_token_size: -1 + max_token_size: -1 + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 4096 + # + # This is ignored, if source is not set as text + # This is ignored, if set to zero + # --- + # text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: false + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. 
To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + # multi_column_keys: ['instruction', 'input', 'output'] + # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + +# Path to the current checkpoint to continue training from +# Enable this to the last checkpoint after the first run +# (if it crash and you want to resume) +# ckpt_path: ../checkpoint/trainer-validaiton/infctx-unit-test-baseline/epoch=0-step=20.ckpt +ckpt_path: null diff --git a/notebook/trainer-v5-validation/dataset-microbatch.ipynb b/notebook/trainer-v5-validation/dataset-microbatch.ipynb new file mode 100644 index 00000000..73ebc960 --- /dev/null +++ b/notebook/trainer-v5-validation/dataset-microbatch.ipynb @@ -0,0 +1,469 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset microbatch testing\n", + "\n", + "Testing runs on multiple micro batch settings" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: False\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation\n", + "TRAINER_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=False\n", + "WANDB_PREFIX=\"infctx-v5-microbatch\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-18 11:19:39,010] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "---- Initializing model ----\n", + "No of layers: 6\n", + "Embedding size: 512\n", + "Output model path: ../model/L6-D512-world-v5base-init.pth\n", + "Vocab size: 65536\n", + "Emb scale: 0.0001\n", + "Note: this process takes a significant time (and ram) for large models\n", + "---- ----- ----\n", + "Model exists, skipping init_model\n" + ] + } + ], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer 6 --n_embd 512 \\\n", + " --vocab_size world --skip-if-exists \\\n", + " 
\"../model/L6-D512-world-v5base-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Map (num_proc=16): 100%|█████████| 10000/10000 [00:01<00:00, 9575.14 examples/s]\n", + "Filter (num_proc=16): 100%|█████| 10000/10000 [00:00<00:00, 12203.75 examples/s]\n", + "Map (num_proc=16): 100%|██████████| 9892/9892 [00:00<00:00, 20646.21 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 241357.37 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28064.93 examples\n" + ] + } + ], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config/enwiki_10k-world-full.yaml\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# microbatch=1\n", + "\n", + "Note: We are intentionally testing without rechunk, as that has known edge case issues." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-18 12:00:55,830] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 1 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=1', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 1 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=1', '--trainer.devices=auto'].\n", + "Seed set to 3941088705\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 16\n", + " - num_nodes: 1\n", + " - num_devices: 1\n", + " - microbatch_size: 1\n", + " - accumulate_grad_batches: 16\n", + " - effective_batch_size: 16\n", + "\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 595479.80 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28472.64 examples\n", + "[rank: 0] Seed set to 3941088705\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/v5-enwiki-10k-full/ exists and is not empty.\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05255270004272461 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 33.6 M\n", + "1 | blocks | ModuleList | 20.5 M\n", + "2 | ln_out | LayerNorm | 1.0 K \n", + "3 | head | Linear | 33.6 M\n", + "--------------------------------------\n", + "87.6 M Trainable params\n", + "0 Non-trainable params\n", + "87.6 M Total params\n", + "350.405 Total estimated model params size (MB)\n", + "Epoch 0: 16%|▏| 1600/9892 [00:55<04:49, 28.62it/s, v_num=mu7h, train/loss=5.310/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|█| 9892/9892 [05:45<00:00, 28.62it/s, v_num=mu7h, train/loss=4.090\n", + "Validation: | | 0/? [00:00=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05469512939453125 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 33.6 M\n", + "1 | blocks | ModuleList | 20.5 M\n", + "2 | ln_out | LayerNorm | 1.0 K \n", + "3 | head | Linear | 33.6 M\n", + "--------------------------------------\n", + "87.6 M Trainable params\n", + "0 Non-trainable params\n", + "87.6 M Total params\n", + "350.405 Total estimated model params size (MB)\n", + "Epoch 0: 21%|▋ | 34/162 [00:19<01:12, 1.76it/s, v_num=oczk, train/loss=8.880]^C\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...\n" + ] + } + ], + "source": [ + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config/enwiki_10k-world-full.yaml\" \\\n", + " --model.load_model=\"../model/L6-D512-world-v5base-init.pth\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/v5-enwiki-10k-full/\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Microbatch 2 - (deepspeed_stage_1)\" \\\n", + " --trainer.strategy=\"deepspeed_stage_1\" \\\n", + " --trainer.microbatch_size=2 \\\n", + " --trainer.devices=\"{GPU_DEVICES}\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rwkv-infctx", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f24eb074ccbb390ceead13acd22c0d837a2347f3 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:22:28 +0800 Subject: [PATCH 30/33] disable selective token loss by default --- RWKV-v5/src/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 1d065584..f52d1d60 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -197,7 +197,7 @@ def __init__(self, position_loss_bias_in_validation: bool = False, # Selective loss settings - selective_token_loss_threshold: float = 1.0, + selective_token_loss_threshold: float = 0.0, # Backprop settings grad_cp: bool = True, From 25d1354dbe013d1df24e5cb64833b29f0033c97c Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:22:41 +0800 Subject: [PATCH 31/33] microbatch validation --- .../dataset-microbatch.ipynb | 357 +++++++++++++++++- 1 file changed, 338 insertions(+), 19 deletions(-) diff --git a/notebook/trainer-v5-validation/dataset-microbatch.ipynb b/notebook/trainer-v5-validation/dataset-microbatch.ipynb index 73ebc960..df25fda5 100644 --- a/notebook/trainer-v5-validation/dataset-microbatch.ipynb +++ b/notebook/trainer-v5-validation/dataset-microbatch.ipynb @@ -330,27 +330,230 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-18 12:11:50,734] [INFO] 
[real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=2', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=2', '--trainer.devices=auto'].\n", + "Seed set to 3941088705\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 16\n", + " - num_nodes: 1\n", + " - num_devices: 1\n", + " - microbatch_size: 2\n", + " - accumulate_grad_batches: 8\n", + " - effective_batch_size: 16\n", + "\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 530522.66 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28455.25 examples\n", + "[rank: 0] Seed set to 3941088705\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`.\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/v5-enwiki-10k-full/ exists and is not empty.\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05180692672729492 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 33.6 M\n", + "1 | blocks | ModuleList | 20.5 M\n", + "2 | ln_out | LayerNorm | 1.0 K \n", + "3 | head | Linear | 33.6 M\n", + "--------------------------------------\n", + "87.6 M Trainable params\n", + "0 Non-trainable params\n", + "87.6 M Total params\n", + "350.405 Total estimated model params size (MB)\n", + "Epoch 0: 16%|▏| 800/4946 [00:35<03:05, 22.41it/s, v_num=3o87, train/loss=5.060]/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|█| 4946/4946 [03:42<00:00, 22.19it/s, v_num=3o87, train/loss=5.720\n", + "Validation: | | 0/? 
[00:00\n", + " cli_main()\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 271, in cli_main\n", + " LightningCLI(\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 386, in __init__\n", + " self._run_subcommand(self.subcommand)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 677, in _run_subcommand\n", + " fn(**fn_kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 544, in fit\n", + " call._call_and_handle_interrupt(\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 43, in _call_and_handle_interrupt\n", + " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 102, in launch\n", + " return function(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 580, in _fit_impl\n", + " self._run(model, ckpt_path=ckpt_path)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 989, in _run\n", + " results = self._run_stage()\n", + " ^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 1035, in _run_stage\n", + " self.fit_loop.run()\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 202, in run\n", + " self.advance()\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 359, in advance\n", + " self.epoch_loop.run(self._data_fetcher)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 136, in run\n", + " self.advance(data_fetcher)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 240, in advance\n", + " batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 187, in run\n", + " self._optimizer_step(batch_idx, closure)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 265, in _optimizer_step\n", + " call._call_lightning_module_hook(\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 157, in _call_lightning_module_hook\n", + " output = fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/module.py\", line 1282, in optimizer_step\n", + " optimizer.step(closure=optimizer_closure)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py\", line 151, in step\n", + " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py\", line 264, in optimizer_step\n", + " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 230, in optimizer_step\n", + " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 123, in optimizer_step\n", + " closure_result = closure()\n", + " ^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 140, in __call__\n", + " self._result = self.closure(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/utils/_contextlib.py\", line 115, in decorate_context\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 126, in closure\n", + " step_output = self._step_fn()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 315, in _training_step\n", + " training_step_output = call._call_strategy_hook(trainer, \"training_step\", *kwargs.values())\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 309, in _call_strategy_hook\n", + " output = fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 381, in training_step\n", + " return self._forward_redirection(self.model, self.lightning_module, \"training_step\", *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 633, in __call__\n", + " wrapper_output = wrapper_module(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n", + " ret_val = func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1818, in forward\n", + " loss = self.module(*inputs, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 626, in wrapped_forward\n", + " out = method(*_args, **_kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1234, in training_step\n", + " total_loss = self.compute_loss(batch, batch_idx, True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1132, in compute_loss\n", + " self.manual_backward(learning_loss, optimizer, retain_graph=True)\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 779, in manual_backward\n", + " self.trainer.strategy.backward(loss, None, *args, **kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 204, in backward\n", + " self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 112, in backward\n", + " deepspeed_engine.backward(tensor, *args, **kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n", + " ret_val = func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1940, in backward\n", + " self.optimizer.backward(loss, retain_graph=retain_graph)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1953, in backward\n", + " self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/fp16/loss_scaler.py\", line 63, in backward\n", + " scaled_loss.backward(retain_graph=retain_graph)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_tensor.py\", line 492, in backward\n", + " torch.autograd.backward(\n", + " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/autograd/__init__.py\", line 251, in backward\n", + " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", + "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.69 GiB. GPU 0 has a total capacty of 22.16 GiB of which 1.03 GiB is free. Including non-PyTorch memory, this process has 21.09 GiB memory in use. Of the allocated memory 16.32 GiB is allocated by PyTorch, and 4.10 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n" ] } ], @@ -439,7 +758,7 @@ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/v5-enwiki-10k-full/\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Microbatch 2 - (deepspeed_stage_1)\" \\\n", " --trainer.strategy=\"deepspeed_stage_1\" \\\n", - " --trainer.microbatch_size=2 \\\n", + " --trainer.microbatch_size=4 \\\n", " --trainer.devices=\"{GPU_DEVICES}\"" ] } From 86808e9cde9181c057ac16ddaf0175bdd7908285 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:25:43 +0800 Subject: [PATCH 32/33] microbatch validation --- .../dataset-microbatch.ipynb | 245 +++++++++--------- 1 file changed, 116 insertions(+), 129 deletions(-) diff --git a/notebook/trainer-v5-validation/dataset-microbatch.ipynb b/notebook/trainer-v5-validation/dataset-microbatch.ipynb index df25fda5..cf45e52c 100644 --- a/notebook/trainer-v5-validation/dataset-microbatch.ipynb +++ b/notebook/trainer-v5-validation/dataset-microbatch.ipynb @@ -544,16 +544,16 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2024-01-18 12:17:38,330] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-01-18 12:18:03,234] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", - "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. 
To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=8', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=8', '--trainer.devices=auto'].\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=4', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=4', '--trainer.devices=auto'].\n", "Seed set to 3941088705\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", @@ -578,12 +578,12 @@ " - target_batch_size: 16\n", " - num_nodes: 1\n", " - num_devices: 1\n", - " - microbatch_size: 8\n", - " - accumulate_grad_batches: 2\n", + " - microbatch_size: 4\n", + " - accumulate_grad_batches: 4\n", " - effective_batch_size: 16\n", "\n", - "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 516553.02 examp\n", - "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28147.80 examples\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 583184.18 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28878.44 examples\n", "[rank: 0] Seed set to 3941088705\n", "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", "Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`.\n", @@ -609,7 +609,7 @@ "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", "ninja: no work to do.\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.05247139930725098 seconds\n", + "Time to load fused_adam op: 0.05039358139038086 seconds\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "Loading `train_dataloader` to estimate number of stepping batches.\n", @@ -625,127 +625,114 @@ "0 Non-trainable params\n", "87.6 M Total params\n", "350.405 Total estimated model params size (MB)\n", - "Epoch 0: 1%| | 18/1237 [00:05<06:23, 3.18it/s, v_num=rhl5, train/loss=8.250]Traceback (most recent call last):\n", - " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 296, in \n", - " cli_main()\n", - " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 271, in cli_main\n", - " LightningCLI(\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 386, in __init__\n", - " self._run_subcommand(self.subcommand)\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 677, in _run_subcommand\n", - " fn(**fn_kwargs)\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 544, in fit\n", - " call._call_and_handle_interrupt(\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 43, in _call_and_handle_interrupt\n", - " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 102, in launch\n", - " return function(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 580, in _fit_impl\n", - " self._run(model, ckpt_path=ckpt_path)\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 989, in _run\n", - " results = self._run_stage()\n", - " ^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 1035, in _run_stage\n", - " self.fit_loop.run()\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 202, in run\n", - " self.advance()\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 359, in advance\n", - " self.epoch_loop.run(self._data_fetcher)\n", - " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 136, in run\n", - " self.advance(data_fetcher)\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 240, in advance\n", - " batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 187, in run\n", - " self._optimizer_step(batch_idx, closure)\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 265, in _optimizer_step\n", - " call._call_lightning_module_hook(\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 157, in _call_lightning_module_hook\n", - " output = fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/module.py\", line 1282, in optimizer_step\n", - " optimizer.step(closure=optimizer_closure)\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py\", line 151, in step\n", - " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py\", line 264, in optimizer_step\n", - " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 230, in optimizer_step\n", - " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 123, in optimizer_step\n", - " closure_result = closure()\n", - " ^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 140, in __call__\n", - " self._result = self.closure(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/utils/_contextlib.py\", line 115, in decorate_context\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 126, in closure\n", - " step_output = self._step_fn()\n", - " ^^^^^^^^^^^^^^^\n", - " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 315, in _training_step\n", - " training_step_output = call._call_strategy_hook(trainer, \"training_step\", *kwargs.values())\n", - " 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 309, in _call_strategy_hook\n",
- " output = fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 381, in training_step\n",
- " return self._forward_redirection(self.model, self.lightning_module, \"training_step\", *args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 633, in __call__\n",
- " wrapper_output = wrapper_module(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n",
- " return self._call_impl(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n",
- " return forward_call(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n",
- " ret_val = func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1818, in forward\n",
- " loss = self.module(*inputs, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n",
- " return self._call_impl(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n",
- " return forward_call(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 626, in wrapped_forward\n",
- " out = method(*_args, **_kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1234, in training_step\n",
- " total_loss = self.compute_loss(batch, batch_idx, True)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1132, in compute_loss\n",
- " self.manual_backward(learning_loss, optimizer, retain_graph=True)\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 779, in manual_backward\n",
- " self.trainer.strategy.backward(loss, None, *args, **kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 204, in backward\n",
- " self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 112, in backward\n",
- " deepspeed_engine.backward(tensor, *args, **kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n",
- " ret_val = func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1940, in backward\n",
- " self.optimizer.backward(loss, retain_graph=retain_graph)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1953, in backward\n",
- " self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/fp16/loss_scaler.py\", line 63, in backward\n",
- " scaled_loss.backward(retain_graph=retain_graph)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_tensor.py\", line 492, in backward\n",
- " torch.autograd.backward(\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/autograd/__init__.py\", line 251, in backward\n",
- " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n",
- "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.69 GiB. GPU 0 has a total capacty of 22.16 GiB of which 1.03 GiB is free. Including non-PyTorch memory, this process has 21.09 GiB memory in use. Of the allocated memory 16.32 GiB is allocated by PyTorch, and 4.10 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n"
+ "Epoch 0: 16%|▏| 400/2473 [00:30<02:37, 13.12it/s, v_num=jp9a, train/loss=6.780]/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+ " warnings.warn(\n",
+ "Epoch 0: 100%|█| 2473/2473 [03:04<00:00, 13.41it/s, v_num=jp9a, train/loss=6.660\n",
+ "Validation: | | 0/? [00:00
Date: Thu, 18 Jan 2024 13:14:17 +0800
Subject: [PATCH 33/33] experimental token based dropout

---
 RWKV-v5/src/model.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py
index f52d1d60..912a5684 100644
--- a/RWKV-v5/src/model.py
+++ b/RWKV-v5/src/model.py
@@ -197,7 +197,8 @@ def __init__(self,
         position_loss_bias_in_validation: bool = False,

         # Selective loss settings
-        selective_token_loss_threshold: float = 0.0,
+        token_loss_threshold: float = 0.0,
+        token_dropout_rate: float = 0.0, # Dropout rate should be between 0-1

         # Backprop settings
         grad_cp: bool = True,
@@ -295,7 +296,8 @@ def __init__(self,
         # Save the position loss params, and selective loss settings
         self.position_loss_bias = position_loss_bias
         self.position_loss_bias_in_validation = position_loss_bias_in_validation
-        self.selective_token_loss_threshold = selective_token_loss_threshold
+        self.token_loss_threshold = token_loss_threshold
+        self.token_dropout_rate = token_dropout_rate

         dim_att = dim_att or n_embd
         dim_ffn = dim_ffn or int((n_embd * 3.5) // 32 * 32)
@@ -922,15 +924,24 @@ def checkpointed_step(idx, targets, mask, last_shift_states,
                 train_token_count = 0
                 train_mask = submask

-            elif self.selective_token_loss_threshold > 0.0:
+            elif self.token_loss_threshold > 0.0 or self.token_dropout_rate > 0.0:

                 # Sample loss, without backprop
                 with torch.no_grad():
                     sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False)

+                # Building the training mask
+                train_mask = submask
+
                 # Selective loss gating
-                above_threshold = token_loss > self.selective_token_loss_threshold
-                train_mask = submask * above_threshold
+                if self.token_loss_threshold > 0.0:
+                    above_threshold = token_loss > self.token_loss_threshold
+                    train_mask = train_mask * above_threshold
+
+                # Dropout logic
+                if self.token_dropout_rate > 0.0:
+                    dropout_mask = torch.rand(train_mask.shape, device=train_mask.device) > self.token_dropout_rate
+                    train_mask = train_mask * dropout_mask

                 # The training loss to use
                 train_loss = torch.sum(token_loss * train_mask) / total_mask_sum
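
For reference, the masking behaviour this patch introduces can be summarised by the standalone sketch below. It is illustrative only: `token_loss`, `submask` and the two new settings mirror the names in the diff, while the helper function, the stand-in `total_mask_sum`, and the sample tensors are made up for this example (in the trainer, `total_mask_sum` is precomputed outside this hunk).

import torch

def sketch_train_loss(token_loss: torch.Tensor,
                      submask: torch.Tensor,
                      token_loss_threshold: float = 0.0,
                      token_dropout_rate: float = 0.0) -> torch.Tensor:
    # Stand-in for the trainer's precomputed mask total (defined outside this hunk).
    total_mask_sum = submask.sum()

    # Start from the base training mask.
    train_mask = submask

    # Selective loss gating: keep only tokens whose loss exceeds the threshold.
    if token_loss_threshold > 0.0:
        train_mask = train_mask * (token_loss > token_loss_threshold)

    # Token dropout: randomly drop a fraction of the remaining tokens.
    if token_dropout_rate > 0.0:
        keep = torch.rand(train_mask.shape, device=train_mask.device) > token_dropout_rate
        train_mask = train_mask * keep

    # As in the diff, only the numerator is pruned; the denominator is untouched,
    # so gated or dropped tokens scale the loss down rather than renormalising it.
    return torch.sum(token_loss * train_mask) / total_mask_sum

# Hypothetical usage: 4 tokens, the last one already masked out.
token_loss = torch.tensor([0.2, 1.5, 3.0, 0.1])
submask = torch.tensor([1.0, 1.0, 1.0, 0.0])
loss = sketch_train_loss(token_loss, submask, token_loss_threshold=1.0, token_dropout_rate=0.1)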