Skip to content

Commit

Permalink
Merge pull request #58 from RWKV/main
Browse files Browse the repository at this point in the history
sync-up playground
  • Loading branch information
PicoCreator authored Jan 18, 2024
2 parents 3559270 + 36e6737 commit 41c7d95
Show file tree
Hide file tree
Showing 14 changed files with 3,929 additions and 411 deletions.
97 changes: 81 additions & 16 deletions RWKV-v5/config-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ data:
# Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
# If using a custom tokenizer, provide the HF tokenizer name/path
# ---
tokenizer: neox
tokenizer: world

# Minimum / Maximum token size of the dataset to use
# useful for filtering out small noisy data samples from large datasets
Expand All @@ -393,21 +393,6 @@ data:
# min_token_size: 1024
# max_token_size: -1

# Rechunking of text dataset, this is done only when source is set as 'text'
# and will merge the various sentences into larger chunks up to the target size
#
# Defaults to 2048
#
# This is ignored, if source is not set as text (unless text_rechunk_force)
# This is ignored, if set to zero
# ---
# text_rechunk_size: 2048

# Apply text rechunk to the dataset, even if its not a 'text' source
# This is done only after dataset filtering, and if source is not 'text'
# ---
# text_rechunk_force: True

# Custom text column to use, useful for dataset with alternative training columns labels
# This is checked before multi column merging, default is null (disabled)
# eg: 'code'
Expand All @@ -433,12 +418,92 @@ data:
# multi_column_suffix: ["\n\n", "\n\n", "\n\n"]
# multi_column_train_mask: [true, false, true]
# multi_column_separator: "\n\n"


# Conversation merging process
# useful for merging full conversational datasets, into single documents
# default is off, (or set conversation_key to [])
# conversation_formatting supports "iopairs" or "sender" for now.
# ---
# conversation_format: 'iopairs'
# conversation_key: 'conversation'
# conversation_end_of_conversation: "\n\nUser:"

# Iopairs specific config
# This means that every object in the conversation object is a pair of input and output.
# In future it will also support a format where one of the keys dictates the format style
# if conversation_key is set to null, it will use the root object as the conversation object
# ---
# conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "}
# conversation_input_key_mask: {'input': false, 'output': true}
# conversation_sender_suffix: {'input': "", 'output': ""}

# Sender specific config
# This means that every object in the conversation object is a single message (with sender and message keys - or similar)
# The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key
# conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''}
# conversation_sender_key: 'sender'
# conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'}
# conversation_sender_mask: {'user': false, 'assistant': true, 'system': false}
# conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""}

# If processing prompt/completion jsonl pairs, the prompt is masked by default
# use this flag to disable this default behaviour
# ---
# disable_prompt_completion_mask: false

# ----------------------------
# Rechunking support
# ----------------------------

# Rechunking of text dataset, this is done only when source is set as 'text'
# and will merge the various sentences into larger chunks up to the target size
#
# Defaults to 2048
#
# This is ignored, if source is not set as text (unless text_rechunk_force)
# This is ignored, if set to zero / -1
# ---
text_rechunk_size: 2048

# Apply text rechunk to the dataset, even if its not a 'text' source
# This is done only after dataset filtering, and if source is not 'text'
# ---
text_rechunk_force: False

# Used to disable the automated text rechunking for text files, if set to false
# ---
text_rechunk_auto: True

# ----------------------------
# Dataset packing support
# Recommended to be used with mixed documents sized finetuning
# For foundation model "from scratch", rechunking is typically used instead
# ----------------------------

# Boolean flag to enable / disable dataset packing
packing_enable: True

# Used to ensure all training samples within this batch size are the same length
# Ideally this should align exactly with your real "batch size"
#
# Uses `8 * (3 * 4 * 5 * 6 * 7) = 20160` by default, as it should align across
# a large number of batch size combinations. This helps reduce the amount of
# misaligned batches, and thus reduce the amount of wasted training time.
packing_batchsize: 20160

# Chunking size to align within each batch, this ideally should be equal to
# the training context length used.
packing_chunksize: 4096

# Minimum size to pack up to, this should be a multiple of packing_chunksize
# defaults to -1, which equals packing_chunksize
packing_min_ctx_len: -1

# Pack the data sequentially if possible, in accordance to the dataset sequence
# this can be used together with sort_by_length, otherwise a shuffle will be done
packing_in_sequence: False

# Path to the current checkpoint to continue training from
# this should be the directory path, and ends with `.ckpt/`
ckpt_path: null
Loading

0 comments on commit 41c7d95

Please sign in to comment.