Skip to content

Commit

Permalink
Merge pull request #58 from RWKV/main
Browse files Browse the repository at this point in the history
sync-up playground
  • Loading branch information
PicoCreator authored Jan 18, 2024
2 parents 3559270 + 36e6737 commit 41c7d95
Show file tree
Hide file tree
Showing 14 changed files with 3,929 additions and 411 deletions.
97 changes: 81 additions & 16 deletions RWKV-v5/config-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ data:
# Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
# If using a custom tokenizer, provide the HF tokenizer name/path
# ---
tokenizer: neox
tokenizer: world

# Minimum / Maximum token size of the dataset to use
# useful for filtering out small noisy data samples from large datasets
Expand All @@ -393,21 +393,6 @@ data:
# min_token_size: 1024
# max_token_size: -1

# Rechunking of text dataset, this is done only when source is set as 'text'
# and will merge the various sentences into larger chunks up to the target size
#
# Defaults to 2048
#
# This is ignored, if source is not set as text (unless text_rechunk_force)
# This is ignored, if set to zero
# ---
# text_rechunk_size: 2048

# Apply text rechunk to the dataset, even if its not a 'text' source
# This is done only after dataset filtering, and if source is not 'text'
# ---
# text_rechunk_force: True

# Custom text column to use, useful for dataset with alternative training columns labels
# This is checked before multi column merging, default is null (disabled)
# eg: 'code'
Expand All @@ -433,12 +418,92 @@ data:
# multi_column_suffix: ["\n\n", "\n\n", "\n\n"]
# multi_column_train_mask: [true, false, true]
# multi_column_separator: "\n\n"


# Conversation merging process
# useful for merging full conversational datasets, into single documents
# default is off, (or set conversation_key to [])
# conversation_formatting supports "iopairs" or "sender" for now.
# ---
# conversation_format: 'iopairs'
# conversation_key: 'conversation'
# conversation_end_of_conversation: "\n\nUser:"

# Iopairs specific config
# This means that every object in the conversation object is a pair of input and output.
# In future it will also support a format where one of the keys dictates the format style
# if conversation_key is set to null, it will use the root object as the conversation object
# ---
# conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "}
# conversation_input_key_mask: {'input': false, 'output': true}
# conversation_sender_suffix: {'input': "", 'output': ""}

# Sender specific config
# This means that every object in the conversation object is a single message (with sender and message keys - or similar)
# The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key
# conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''}
# conversation_sender_key: 'sender'
# conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'}
# conversation_sender_mask: {'user': false, 'assistant': true, 'system': false}
# conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""}

# If processing prompt/completion jsonl pairs, the prompt is masked by default
# use this flag to disable this default behaviour
# ---
# disable_prompt_completion_mask: false

# ----------------------------
# Rechunking support
# ----------------------------

# Rechunking of text dataset, this is done only when source is set as 'text'
# and will merge the various sentences into larger chunks up to the target size
#
# Defaults to 2048
#
# This is ignored, if source is not set as text (unless text_rechunk_force)
# This is ignored, if set to zero / -1
# ---
text_rechunk_size: 2048

# Apply text rechunk to the dataset, even if its not a 'text' source
# This is done only after dataset filtering, and if source is not 'text'
# ---
text_rechunk_force: False

# Used to disable the automated text rechunking for text files, if set to false
# ---
text_rechunk_auto: True

# ----------------------------
# Dataset packing support
# Recommended to be used with mixed documents sized finetuning
# For foundation model "from scratch", rechunking is typically used instead
# ----------------------------

# Boolean flag to enable / disable dataset packing
packing_enable: True

# Used to ensure all training samples within this batch size are the same length
# Ideally this should align exactly with your real "batch size"
#
# Uses `8 * (3 * 4 * 5 * 6 * 7) = 20160` by default, as it should align across
# a large number of batch size combinations. This helps reduce the amount of
# misaligned batches, and thus reduce the amount of wasted training time.
packing_batchsize: 20160

# Chunking size to align within each batch, this ideally should be equal to
# the training context length used.
packing_chunksize: 4096

# Minimum size to pack up to, this should be a multiple of packing_chunksize
# defaults to -1, which equals packing_chunksize
packing_min_ctx_len: -1

# Pack the data sequentially if possible, in accordance to the dataset sequence
# this can be used together with sort_by_length, otherwise a shuffle will be done
packing_in_sequence: False

# Path to the current checkpoint to continue training from
# this should be the directory path, and ends with `.ckpt/`
ckpt_path: null
Loading

0 comments on commit 41c7d95

Please sign in to comment.