Merge pull request #89 from RWKV/main
pulling main updates to rwkv-x-playground
PicoCreator authored Apr 10, 2024
2 parents 41c7d95 + 2528086 commit b9f9093
Showing 146 changed files with 362,593 additions and 282 deletions.
38 changes: 19 additions & 19 deletions .github/workflows/docker-build.yml
@@ -1,15 +1,15 @@
-name: Docker Env Image (cuda-11-8)
+name: Docker Env Image (cuda-12-1)
 
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "rwkv-x-*" ]
     # Publish semver tags as releases.
     tags: [ 'v*.*.*' ]
     # Reduce build to only for the valid path
     paths:
       - docker/**
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "rwkv-x-*" ]
     paths:
       - docker/**

@@ -21,7 +21,7 @@ env:

 jobs:
   build_env:
-    name: Docker Env Image (cuda-11-8)
+    name: Docker Env Image (cuda-12-1)
 
     runs-on: ubuntu-latest
     permissions:
@@ -71,9 +71,9 @@ jobs:
       # https://github.com/sigstore/cosign-installer
       - name: Install cosign
         if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
-        with:
-          cosign-release: 'v1.11.0'
+        uses: sigstore/cosign-installer@v3.3.0
+        # with:
+        #   cosign-release: 'v2.2.0'
 
       # Workaround: https://github.com/docker/build-push-action/issues/461
       - name: Setup Docker buildx
@@ -103,20 +103,20 @@ jobs:
       # Build and push Docker image with Buildx (don't push on PR)
       # https://github.com/docker/build-push-action
-      - name: Build and push Docker image (env-cuda-11-8)
+      - name: Build and push Docker image (env-cuda-12-1)
         id: build-and-push
         uses: docker/build-push-action@v4
         with:
-          context: "{{defaultContext}}:docker/env-cuda-11-8"
+          context: "{{defaultContext}}:docker/env-cuda-12-1"
           push: ${{ github.event_name != 'pull_request' }} # Don't push on PR
-          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_LC }}:env-cuda-11-8
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_LC }}:env-cuda-12-1
           # tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,src=docker/env-cuda-11-8
+          cache-from: type=gha,src=docker/env-cuda-12-1
           cache-to: type=gha,mode=max
 
   build_runner:
-    name: Docker Env Image (github-worker-11-8)
+    name: Docker Env Image (github-worker-12-1)
 
     needs: build_env
     runs-on: ubuntu-latest
@@ -167,9 +167,9 @@ jobs:
       # https://github.com/sigstore/cosign-installer
       - name: Install cosign
         if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
-        with:
-          cosign-release: 'v1.11.0'
+        uses: sigstore/cosign-installer@v3.3.0
+        # with:
+        #   cosign-release: 'v2.2.0'
 
       # Workaround: https://github.com/docker/build-push-action/issues/461
       - name: Setup Docker buildx
@@ -199,14 +199,14 @@ jobs:
       # Build and push Docker image with Buildx (don't push on PR)
       # https://github.com/docker/build-push-action
-      - name: Build and push Docker image (github-worker-cuda-11-8)
+      - name: Build and push Docker image (github-worker-cuda-12-1)
         id: build-and-push
         uses: docker/build-push-action@v4
         with:
-          context: "{{defaultContext}}:docker/github-worker-cuda-11-8"
+          context: "{{defaultContext}}:docker/github-worker-cuda-12-1"
           push: ${{ github.event_name != 'pull_request' }} # Don't push on PR
-          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_LC }}:github-worker-cuda-11-8
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME_LC }}:github-worker-cuda-12-1
           # tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,src=docker/github-worker-cuda-11-8
+          cache-from: type=gha,src=docker/github-worker-cuda-12-1
           cache-to: type=gha,mode=max
1 change: 1 addition & 0 deletions .gitignore
@@ -150,6 +150,7 @@ dmypy.json
 # and standard hidden files ignore. Including
 # example files generated via notebook tutorials
 .*
+scratch/
 model/
 dataset/
 datapath/
6 changes: 3 additions & 3 deletions README.md
@@ -42,9 +42,9 @@ conda update conda
 conda create -n rwkv-infctx python=3.11 pip
 conda activate rwkv-infctx
 
-# Install pytorch (>=2.0.1)
-conda install -y pytorch==2.0.1 torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
-python -m pip install lightning==2.0.5 deepspeed==0.10.0
+# Install pytorch (>=2.1.2)
+conda install -y pytorch==2.1.2 torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
+python -m pip install lightning==2.1.3 deepspeed==0.12.6
 
 # Currently for torch.compile + 3.11 to work, on some platforms you will need the nightly build
 # if so you may need to try the following instead - this is considered highly "unstable"
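
As a quick sanity check after following the updated README instructions above, the expected versions can be confirmed from Python. This is an illustrative aside rather than part of the commit:

import torch
import lightning
import deepspeed

# Expected after the updated setup: torch 2.1.2 built for CUDA 12.1,
# lightning 2.1.3 and deepspeed 0.12.6 (per the README change above)
print(torch.__version__)           # e.g. 2.1.2
print(torch.version.cuda)          # e.g. 12.1
print(torch.cuda.is_available())   # True on a machine with a working GPU driver
print(lightning.__version__)       # e.g. 2.1.3
print(deepspeed.__version__)       # e.g. 0.12.6
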
50 changes: 46 additions & 4 deletions RWKV-v5/config-example.yaml
@@ -325,6 +325,17 @@ model:
   # dim_att: null
   # dim_ffn: null
 data:
+  # Skip the datapath setup
+  #
+  # ignored if using the preload_datapath.py, useful for speeding up the trainer startup
+  # provided you have your datasets all properly preinitialized
+  # ---
+  # skip_datapath_setup: True
+
+  # Datapack config yaml to use instead, this overwrites all other settings below
+  # ---
+  # datapack_config_path: null
+
   # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
   #
   # Use this if you have built your own dataset and saved it with `save_to_disk()`
@@ -334,6 +345,23 @@ data:
   # If using relative path, this should be relative to the trainer script path
   data_path: /path/to/store/your/data_path/
 
+  # Data path storage options, this is used to support cloud storage
+  # via the huggingface dataset API. See:
+  # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3
+  #
+  # Note: As of Jan 2023, these options have only been tested to work with AWS S3 and Backblaze. YMMV
+  # For S3 bucket support you will also need to install s3fs: `python3 -m pip install s3fs`
+  #
+  # If you want to reduce the risk of accidental key/secret commits, you can use the
+  # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead
+  #
+  # For data_path, it should use the `s3://bucket-name/subpath` format
+  # ---
+  # data_path_storage_options:
+  #   key: <example S3 key>
+  #   secret: <example S3 secret>
+  #   endpoint_url: <example S3 endpoint>
+
   # Otherwise provide the source path, which is used as the huggingface dataset path
   # this will be used to populate the dataset_path
   #
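
For context on the new data_path_storage_options block above: the key / secret / endpoint_url entries correspond to the storage_options dict used by the Hugging Face datasets / fsspec APIs. A minimal sketch, not part of this commit, with placeholder bucket name and credentials:

from datasets import load_from_disk

# Placeholders mirroring data_path_storage_options in the config above
storage_options = {
    "key": "<S3 access key>",          # or set AWS_ACCESS_KEY_ID instead
    "secret": "<S3 secret key>",       # or set AWS_SECRET_ACCESS_KEY instead
    "endpoint_url": "<S3 endpoint>",   # needed for non-AWS providers such as Backblaze
}

# data_path uses the s3://bucket-name/subpath format
dataset = load_from_disk("s3://bucket-name/subpath", storage_options=storage_options)
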
@@ -349,6 +377,10 @@ data:
   # source: "teven/enwiki_00k" # Hugging face dataset
   # source: text # Text mode, used with source_data_dir
 
+  # Dataset split to use from HF dataset
+  # ---
+  # source_dataset_split: train
+
   # Additional source dataset params, used to grab subsets of the dataset
   # ---
   # source_dataset_params:
@@ -395,6 +427,7 @@ data:

   # Custom text column to use, useful for datasets with alternative training column labels
   # This is checked before multi column merging, default is null (disabled)
+  # If set, this takes priority
   # eg: 'code'
   # ---
   # custom_text_key: 'code'
@@ -407,19 +440,18 @@ data:
   # or throw an error if the default fallback is not found
   #
   # IMPORTANT NOTE: as newlines are commonly used for multi_column_suffix, etc.
-  # you should use single quotes to ensure such values dun get escaped.
-  # eg. multi_column_suffix: ['\n\n']
+  # you should use double quotes to ensure such values dun get escaped.
+  # eg. multi_column_suffix: ["\n\n"]
   #
   # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34
   # Need to use " or the new lines won't be tokenized properly
   # ---
   # multi_column_keys: ["instruction", "input", "output"]
   # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"]
-  # multi_column_suffix: ["\n\n", "\n\n", "\n\n"]
+  # multi_column_suffix: ['', '', '']
   # multi_column_train_mask: [true, false, true]
   # multi_column_separator: "\n\n"
 
 
   # Conversation merging process
   # useful for merging full conversational datasets, into single documents
   # default is off, (or set conversation_key to [])
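
A short aside on the quoting change above, not taken from the repo: in YAML, a single-quoted '\n' stays as a literal backslash-n, while a double-quoted "\n" becomes a real newline, which is what the tokenizer needs (see issue #34 referenced above):

import yaml

single = yaml.safe_load(r"multi_column_suffix: ['\n\n']")
double = yaml.safe_load(r'multi_column_suffix: ["\n\n"]')

print(repr(single["multi_column_suffix"][0]))  # '\\n\\n' -> literal backslash-n pairs
print(repr(double["multi_column_suffix"][0]))  # '\n\n'   -> two real newline characters
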
@@ -504,6 +536,16 @@ data:
   # this can be used together with sort_by_length, otherwise a shuffle will be done
   packing_in_sequence: False
 
+  # ----------------------------
+  # Special use case flags
+  # ----------------------------
+
+  # Reverse the training dataset order before saving, this is useful for
+  # optimizing the dataset packing process, when using packing_in_sequence
+  # and sort_by_length desc order together
+  reverse_train_dataset_before_save: False
+
+
 # Path to the current checkpoint to continue training from
 # this should be the directory path, and ends with `.ckpt/`
 ckpt_path: null
(The remaining changed files are not shown.)
