From 1bf0a93973b131c910ff9554909e473509aeb7af Mon Sep 17 00:00:00 2001 From: Jeremy D <115047575+bmosaicml@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:07:45 -0400 Subject: [PATCH 1/3] chang bsz to 1 (#654) Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/eval/yamls/coding_tasks.yaml | 4 ++++ scripts/eval/yamls/tasks.yaml | 3 +++ 2 files changed, 7 insertions(+) diff --git a/scripts/eval/yamls/coding_tasks.yaml b/scripts/eval/yamls/coding_tasks.yaml index 3a19dc640d..727b9a6078 100644 --- a/scripts/eval/yamls/coding_tasks.yaml +++ b/scripts/eval/yamls/coding_tasks.yaml @@ -6,6 +6,8 @@ icl_tasks: pass_at_k: 1 num_beams: 20 icl_task_type: code_evaluation + batch_size: 1 + - label: human_eval_cpp dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI @@ -13,6 +15,7 @@ icl_tasks: pass_at_k: 1 num_beams: 20 icl_task_type: code_evaluation + batch_size: 1 - label: human_eval_js dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI @@ -20,3 +23,4 @@ icl_tasks: pass_at_k: 1 num_beams: 20 icl_task_type: code_evaluation + batch_size: 1 diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml index 54d14e34ab..187b15ee88 100644 --- a/scripts/eval/yamls/tasks.yaml +++ b/scripts/eval/yamls/tasks.yaml @@ -180,6 +180,7 @@ icl_tasks: pass_at_k: 1 num_beams: 20 icl_task_type: code_evaluation + batch_size: 1 - label: human_eval_cpp dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI @@ -187,6 +188,7 @@ icl_tasks: pass_at_k: 1 num_beams: 20 icl_task_type: code_evaluation + batch_size: 1 - label: human_eval_js dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI @@ -194,3 +196,4 @@ icl_tasks: pass_at_k: 1 num_beams: 20 icl_task_type: code_evaluation + batch_size: 1 From ba6b88037c346dea1dd9009efb27c5240f9aecb1 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 10 Oct 2023 09:13:01 -0700 Subject: [PATCH 2/3] Add images with flash attention 2 (#651) --- .github/workflows/docker.yaml | 41 +++++++++++++++--- .github/workflows/pr-gpu.yaml | 1 + Dockerfile | 9 ++-- llmfoundry/models/layers/attention.py | 62 +++++++++++++++++++++------ setup.py | 11 ++++- 5 files changed, 100 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 8e30554475..83c9a63884 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -3,6 +3,12 @@ on: push: branches: - main + pull_request: + branches: + - main + paths: + - ./Dockerfile + - .github/workflows/docker.yaml workflow_dispatch: {} jobs: docker-build: @@ -13,10 +19,16 @@ jobs: include: - name: '1.13.1_cu117' base_image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + dep_groups: '[gpu]' - name: '2.0.1_cu118' base_image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + dep_groups: '[gpu]' - name: '2.1.0_cu121' base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 + dep_groups: '[gpu]' + - name: '2.1.0_cu121_flash2' + base_image: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 + dep_groups: '[gpu-flash2]' steps: - name: Maximize Build Space on Worker @@ -52,13 +64,32 @@ jobs: GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7) echo "IMAGE_TAG=${GIT_SHA}" >> ${GITHUB_ENV} + if [ "${{ github.event_name }}" == "push" ]; then + echo "Triggered by push event." 
+ PROD_REPO="mosaicml/llm-foundry" + IMAGE_TAG="${PROD_REPO}:${{matrix.name}}-${GIT_SHA},${PROD_REPO}:${{matrix.name}}-latest" + IMAGE_CACHE="${PROD_REPO}:${{matrix.name}}-buildcache" + elif [ "${{ github.event_name }}" == "pull_request" ]; then + echo "Triggered by pull_request event." + STAGING_REPO="mosaicml/ci-staging" + IMAGE_TAG="${STAGING_REPO}:${{matrix.name}}-${GIT_SHA}" + IMAGE_CACHE="${STAGING_REPO}:${{matrix.name}}-buildcache" + else + echo "Triggered by unknown event: ${{ github.event_name }}" + exit 1 + fi + + echo "IMAGE_TAG=${IMAGE_TAG}" >> ${GITHUB_ENV} + echo "IMAGE_CACHE=${IMAGE_CACHE}" >> ${GITHUB_ENV} + - name: Build and Push the Docker Image uses: docker/build-push-action@v3 with: context: . - tags: mosaicml/llm-foundry:${{ matrix.name }}-latest, - mosaicml/llm-foundry:${{ matrix.name }}-${{ env.IMAGE_TAG }} + tags: ${{ env.IMAGE_TAG }} push: true - cache-from: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache - cache-to: type=registry,ref=mosaicml/llm-foundry:${{ matrix.name }}-buildcache,mode=max - build-args: BASE_IMAGE=${{ matrix.base_image }} + cache-from: type=registry,ref=${{ env.IMAGE_CACHE }} + cache-to: type=registry,ref=${{ env.IMAGE_CACHE }},mode=max + build-args: | + BASE_IMAGE=${{ matrix.base_image }} + DEP_GROUPS=${{ matrix.dep_groups }} diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 769b345e39..e16f2c8b40 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -18,6 +18,7 @@ jobs: uses: ./.github/workflows/pytest-gpu.yaml strategy: matrix: + # TODO: After the PR with the flash attention 2 images goes in, add the new unit test suite include: - name: 'gpu-latest' container: mosaicml/pytorch:latest # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 diff --git a/Dockerfile b/Dockerfile index 0d75241068..6c283660c4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,9 +4,10 @@ ARG BASE_IMAGE FROM $BASE_IMAGE +ARG DEP_GROUPS # Install and uninstall foundry to cache foundry requirements -RUN git clone -b main https://github.com/mosaicml/llm-foundry.git && \ - pip install --no-cache-dir "./llm-foundry[gpu]" && \ - pip uninstall -y llm-foundry && \ - rm -rf llm-foundry +RUN git clone -b main https://github.com/mosaicml/llm-foundry.git +RUN pip install --no-cache-dir "./llm-foundry${DEP_GROUPS}" +RUN pip uninstall -y llm-foundry +RUN rm -rf llm-foundry diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index bea6284fb5..39fa7162ac 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -17,6 +17,22 @@ from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY +def is_flash_v2_installed(): + try: + import flash_attn as flash_attn + except: + return False + return version.parse(flash_attn.__version__) >= version.parse('2.0.0') + + +def is_flash_v1_installed(): + try: + import flash_attn as flash_attn + except: + return False + return version.parse(flash_attn.__version__) < version.parse('2.0.0') + + def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool) -> bool: # disable causal when it is not needed @@ -197,7 +213,8 @@ def flash_attn_fn( try: from flash_attn import bert_padding, flash_attn_interface # type: ignore # yapf: disable # isort: skip except: - raise RuntimeError('Please install flash-attn==1.0.3.post0') + raise RuntimeError( + 'Please install flash-attn==1.0.9 or flash-attn==2.3.2') check_valid_inputs(query, key, value) @@ -278,18 +295,35 @@ def 
flash_attn_fn( reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal) - output_unpad = flash_attn_interface.flash_attn_unpadded_func( - query_unpad, - key_unpad, - value_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=softmax_scale, - causal=reset_is_causal, - return_attn_probs=needs_weights) + if is_flash_v1_installed(): + output_unpad = flash_attn_interface.flash_attn_unpadded_func( + q=query_unpad, + k=key_unpad, + v=value_unpad, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + causal=reset_is_causal, + return_attn_probs=needs_weights) + elif is_flash_v2_installed(): + output_unpad = flash_attn_interface.flash_attn_varlen_func( + q=query_unpad, + k=key_unpad, + v=value_unpad, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + dropout_p=dropout_p, + softmax_scale=softmax_scale, + causal=reset_is_causal, + return_attn_probs=needs_weights) + else: + raise RuntimeError( + 'flash-attn==1.0.9 or flash-attn==2.3.2 is required.') output = bert_padding.pad_input( rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, @@ -321,7 +355,7 @@ def triton_flash_attn_fn( if version.parse(torch.__version__) < version.parse('2.0.0'): _installed = True # if torch1.13.1 revert to using triton flash attn from HazyResearch - # with flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202 + # with flash-attn==1.0.9 and triton==2.0.0.dev20221202 try: from flash_attn.flash_attn_triton import flash_attn_func except: diff --git a/setup.py b/setup.py index be5b6708a3..a686dd0808 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,12 @@ # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI 'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy', ] +extra_deps['gpu-flash2'] = [ + 'flash-attn==2.3.2', + 'mosaicml-turbo==0.0.4', + # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI + 'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v2.3.2#subdirectory=csrc/xentropy', +] extra_deps['peft'] = [ 'loralib==0.1.1', # lora core @@ -107,7 +113,10 @@ ] extra_deps['all-cpu'] = set( dep for key, deps in extra_deps.items() for dep in deps if 'gpu' not in key) -extra_deps['all'] = set(dep for deps in extra_deps.values() for dep in deps) +extra_deps['all'] = set(dep for key, deps in extra_deps.items() for dep in deps + if key != 'gpu-flash2') +extra_deps['all-flash2'] = set( + dep for key, deps in extra_deps.items() for dep in deps if key != 'gpu') setup( name=_PACKAGE_NAME, From a1283403bccfe13e5eb091ff8f5cb382b0196c53 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 10 Oct 2023 09:52:43 -0700 Subject: [PATCH 3/3] fix (#667) --- .github/workflows/pr-gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index e16f2c8b40..287c134b63 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -4,7 +4,7 @@ on: branches: - main - release/* - pull_request_target: + pull_request: branches: - main - release/**
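
Taken together, PATCH 2/3 adds a `gpu-flash2` extra in setup.py and version-gated dispatch in `llmfoundry/models/layers/attention.py` between the flash-attn v1 (`flash_attn_unpadded_func`) and v2 (`flash_attn_varlen_func`) kernels. The sketch below is illustrative only and not part of the patches: it shows how the `is_flash_v1_installed` / `is_flash_v2_installed` helpers introduced above could be used to confirm which code path an environment will take. The editable-install command in the comment assumes a local source checkout of llm-foundry.

    # Illustrative sketch -- not part of the patch series above.
    # Assumes llm-foundry is installed from a source checkout, e.g.:
    #   pip install -e ".[gpu-flash2]"   # new extra added in setup.py by PATCH 2/3
    from llmfoundry.models.layers.attention import (is_flash_v1_installed,
                                                    is_flash_v2_installed)

    if is_flash_v2_installed():
        # flash_attn_fn will dispatch to flash_attn_interface.flash_attn_varlen_func
        print('flash-attn >= 2.0.0 detected (flash2 code path)')
    elif is_flash_v1_installed():
        # flash_attn_fn will dispatch to flash_attn_interface.flash_attn_unpadded_func
        print('flash-attn < 2.0.0 detected (flash1 code path)')
    else:
        # matches the RuntimeError raised in flash_attn_fn when neither is available
        print('flash-attn is not installed; flash_attn_fn will raise RuntimeError')

The matching Docker image would be built with the new `DEP_GROUPS` build arg from the updated Dockerfile, e.g. `--build-arg DEP_GROUPS="[gpu-flash2]"`, which is what the `2.1.0_cu121_flash2` matrix entry in docker.yaml passes.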