From e2e4333c955b829d0e6087d27ee435f55c80d3a5 Mon Sep 17 00:00:00 2001
From: Tri Dao
Date: Sun, 26 May 2024 15:35:21 -0700
Subject: [PATCH] Limit to MAX_JOBS=1 with CUDA 12.2

---
 .github/workflows/publish.yml | 3 ++-
 flash_attn/__init__.py        | 2 +-
 training/Dockerfile           | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 88aa16768..020c1371a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -168,7 +168,8 @@ jobs:
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
           # Limit MAX_JOBS otherwise the github runner goes OOM
-          MAX_JOBS=2 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
+          # CUDA 11.8 can compile with 2 jobs, but CUDA 12.2 goes OOM
+          MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "122" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
           wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
           ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index a461e8ac6..242022d6a 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.5.9"
+__version__ = "2.5.9.post1"
 
 from flash_attn.flash_attn_interface import (
     flash_attn_func,
diff --git a/training/Dockerfile b/training/Dockerfile
index 2c68bd1ea..0baec9278 100644
--- a/training/Dockerfile
+++ b/training/Dockerfile
@@ -85,7 +85,7 @@ RUN pip install transformers==4.25.1 datasets==2.8.0 pytorch-lightning==1.8.6 tr
 RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0
 
 # Install FlashAttention
-RUN pip install flash-attn==2.5.9
+RUN pip install flash-attn==2.5.9.post1
 
 # Install CUDA extensions for fused dense
-RUN pip install git+https://github.com/HazyResearch/flash-attention@v2.5.9#subdirectory=csrc/fused_dense_lib
+RUN pip install git+https://github.com/HazyResearch/flash-attention@v2.5.9.post1#subdirectory=csrc/fused_dense_lib
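
The workflow hunk above replaces a fixed MAX_JOBS=2 with a shell conditional keyed on the CUDA version. A minimal sketch of that same pattern for checking the selection locally, assuming MATRIX_CUDA_VERSION is exported the way the workflow's build matrix does ("118" for CUDA 11.8, "122" for CUDA 12.2):

#!/usr/bin/env bash
# Sketch only, not part of the patch: reproduce the MAX_JOBS selection outside CI.
# MATRIX_CUDA_VERSION is assumed to be set externally, as the workflow matrix does.
export MATRIX_CUDA_VERSION=122

# nvcc with CUDA 12.2 needs more memory per compile job, so cap parallelism at 1;
# other CUDA versions keep the previous limit of 2 parallel jobs.
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "122" ] && echo 1 || echo 2)

echo "MAX_JOBS=$MAX_JOBS"   # prints 1 for CUDA 12.2, 2 otherwise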