Merge branch 'master' into dev_tile

FlagOpen · Aug 23, 2024 · c6aa7f6 · c6aa7f6
2 parents fa7554a + 2c4625e
commit c6aa7f6
Show file tree

Hide file tree

Showing 35 changed files with 1,585 additions and 139 deletions.
diff --git a/.github/workflows/pre-commit.yml → .github/workflows/code-format-check.yml b/.github/workflows/pre-commit.yml → .github/workflows/code-format-check.yml
diff --git a/.github/workflows/model-test.yaml b/.github/workflows/model-test.yaml
@@ -0,0 +1,29 @@
+# This workflow runs the BERT model example (examples/model_bert_test.py) with pytest inside the self-hosted GPU CI container.
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: model-test
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  container-model-test:
+    runs-on: [self-hosted, docker]
+    container:
+      image: localhost:5000/flag-gems-ci:v1.0
+      ports:
+        - 82
+      options: --gpus all --hostname flag-gems_cicd_model -v /home/flaggems_cicd/huggingface_cache_bert:/__w/_temp/_github_home/.cache/huggingface
+    steps:
+      - name: checkout-code
+        uses: actions/checkout@v4
+
+      - name: check-gpu-free
+        run: tests/scripts/gpu_check.sh
+
+      - name: examples-flag-gems
+        run: |
+          CUDA_VISIBLE_DEVICES=5 pytest -s examples/model_bert_test.py
diff --git a/.github/workflows/python-test.yaml → .github/workflows/op-unit-test.yaml b/.github/workflows/python-test.yaml → .github/workflows/op-unit-test.yaml
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: flag-gems-test
+name: op-unit-test
 
 on:
   push:
@@ -62,21 +62,3 @@ jobs:
           done
 
           exit $overall_status
-
-  container-model-test:
-    runs-on: [self-hosted, docker]
-    container:
-      image: localhost:5000/flag-gems-ci:v1.0
-      ports:
-        - 82
-      options: --gpus all --hostname flag-gems_cicd_model -v /home/flaggems_cicd/huggingface_cache_bert:/__w/_temp/_github_home/.cache/huggingface
-    steps:
-      - name: checkout-code
-        uses: actions/checkout@v4
-
-      - name: check-gpu-free
-        run: tests/scripts/gpu_check.sh
-
-      - name: examples-flag-gems
-        run: |
-          CUDA_VISIBLE_DEVICES=5 pytest -s examples/model_bert_test.py
diff --git a/.github/workflows/python-coverage.yaml b/.github/workflows/python-coverage.yaml
@@ -0,0 +1,69 @@
+# https://github.com/marketplace/actions/python-coverage
+
+name: python-coverage
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  container-coverage-test:
+    runs-on: [self-hosted, docker]
+    container:
+      image: localhost:5000/flag-gems-ci:v1.0
+      ports:
+        - 81
+      options: --gpus all --hostname flag-gems_cicd_coverage
+    steps:
+      - uses: actions/checkout@v4  # the steps below run tests/ and examples/ files from the repository
+      - name: check-gpu-free
+        run: tests/scripts/gpu_check.sh
+
+      - name: run-pytest
+        shell: bash  # the script relies on bash arrays and ${array[-1]}
+        run: |
+          cmds=(  # --omit patterns are single-quoted so they reach coverage unexpanded after eval
+            "CUDA_VISIBLE_DEVICES=0 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_unary_pointwise_ops.py &"
+            "CUDA_VISIBLE_DEVICES=0 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_pointwise_type_promotion.py &"
+            "CUDA_VISIBLE_DEVICES=1 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_binary_pointwise_ops.py &"
+            "CUDA_VISIBLE_DEVICES=1 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_tensor_constructor_ops.py &"
+            "CUDA_VISIBLE_DEVICES=1 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_distribution_ops.py &"
+            "CUDA_VISIBLE_DEVICES=2 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_blas_ops.py &"
+            "CUDA_VISIBLE_DEVICES=3 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_reduction_ops.py &"
+            "CUDA_VISIBLE_DEVICES=4 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_special_ops.py &"
+            "CUDA_VISIBLE_DEVICES=5 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s tests/test_libentry.py &"
+            "CUDA_VISIBLE_DEVICES=5 coverage run --parallel-mode --omit '*/.flaggems/*,*/usr/lib/*' -m pytest -s examples/model_bert_test.py &"
+          )
+
+          declare -a exit_statuses
+
+          for cmd in "${cmds[@]}"; do
+            eval "$cmd"  # each entry ends in '&', so eval starts it as a background job
+          done
+
+          for job in $(jobs -p); do
+            wait "$job"  # $? on the next line is this job's exit status
+            exit_statuses+=("$?")
+            echo "Task $job completed with exit status ${exit_statuses[-1]}"
+          done
+
+          echo "Exit statuses of all tasks: ${exit_statuses[*]}"
+
+      - name: get-coverage
+        run: |
+          coverage combine --append
+          coverage report -m
+          coverage xml -o coverage.xml
+
+      - name: report-coverage
+        uses: orgoro/coverage@v3.2
+        with:
+            coverageFile: coverage.xml
+            thresholdNew: 0.8
+            thresholdModified: 0.0
+            token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/README.md b/README.md
@@ -187,7 +187,7 @@ Operators will be implemented according to [OperatorList.md](./OperatorList.md).
 
 The following chart shows the speedup of FlagGems compared with PyTorch ATen library in eager mode. The speedup is calculated by averaging the speedup on each shape, representing the overall performance of the operator.
 
-![Operator Speedup](./assets/speedup-0708-eng.png)
+![Operator Speedup](./assets/speedup-0814-eng.png)
 
 ## Contributions
 

diff --git a/README_cn.md b/README_cn.md
@@ -186,7 +186,7 @@ pip install .
 
 FlagGems相比Torch Eager模式下ATen算子库的加速比如下图所示。其中，每个算子的加速比综合了多个形状测例的数据，代表该算子的整体性能。
 
-![算子加速比](./assets/speedup-0708-chn.png)
+![算子加速比](./assets/speedup-0814-chn.png)
 
 ## 贡献代码
 

diff --git a/assets/speedup-0814-chn.png b/assets/speedup-0814-chn.png
diff --git a/assets/speedup-0814-eng.png b/assets/speedup-0814-eng.png
diff --git a/benchmark/test_distribution_perf.py b/benchmark/test_distribution_perf.py
@@ -9,50 +9,6 @@
 )
 
 
-def test_perf_rand():
-    def rand_kwargs(dtype, batch, size):
-        return {"size": (batch, size), "dtype": dtype, "device": "cuda"}
-
-    bench = Benchmark(
-        op_name="rand",
-        torch_op=torch.rand,
-        arg_func=None,
-        dtypes=FLOAT_DTYPES,
-        batch=POINTWISE_BATCH,
-        sizes=SIZES,
-        kwargs_func=rand_kwargs,
-    )
-    bench.run()
-
-
-def test_perf_randn():
-    def randn_kwargs(dtype, batch, size):
-        return {"size": (batch, size), "dtype": dtype, "device": "cuda"}
-
-    bench = Benchmark(
-        op_name="randn",
-        torch_op=torch.randn,
-        arg_func=None,
-        dtypes=FLOAT_DTYPES,
-        batch=POINTWISE_BATCH,
-        sizes=SIZES,
-        kwargs_func=randn_kwargs,
-    )
-    bench.run()
-
-
-def test_perf_rand_like():
-    bench = Benchmark(
-        op_name="rand_like",
-        torch_op=torch.rand_like,
-        arg_func=unary_arg,
-        dtypes=FLOAT_DTYPES,
-        batch=POINTWISE_BATCH,
-        sizes=SIZES,
-    )
-    bench.run()
-
-
 def test_perf_normal():
     def normal_arg(dtype, batch, size):
         loc = torch.full(size=(size, batch), fill_value=3.0, dtype=dtype, device="cuda")

diff --git a/benchmark/test_pointwise_perf.py b/benchmark/test_pointwise_perf.py
@@ -134,6 +134,30 @@ def test_perf_eq():
     bench.run()
 
 
+def test_perf_maximum():
+    bench = Benchmark(
+        op_name="maximum",
+        torch_op=torch.maximum,
+        arg_func=binary_args,
+        dtypes=FLOAT_DTYPES,
+        batch=POINTWISE_BATCH,
+        sizes=SIZES,
+    )
+    bench.run()
+
+
+def test_perf_minimum():
+    bench = Benchmark(
+        op_name="minimum",
+        torch_op=torch.minimum,
+        arg_func=binary_args,
+        dtypes=FLOAT_DTYPES,
+        batch=POINTWISE_BATCH,
+        sizes=SIZES,
+    )
+    bench.run()
+
+
 def test_perf_exp():
     bench = Benchmark(
         op_name="exp",
@@ -158,14 +182,68 @@ def test_perf_ge():
     bench.run()
 
 
-def test_perf_gelu():
+def test_perf_gelu_tanh():
+    def gelu_kwargs(dtype, batch, size):
+        return {"approximate": "tanh"}
+
+    bench = Benchmark(
+        op_name="gelu",
+        torch_op=torch.nn.functional.gelu,
+        arg_func=unary_arg,
+        dtypes=FLOAT_DTYPES,
+        batch=POINTWISE_BATCH,
+        sizes=SIZES,
+        kwargs_func=gelu_kwargs,
+    )
+    bench.run()
+
+
+def test_perf_gelu_none():
+    def gelu_kwargs(dtype, batch, size):
+        return {"approximate": "none"}
+
+    bench = Benchmark(
+        op_name="gelu",
+        torch_op=torch.nn.functional.gelu,
+        arg_func=unary_arg,
+        dtypes=FLOAT_DTYPES,
+        batch=POINTWISE_BATCH,
+        sizes=SIZES,
+        kwargs_func=gelu_kwargs,
+    )
+    bench.run()
+
+
+def test_perf_gelu_backward_tanh():
+    def gelu_kwargs(dtype, batch, size):
+        return {"approximate": "tanh"}
+
+    bench = Benchmark(
+        op_name="gelu",
+        torch_op=torch.nn.functional.gelu,
+        arg_func=unary_arg,
+        dtypes=FLOAT_DTYPES,
+        batch=POINTWISE_BATCH,
+        sizes=SIZES,
+        kwargs_func=gelu_kwargs,
+        is_backward=True,
+    )
+    bench.run()
+
+
+def test_perf_gelu_backward_none():
+    def gelu_kwargs(dtype, batch, size):
+        return {"approximate": "none"}
+
     bench = Benchmark(
         op_name="gelu",
         torch_op=torch.nn.functional.gelu,
         arg_func=unary_arg,
         dtypes=FLOAT_DTYPES,
         batch=POINTWISE_BATCH,
         sizes=SIZES,
+        kwargs_func=gelu_kwargs,
+        is_backward=True,
     )
     bench.run()
 

diff --git a/benchmark/test_reduction_perf.py b/benchmark/test_reduction_perf.py
@@ -286,3 +286,28 @@ def test_perf_vector_norm():
         sizes=SIZES,
     )
     bench.run()
+
+
+def test_perf_index_select():
+    def index_select_args(dtype, batch, size):
+        inp = torch.randn([batch, size], dtype=dtype, device="cuda")
+
+        threshold = 0.1
+        dim = 0
+        index_size = inp.size(dim)
+        from math import floor
+
+        index = torch.randint(
+            0, index_size, [floor(index_size * threshold)], device="cuda"
+        )
+        return (inp, dim, index)
+
+    bench = Benchmark(
+        op_name="index_select",
+        torch_op=torch.index_select,
+        arg_func=index_select_args,
+        dtypes=FLOAT_DTYPES,
+        batch=REDUCTION_BATCH,
+        sizes=SIZES,
+    )
+    bench.run()
diff --git a/benchmark/test_special_perf.py b/benchmark/test_special_perf.py
@@ -1,6 +1,13 @@
 import torch
 
-from .performance_utils import FLOAT_DTYPES, POINTWISE_BATCH, SIZES, Benchmark
+from .performance_utils import (
+    FLOAT_DTYPES,
+    INT_DTYPES,
+    POINTWISE_BATCH,
+    SIZES,
+    Benchmark,
+    unary_int_arg,
+)
 
 
 def test_perf_embedding():
@@ -73,3 +80,19 @@ def resolve_conj_arg(dtype, batch, size):
         sizes=SIZES,
     )
     bench.run()
+
+
+def test_perf_unique():
+    def unique_kwargs(dtype, batch, size):
+        return {"sorted": True, "return_inverse": True, "return_counts": False}
+
+    bench = Benchmark(
+        op_name="unique",
+        torch_op=torch.unique,
+        arg_func=unary_int_arg,
+        dtypes=INT_DTYPES,
+        batch=POINTWISE_BATCH,
+        sizes=SIZES,
+        kwargs_func=unique_kwargs,
+    )
+    bench.run()