diff --git a/.ci/tritonbench/install-triton-nightly.sh b/.ci/tritonbench/install-triton-nightly.sh
new file mode 100644
index 000000000..4d79004f3
--- /dev/null
+++ b/.ci/tritonbench/install-triton-nightly.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [ -z "${BASE_CONDA_ENV}" ]; then
+  echo "ERROR: BASE_CONDA_ENV is not set"
+  exit 1
+fi
+
+if [ -z "${CONDA_ENV}" ]; then
+  echo "ERROR: CONDA_ENV is not set"
+  exit 1
+fi
+
+if [ -z "${SETUP_SCRIPT}" ]; then
+  echo "ERROR: SETUP_SCRIPT is not set"
+  exit 1
+fi
+
+CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
+conda activate "${BASE_CONDA_ENV}"
+# Remove the conda env if exists
+conda remove --name "${CONDA_ENV}" -y --all || true
+conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}"
+conda activate "${CONDA_ENV}"
+
+. "${SETUP_SCRIPT}"
+# Install the nightly openai/triton
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
diff --git a/.ci/tritonbench/test.sh b/.ci/tritonbench/test-install.sh
similarity index 87%
rename from .ci/tritonbench/test.sh
rename to .ci/tritonbench/test-install.sh
index 34604aae4..383f7d4cd 100644
--- a/.ci/tritonbench/test.sh
+++ b/.ci/tritonbench/test-install.sh
@@ -8,5 +8,5 @@ fi
 parent_dir=$(dirname "$(readlink -f "$0")")/../..
 cd ${parent_dir}
 
-# Test TritonBench
+# Test TritonBench installation
 python install.py --userbenchmark triton --fbgemm --test
diff --git a/.ci/tritonbench/test-operators.sh b/.ci/tritonbench/test-operators.sh
new file mode 100644
index 000000000..40af2f18f
--- /dev/null
+++ b/.ci/tritonbench/test-operators.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -x
+
+if [ -z "${SETUP_SCRIPT}" ]; then
+  echo "ERROR: SETUP_SCRIPT is not set"
+  exit 1
+fi
+
+. "${SETUP_SCRIPT}"
+
+# Test Tritonbench operators
+# TODO: test every operator, fwd+bwd
+python run_benchmark.py triton --op launch_latency --mode fwd --num-inputs 1 --test-only
+python run_benchmark.py triton --op addmm --mode fwd --num-inputs 1 --test-only
+python run_benchmark.py triton --op gemm --mode fwd --num-inputs 1 --test-only
+python run_benchmark.py triton --op sum --mode fwd --num-inputs 1 --test-only
+python run_benchmark.py triton --op softmax --mode fwd --num-inputs 1 --test-only
+python run_benchmark.py triton --op layer_norm --mode fwd --num-inputs 1 --test-only
+
+
+# Segfault
+# python run_benchmark.py triton --op flash_attention --mode fwd --num-inputs 1 --test-only
+
+# CUDA OOM
+# python run_benchmark.py triton --op jagged_layer_norm --mode fwd --num-inputs 1 --test-only
+# python run_benchmark.py triton --op jagged_mean --mode fwd --num-inputs 1 --test-only
+# python run_benchmark.py triton --op jagged_softmax --mode fwd --num-inputs 1 --test-only
+# python run_benchmark.py triton --op jagged_sum --mode fwd --num-inputs 1 --test-only
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index c9bdb3131..815506b7c 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -3,18 +3,22 @@ on:
   pull_request:
     # ignore tritonbench paths
     paths-ignore:
-      - 'torchbenchmark/operators'
+      - 'torchbenchmark/operators/*'
+      - 'torchbenchmark/util/kernels/*'
       - 'torchbenchmark/util/triton_op.py'
-      - 'userbenchmark/triton'
+      - 'userbenchmark/triton/*'
+      - '.ci/tritonbench/*'
   workflow_dispatch:
   push:
     branches:
       - main
     # ignore tritonbench paths
     paths-ignore:
-      - 'torchbenchmark/operators'
+      - 'torchbenchmark/operators/*'
+      - 'torchbenchmark/util/kernels/*'
       - 'torchbenchmark/util/triton_op.py'
-      - 'userbenchmark/triton'
+      - 'userbenchmark/triton/*'
+      - '.ci/tritonbench/*'
 
 jobs:
   cpu-test:
diff --git a/.github/workflows/tritonbench-test.yml b/.github/workflows/tritonbench-test.yml
new file mode 100644
index 000000000..7e58ccf39
--- /dev/null
+++ b/.github/workflows/tritonbench-test.yml
@@ -0,0 +1,63 @@
+name: Tritonbench PR Test on Triton nightly
+on:
+  pull_request:
+    paths:
+      - 'torchbenchmark/operators/*'
+      - 'torchbenchmark/util/kernels/*'
+      - 'torchbenchmark/util/triton_op.py'
+      - 'userbenchmark/triton/*'
+      - '.ci/tritonbench/*'
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - 'torchbenchmark/operators/*'
+      - 'torchbenchmark/util/kernels/*'
+      - 'torchbenchmark/util/triton_op.py'
+      - 'userbenchmark/triton/*'
+      - '.ci/tritonbench/*'
+
+jobs:
+  cuda-test:
+    # Don't run on forked repos
+    if: github.repository_owner == 'pytorch'
+    runs-on: [a100-runner]
+    timeout-minutes: 240
+    environment: docker-s3-upload
+    env:
+      BASE_CONDA_ENV: "torchbench"
+      CONDA_ENV: "tritonbench-pr-test-cuda"
+      SETUP_SCRIPT: "/workspace/setup_instance.sh"
+      TEST_CONFIG: "cuda"
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+    steps:
+      - name: Checkout TorchBench
+        uses: actions/checkout@v3
+        with:
+          submodules: 'true'
+      - name: Tune Nvidia GPU
+        run: |
+          sudo nvidia-smi -pm 1
+          sudo nvidia-smi -ac 1215,1410
+          sudo ldconfig
+          nvidia-smi
+      - name: Install triton-nightly
+        run: |
+          bash ./.ci/tritonbench/install-triton-nightly.sh
+      - name: Test Tritonbench install
+        run: |
+          bash ./.ci/tritonbench/test-install.sh
+      - name: Test Tritonbench operators
+        run: |
+          bash ./.ci/tritonbench/test-operators.sh
+      - name: Clean up Conda env
+        if: always()
+        run: |
+          . "${SETUP_SCRIPT}"
+          conda deactivate && conda deactivate
+          conda remove -n "${CONDA_ENV}" --all
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
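Taken together, the new workflow clones the base conda environment, installs the Triton nightly wheel into the clone, and smoke-tests a handful of operators in forward mode. A minimal local reproduction of that sequence is sketched below; it assumes the same environment names and setup-script path that the workflow sets ("torchbench", "tritonbench-pr-test-cuda", and "/workspace/setup_instance.sh"), which are specific to the CI runner and should be adjusted for your machine.

#!/bin/bash
# Sketch: reproduce the CI steps above locally, from the repository root.
# The env names and SETUP_SCRIPT path are copied from the workflow and are
# assumptions about the target machine, not requirements of the scripts.
set -euo pipefail

export BASE_CONDA_ENV="torchbench"
export CONDA_ENV="tritonbench-pr-test-cuda"
export SETUP_SCRIPT="/workspace/setup_instance.sh"

# 1. Clone the base env and install the Triton nightly wheel into the clone.
bash ./.ci/tritonbench/install-triton-nightly.sh

# 2. Verify the Tritonbench installation.
bash ./.ci/tritonbench/test-install.sh

# 3. Run the forward-mode smoke tests for the selected operators.
bash ./.ci/tritonbench/test-operators.sh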