Migrate CUDA benchmark over to reusable workflow (#4707)

q10 · facebook-github-bot · commit 635ffe7f3175 · 2025-08-15T13:58:40.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1735 Pull Request resolved: #4707 Reviewed By: cthi Differential Revision: D80308055 Pulled By: q10 fbshipit-source-id: ebfb90779937102240888c2ff42a72b7af516f31
diff --git a/.github/workflows/_fbgemm_gpu_cuda_test.yml b/.github/workflows/_fbgemm_gpu_cuda_test.yml
@@ -43,9 +43,15 @@ on:
         type: string
         required: false
         default: ""
+      run-target:
+        description: Run target (test, benchmark)
+        type: string
+        required: false  # optional: callers that omit it fall back to the default below
+        default: test
     secrets:
       PYPI_TOKEN:
-        required: true
+        # The PyPI token is only needed if publishing the artifact to PyPI is desired
+        required: false
 
 jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
@@ -145,11 +151,33 @@ jobs:
       run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
 
     - name: Test with PyTest
+      if: ${{ inputs.run-target == 'test' }}
       timeout-minutes: 60
       run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
 
+    - name: Run FBGEMM_GPU Benchmark
+      if: ${{ inputs.run-target == 'benchmark' }}
+      timeout-minutes: 40
+      run: . $PRELUDE; run_tbe_microbench $BUILD_ENV
+
+    - name: Upload Benchmark Traces as GHA Artifacts
+      if: ${{ inputs.run-target == 'benchmark' }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: fbgemm_gpu_traces_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.zip
+        path: fbgemm_gpu/bench/*.json
+        if-no-files-found: error
+
     - name: Push Wheel to PyPI
       if: ${{ inputs.publish-to-pypi && matrix.cuda-version == inputs.cuda-version-publish }}
       env:
         PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
-      run: . $PRELUDE; publish_to_pypi $BUILD_ENV "$PYPI_TOKEN" *.whl
+      run: |
+        . $PRELUDE;
+
+        if [[ -z "$PYPI_TOKEN" ]]; then
+          echo "PYPI_TOKEN is not set!" >&2
+          exit 1
+        fi
+
+        publish_to_pypi $BUILD_ENV "$PYPI_TOKEN" *.whl
diff --git a/.github/workflows/fbgemm_gpu_benchmark_cuda.yml b/.github/workflows/fbgemm_gpu_benchmark_cuda.yml
@@ -48,93 +48,22 @@ jobs:
 
   benchmark:
     needs: build
-    runs-on: ${{ matrix.host-machine.instance }}
-    defaults:
-      run:
-        shell: bash
-    env:
-      PRELUDE: .github/scripts/setup_env.bash
-      BUILD_ENV: build_binary
-      BUILD_VARIANT: cuda
-      BUILD_CUDA_VERSION: ${{ matrix.cuda-version }}
-      ENFORCE_CUDA_DEVICE: 1
-    strategy:
-      fail-fast: false
-      matrix:
-        build-target: [ "default" ]
-        host-machine: [
-          { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
-          # TODO: Enable when A100 machine queues are reasonably small enough for doing per-PR CI
-          # https://hud.pytorch.org/metrics
-          # { arch: x86, instance: "linux.gcp.a100" },
-        ]
-        python-version: [ "3.13" ]
-        cuda-version: [ "12.8.1" ]
-        compiler: [ "gcc" ]
-
-    steps:
-    # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
-    - name: Checkout the Repository
-      uses: actions/checkout@v4
-      with:
-        submodules: true
-
-    - name: Download Wheel Artifact from GHA
-      # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-      uses: actions/download-artifact@v4
-      with:
-        name: fbgemm_${{ matrix.build-target }}_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
-
-    # Use PyTorch test infrastructure action - https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml
-    - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
-      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-
-    - name: Display System Info
-      run: . $PRELUDE; print_system_info; print_ec2_info
-
-    - name: Display GPU Info
-      run: . $PRELUDE; print_gpu_info
-
-    - name: Setup Miniconda
-      run: . $PRELUDE; setup_miniconda $HOME/miniconda
-
-    - name: Create Conda Environment
-      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
-
-    - name: Install Build Tools
-      run: . $PRELUDE; install_build_tools $BUILD_ENV
-
-    - name: Install C/C++ Compilers for Updated LIBGCC
-      # NOTE: gcc is required for torch dynamo to work properly, as some of
-      # the compilation flags used by torch dynamo are gcc-specific:
-      #
-      #   clang-16: error: unknown argument: '-fno-tree-loop-vectorize'
-      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV gcc
-
-    - name: Install CUDA
-      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
-
-    # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
-    - name: Install PyTorch Nightly
-      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch-channel-version) || 'nightly' }} cuda/${{ matrix.cuda-version }}
-
-    - name: Collect PyTorch Environment Info
-      if: ${{ success() || failure() }}
-      run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
-
-    - name: Prepare FBGEMM_GPU Build
-      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
-
-    - name: Install FBGEMM_GPU Wheel
-      run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
-
-    - name: Run FBGEMM_GPU Benchmark
-      timeout-minutes: 40
-      run: . $PRELUDE; run_tbe_microbench $BUILD_ENV
-
-    - name: Upload Benchmark Traces as GHA Artifacts
-      uses: actions/upload-artifact@v4
-      with:
-        name: fbgemm_gpu_traces_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.zip
-        path: fbgemm_gpu/bench/*.json
-        if-no-files-found: error
+    uses: ./.github/workflows/_fbgemm_gpu_cuda_test.yml
+    with:  # matrix and extra-env carry JSON text; keep it strict JSON (no trailing commas)
+      matrix: >-
+        {
+          "build-target": [ "default" ],
+          "host-machine": [
+            { "arch": "x86", "instance": "linux.g5.4xlarge.nvidia.gpu" }
+          ],
+          "python-version": [ "3.13" ],
+          "cuda-version": [ "12.8.1" ],
+          "compiler": [ "gcc" ]
+        }
+      repo-ref: ${{ github.ref }}
+      pytorch-channel-version: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch-channel-version) || 'nightly' }}
+      run-target: benchmark
+      extra-env: >-
+        {
+          "ENFORCE_CUDA_DEVICE": 1
+        }
diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
@@ -27,11 +27,7 @@
 )
 from fbgemm_gpu.split_table_batched_embeddings_ops_training import RESParams
 from fbgemm_gpu.tbe.ssd import SSDTableBatchedEmbeddingBags
-from fbgemm_gpu.tbe.utils import (
-    b_indices,
-    get_table_batched_offsets_from_dense,
-    round_up,
-)
+from fbgemm_gpu.tbe.utils import b_indices, get_table_batched_offsets_from_dense
 from hypothesis import assume, given, settings, Verbosity
 from torch import distributed as dist