diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
index 8bbf4ff2c..15d0d5c61 100644
--- a/.github/scripts/utils_cuda.bash
+++ b/.github/scripts/utils_cuda.bash
@@ -70,6 +70,12 @@ install_cuda () {
   nm -gDC "${libcuda_path}"
   append_to_library_path "${env_name}" "$(dirname "$libcuda_path")"
 
+  # The symlink appears to be missing when we attempt to run FBGEMM_GPU on the
+  # `ubuntu-latest` runners on GitHub, so we have to manually add this in.
+  if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
+    print_exec ln "${libcuda_path}" -s "$(dirname "$libcuda_path")/libcuda.so.1"
+  fi
+
   echo "[INSTALL] Set environment variable NVML_LIB_PATH ..."
   # shellcheck disable=SC2155,SC2086
   local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash
index 3d07e437c..42ea6c0f6 100644
--- a/.github/scripts/utils_pytorch.bash
+++ b/.github/scripts/utils_pytorch.bash
@@ -127,7 +127,12 @@ install_pytorch_pip () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  # Install the package from PyTorch PIP (not PyPI)
+  # Install the main dependencies
+  # shellcheck disable=SC2086
+  (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
+    numpy) || return 1
+
+  # Install the torch package from PyTorch PIP (not PyPI)
   install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1
 
   # Check that PyTorch is importable
diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
index ba5605a28..a81ece336 100644
--- a/.github/scripts/utils_system.bash
+++ b/.github/scripts/utils_system.bash
@@ -73,6 +73,57 @@ free_disk_space () {
   echo "[CLEANUP] Freed up some disk space"
 }
 
+free_disk_space_on_host () {
+  echo "################################################################################"
+  echo "# Free Disk Space On CI Host"
+  echo "################################################################################"
+
+  # NOTE: This is meant to be run from ** inside ** containers hosted on
+  # non-PyTorch-infra GitHub runners, where the hosts might be close to full
+  # disk from serving many CI jobs. When the container is set up properly, we
+  # can escape the container using nsenter to run commands on the host.
+  #
+  # On average, we see roughly 3GB of disk freed when running this cleanup,
+  # which appears to be sufficient to avoid the somewhat-frequent out-of-disk
+  # errors that we were previously running into.
+  #
+  # Frees up disk space on the ubuntu-latest host machine based on recommendations:
+  #   https://github.com/orgs/community/discussions/25678
+  #   https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
+  #
+  # Escape the docker container to run the free disk operation on the host:
+  #   https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
+  #   https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387
+
+  nsenter -t 1 -m -u -n -i bash -c "
+    echo 'Listing 100 largest packages';
+    dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
+    df -h;
+
+    echo 'Removing large packages';
+    sudo apt-get remove -y '^ghc-8.*';
+    sudo apt-get remove -y '^dotnet-.*';
+    sudo apt-get remove -y '^llvm-.*';
+    sudo apt-get remove -y 'php.*';
+    sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
+    sudo apt-get autoremove -y;
+    sudo apt-get clean;
+    df -h;
+
+    echo 'Removing large directories';
+    rm -rf /usr/local/android;
+    rm -rf /usr/share/dotnet;
+    rm -rf /usr/local/share/boost;
+    rm -rf /opt/ghc;
+    rm -rf /usr/local/share/chrom*;
+    rm -rf /usr/share/swift;
+    rm -rf /usr/local/julia*;
+    rm -rf /usr/local/lib/android;
+    rm -rf /opt/hostedtoolcache;
+    df -h;
+  "
+}
+
 
 ################################################################################
 # Info Functions
@@ -91,7 +142,7 @@ print_gpu_info () {
 
   (lspci -v | grep -e 'controller.*NVIDIA') || true
 
-  if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
+  if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
     # Ensure that nvidia-smi is available and returns GPU entries
     if ! nvidia-smi; then
       echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
diff --git a/.github/workflows/fbgemm_gpu_ci_genai.yml b/.github/workflows/fbgemm_gpu_ci_genai.yml
index 07f8f8059..3ffdf45c0 100644
--- a/.github/workflows/fbgemm_gpu_ci_genai.yml
+++ b/.github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:
 
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    # runs-on: linux.4xlarge.nvidia.gpu
     # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
     runs-on: ${{ matrix.host-machine.instance }}
     defaults:
diff --git a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
new file mode 100644
index 000000000..ef9f78ed6
--- /dev/null
+++ b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,199 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This workflow is used for FBGEMM_GPU-GenAI CI, and is meant to be used for
+# copies of the FBGEMM repos hosted outside of the pytorch org.
+
+name: FBGEMM_GPU-GenAI CI (Generic Runner)
+
+on:
+  # PR Trigger
+  #
+  pull_request:
+    branches:
+      - main
+
+  # Push Trigger (enable to catch errors coming out of multiple merges)
+  #
+  push:
+    branches:
+      - main
+
+  # Manual Trigger
+  #
+  workflow_dispatch:
+
+concurrency:
+  # Cancel previous runs in the PR if a new commit is pushed
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Build on CPU hosts and upload to GHA
+  build_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        compiler: [ "gcc", "clang" ]
+
+    steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+    - name: Checkout the Repository
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+
+    - name: Free Disk Space on Host
+      run: . $PRELUDE; free_disk_space_on_host
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info
+
+    - name: Display GPU Info
+      run: . $PRELUDE; print_gpu_info
+
+    - name: Setup Miniconda
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install C/C++ Compilers
+      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
+
+    - name: Install Build Tools
+      run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+    - name: Install CUDA
+      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+    # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
+    - name: Install PyTorch Nightly
+      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+    - name: Install cuDNN
+      run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
+
+    - name: Prepare FBGEMM_GPU Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Build FBGEMM_GPU Wheel
+      run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai
+
+    - name: Upload Built Wheel as GHA Artifact
+      # Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
+      uses: actions/upload-artifact@v3
+      with:
+        name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+        path: fbgemm_gpu/dist/*.whl
+        if-no-files-found: error
+
+
+  # Download the built artifact from GHA, test on GPU, and push to PyPI
+  test_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+      ENFORCE_CUDA_DEVICE: 0
+      CUDA_VISIBLE_DEVICES: -1
+      ADD_LIBCUDA_SYMLINK: 1
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        # Specify exactly ONE CUDA version for artifact publish
+        cuda-version-publish: [ "12.1.1" ]
+        compiler: [ "gcc", "clang" ]
+    needs: build_artifact
+
+    steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+    - name: Checkout the Repository
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+
+    - name: Free Disk Space on Host
+      run: . $PRELUDE; free_disk_space_on_host
+
+    - name: Download Wheel Artifact from GHA
+      # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
+      uses: actions/download-artifact@v3
+      with:
+        name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+
+    - name: Display System Info
+      run: . $PRELUDE; print_system_info; print_ec2_info
+
+    - name: Display GPU Info
+      run: . $PRELUDE; print_gpu_info
+
+    - name: Setup Miniconda
+      run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+    - name: Create Conda Environment
+      run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+    - name: Install C/C++ Compilers for Updated LIBGCC
+      run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang
+
+    - name: Install CUDA
+      run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+    - name: Install PyTorch Nightly
+      run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+    - name: Collect PyTorch Environment Info
+      if: ${{ success() || failure() }}
+      run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+    - name: Prepare FBGEMM_GPU Build
+      run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+    - name: Install FBGEMM_GPU Wheel
+      run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
+
+    - name: Test with PyTest
+      timeout-minutes: 30
+      run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
diff --git a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
index 43c1dafb4..8446e34e3 100755
--- a/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
+++ b/fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
@@ -137,7 +137,7 @@ def gqa_reference(
 
 class Int4GQATest(unittest.TestCase):
     @unittest.skipIf(
-        not torch.version.cuda or torch.cuda.get_device_capability()[0] < 8,
+        not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 8,
         "Skip when CUDA is not available or CUDA compute capability is less than 8",
     )
     @settings(verbosity=VERBOSITY, max_examples=40, deadline=None)
@@ -243,7 +243,7 @@ def test_gqa(
     )
     # pyre-fixme[56]
     @unittest.skipIf(
-        not torch.version.cuda or not HAS_XFORMERS,
+        not torch.cuda.is_available() or not HAS_XFORMERS,
         "Skip when CUDA is not available or xformers is not available",
     )
     def test_mqa_main(  # noqa C901
diff --git a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
index 03021ad86..cbdf1b0bc 100644
--- a/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
+++ b/fbgemm_gpu/fbgemm_gpu/permute_pooled_embedding_modules.py
@@ -11,6 +11,7 @@
 from typing import List, Optional
 
 import torch
+
 from fbgemm_gpu.utils.loader import load_torch_module
 
 try: