Skip to content

remove flaky remote function failure (#244) #1070

remove flaky remote function failure (#244)

remove flaky remote function failure (#244) #1070

Workflow file for this run

# CI workflow: builds monarch from source on a GPU runner and runs the Python
# test suite. Triggered by pushes to main and by pull requests targeting main
# or any gh/** branch.
name: Build monarch
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
      - gh/**
concurrency:
  # On main the group is keyed by run number, so every push gets its own group
  # and is never cancelled. On any other ref the group is keyed by the ref, so
  # a newer run for the same ref cancels the one still in progress.
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
jobs:
  test:
    name: cuda12.6-py3.10-4xlarge
    strategy:
      fail-fast: true
      matrix:
        include:
          - name: 4xlarge
            runs-on: linux.g5.4xlarge.nvidia.gpu
            # NOTE(review): torch-spec is not referenced anywhere else in this
            # file as shown -- confirm whether it is still needed.
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.6"
    # Reusable workflow call: the job body is the `script` input below.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.10 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        # Install native dependencies
        dnf update -y
        dnf install clang-devel libunwind libunwind-devel -y
        # Install rust and setup nightly toolchain
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
        source "$HOME/.cargo/env"
        rustup toolchain install nightly
        rustup default nightly
        # Install build dependencies
        pip install -r build-requirements.txt
        # Install test dependencies
        pip install -r python/tests/requirements.txt
        # Install remote process_allocator binary (some tests use it)
        cargo install --path monarch_hyperactor
        # Build and install monarch
        # NB: monarch currently can't be built in isolated builds (i.e. it is
        # not PEP 517 compatible) because 'torch-sys' needs to be compiled
        # against the 'torch' in the main python environment so that libtorch
        # is linked correctly at runtime.
        pip install --no-build-isolation .
        # Run tests
        LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" -n auto
        python python/tests/test_mock_cuda.py