import tempfile
from dataclasses import dataclass
from datetime import datetime
+from subprocess import CalledProcessError, PIPE
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple

import torchx

    macros,
    NONE,
    ReplicaStatus,
+    Resource,
    Role,
    RoleStatus,
    runopts,

    "TIMEOUT": AppState.FAILED,
}

+
+def appstate_from_slurm_state(slurm_state: str) -> AppState:
+    return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
+
+
SBATCH_JOB_OPTIONS = {
    "comment",
    "mail-user",
@@ -483,15 +490,34 @@ def _cancel_existing(self, app_id: str) -> None:

    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
        try:
-            return self._describe_sacct(app_id)
-        except subprocess.CalledProcessError:
            return self._describe_squeue(app_id)
+        except CalledProcessError as e:
+            # NOTE: squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
+            # if the job does not exist or has finished (e.g. is not in PENDING or RUNNING state);
+            # in this case, fall back to the less descriptive but more persistent sacct
+            # (the slurm cluster must have accounting storage enabled for sacct to work)
+            log.info(
+                "unable to get job info for `{}` with `squeue` ({}), trying `sacct`".format(
+                    app_id, e.stderr
+                )
+            )
+            return self._describe_sacct(app_id)

    def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
-        p = subprocess.run(
-            ["sacct", "--parsable2", "-j", app_id], stdout=subprocess.PIPE, check=True
-        )
-        output = p.stdout.decode("utf-8").split("\n")
+        try:
+            output = subprocess.check_output(
+                ["sacct", "--parsable2", "-j", app_id],
+                stderr=PIPE,
+                encoding="utf-8",
+            ).split("\n")
+        except CalledProcessError as e:
+            log.info(
+                "unable to get job info for `{}` with `sacct` ({})".format(
+                    app_id, e.stderr
+                )
+            )
+            return None
+
        if len(output) <= 1:
            return None

@@ -511,11 +537,7 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:

            state = row["State"]
            msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+            app_state = appstate_from_slurm_state(state)

            role, _, replica_id = row["JobName"].rpartition("-")
            if not replica_id or not role:
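
A small sketch of the subprocess pattern used in `describe` / `_describe_sacct` above, using only the standard library: `check_output` with `stderr=PIPE` and `encoding="utf-8"` raises a `CalledProcessError` whose `e.stderr` is already decoded text, which is what the `log.info` calls interpolate. The command below is an arbitrary failing example, not an actual `squeue`/`sacct` invocation.

import subprocess
from subprocess import PIPE, CalledProcessError

def try_describe(cmd: list[str]) -> None:
    try:
        out = subprocess.check_output(cmd, stderr=PIPE, encoding="utf-8")
        print("stdout:", out.strip())
    except CalledProcessError as e:
        # e.stderr is a str (not bytes) because encoding="utf-8" was passed
        print("unable to get job info ({})".format(e.stderr.strip()))

try_describe(["ls", "/no/such/path"])  # stand-in for a failing squeue/sacct call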
@@ -540,46 +562,92 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
                msg=msg,
            )

-    def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
-        p = subprocess.run(
-            ["squeue", "--json", "-j", app_id], stdout=subprocess.PIPE, check=True
+    def _describe_squeue(self, app_id: str) -> DescribeAppResponse:
+        # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
+        # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
+        output = subprocess.check_output(
+            ["squeue", "--json", "-j", app_id], stderr=PIPE, encoding="utf-8"
        )
-        output_json = json.loads(p.stdout.decode("utf-8"))

-        roles = {}
-        roles_statuses = {}
-        msg = ""
-        app_state = AppState.UNKNOWN
-        for job in output_json["jobs"]:
-            state = job["job_state"][0]
-            msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+        output_json = json.loads(output)
+        jobs = output_json["jobs"]

-            role, _, replica_id = job["name"].rpartition("-")
-            if not replica_id or not role:
-                # name should always have at least 3 parts but sometimes sacct
-                # is slow to update
-                continue
-            if role not in roles:
-                roles[role] = Role(name=role, num_replicas=0, image="")
-                roles_statuses[role] = RoleStatus(role, [])
-            roles[role].num_replicas += 1
-            roles_statuses[role].replicas.append(
-                ReplicaStatus(
-                    id=int(replica_id), role=role, state=app_state, hostname=""
+        roles: dict[str, Role] = {}
+        roles_statuses: dict[str, RoleStatus] = {}
+        state = AppState.UNKNOWN
+
+        for job in jobs:
+            # job name is of the form "{role_name}-{replica_id}"
+            role_name, _, replica_id = job["name"].rpartition("-")
+
+            entrypoint = job["command"]
+            image = job["current_working_directory"]
+            state = appstate_from_slurm_state(job["job_state"][0])
+
+            job_resources = job["job_resources"]
+
+            role = roles.setdefault(
+                role_name,
+                Role(
+                    name=role_name,
+                    image=image,
+                    entrypoint=entrypoint,
+                    num_replicas=0,
                ),
            )
+            role_status = roles_statuses.setdefault(
+                role_name,
+                RoleStatus(role_name, replicas=[]),
+            )
+
+            if state == AppState.PENDING:
+                # NOTE: torchx-launched jobs point to exactly one host;
+                # otherwise, scheduled_nodes could be a node list expression (e.g. 'slurm-compute-node[0-20,21,45-47]')
+                hostname = job_resources["scheduled_nodes"]
+                role.num_replicas += 1
+                role_status.replicas.append(
+                    ReplicaStatus(
+                        id=int(replica_id),
+                        role=role_name,
+                        state=state,
+                        hostname=hostname,
+                    )
+                )
+            else:  # state == AppState.RUNNING
+                # NOTE: torchx schedules on slurm with sbatch + heterogeneous jobs,
+                # where each replica is a "sub-job", so `allocated_nodes` will always be 1;
+                # but we also deal with jobs that were not launched with torchx,
+                # which can have multiple hosts per sub-job (count them as replicas)
+                node_infos = job_resources.get("allocated_nodes", [])
+                for node_info in node_infos:
+                    # NOTE: we expect resource specs for all the nodes to be the same
+                    # NOTE: use allocated (not used/requested) memory since
+                    # users may only specify --cpu, in which case slurm
+                    # uses the (system) configured {mem-per-cpu} * {cpus}
+                    # to allocate memory.
+                    # NOTE: getting gpus is tricky because it is modeled as a trackable resource
+                    # or not configured at all (use total-cpu-on-host as a proxy for gpus)
+                    cpu = int(node_info["cpus_used"])
+                    memMB = int(node_info["memory_allocated"])
+
+                    hostname = node_info["nodename"]
+
+                    role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                    role.num_replicas += 1
+                    role_status.replicas.append(
+                        ReplicaStatus(
+                            id=int(replica_id),
+                            role=role_name,
+                            state=state,
+                            hostname=hostname,
+                        )
+                    )

        return DescribeAppResponse(
            app_id=app_id,
            roles=list(roles.values()),
            roles_statuses=list(roles_statuses.values()),
-            state=app_state,
-            msg=msg,
+            state=state,
        )

    def log_iter(
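
For reference, a self-contained sketch of the `squeue --json` payload shape that the new `_describe_squeue` assumes. The field values below are made up; only the key names (`jobs`, `name`, `command`, `current_working_directory`, `job_state`, `job_resources`, `scheduled_nodes`, `allocated_nodes`, `cpus_used`, `memory_allocated`, `nodename`) are taken from the code above, and it shows how a job name splits into role and replica id.

import json

# made-up payload; only the key names mirror what _describe_squeue reads
sample = json.loads("""
{
  "jobs": [
    {
      "name": "trainer-0",
      "command": "/usr/bin/python -m train",
      "current_working_directory": "/home/user/app",
      "job_state": ["RUNNING"],
      "job_resources": {
        "scheduled_nodes": "slurm-compute-node-0",
        "allocated_nodes": [
          {"nodename": "slurm-compute-node-0", "cpus_used": "8", "memory_allocated": "16384"}
        ]
      }
    }
  ]
}
""")

for job in sample["jobs"]:
    # job name is of the form "{role_name}-{replica_id}"
    role_name, _, replica_id = job["name"].rpartition("-")
    print(role_name, int(replica_id), job["job_state"][0])  # trainer 0 RUNNING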