feat(backend): Implement job status endpoint

AdrianoKF · AdrianoKF · commit 26b30e2e06ca · 2024-08-16T12:16:31.000+02:00
The `GET /jobs/{uid}/status` API endpoint reports the high-level
execution status of a Job (identified by its UID; the namespace is an
optional query parameter that defaults to `default`).

The associated Kueue workload for the job is looked up and its status
is used to determine the execution status (pending, executing, failed,
succeeded).
diff --git a/backend/src/jobs_server/models.py b/backend/src/jobs_server/models.py
@@ -1,4 +1,5 @@
 import re
+from enum import StrEnum
 from typing import Annotated, Any, TypeAlias
 
 from jobs import JobOptions
@@ -49,3 +50,10 @@ class WorkloadIdentifier(BaseModel):
 
     namespace: StrictStr
     uid: StrictStr
+
+
+class JobStatus(StrEnum):
+    """High-level execution status of a job, as reported by the status endpoint."""
+
+    # Meanings below follow KueueWorkload.execution_status (utils/kueue.py).
+    # No "Succeeded"/"Failed" condition and no admission on the workload.
+    PENDING = "pending"
+    # The workload status carries an admission.
+    EXECUTING = "executing"
+    # The workload has a condition whose reason is "Succeeded".
+    SUCCEEDED = "succeeded"
+    # The workload has a condition whose reason is "Failed".
+    FAILED = "failed"
diff --git a/backend/src/jobs_server/routers/jobs.py b/backend/src/jobs_server/routers/jobs.py
@@ -1,8 +1,14 @@
 from fastapi import APIRouter, HTTPException
 from jobs import Image, Job
 
-from jobs_server.models import CreateJobModel, ExecutionMode, WorkloadIdentifier
+from jobs_server.models import (
+    CreateJobModel,
+    ExecutionMode,
+    JobId,
+    WorkloadIdentifier,
+)
 from jobs_server.runner import Runner
+from jobs_server.utils.kueue import KueueWorkload
 
 router = APIRouter(tags=["Job management"])
 
@@ -33,3 +39,9 @@ def job_fn(): ...
     image = Image(opts.image_ref)
     workload_id = runner.run(job, image, opts.submission_context)
     return workload_id
+
+
+@router.get("/jobs/{uid}/status")
+async def status(uid: JobId, namespace: str = "default"):
+    """
+    Report the high-level execution status of a job.
+
+    The Kueue workload managing the job is looked up by the job's UID and
+    its execution status is returned.
+
+    Parameters
+    ----------
+    uid : JobId
+        UID of the job whose status is requested (path parameter).
+    namespace : str, optional
+        Namespace in which to look for the workload. Default is "default".
+
+    Returns
+    -------
+    JobStatus
+        The execution status derived from the workload.
+
+    Raises
+    ------
+    HTTPException
+        With status code 404 if no workload exists for the given job UID
+        in the namespace.
+    """
+    # Imported locally so the not-found check below does not depend on
+    # changes to this module's import block.
+    from jobs_server.utils.kueue import workload_by_managed_uid
+
+    # NOTE(review): the lookup goes through the synchronous Kubernetes
+    # client; inside an `async def` handler this presumably blocks the
+    # event loop -- consider a plain `def` handler.
+    workload = workload_by_managed_uid(uid, namespace)
+    if workload is None:
+        # Without this check, None would be handed to model validation and
+        # surface as a server error instead of "not found".
+        raise HTTPException(
+            status_code=404,
+            detail=f"No workload found for job {uid!r} in namespace {namespace!r}",
+        )
+    return KueueWorkload.model_validate(workload).execution_status
diff --git a/backend/src/jobs_server/utils/helpers.py b/backend/src/jobs_server/utils/helpers.py
@@ -10,3 +10,89 @@ def remove_none_values(d: T) -> T:
     """Remove all keys with a ``None`` value from a dict."""
     filtered_dict = {k: v for k, v in d.items() if v is not None}
     return cast(T, filtered_dict)
+
+
+def traverse(d: Any, key_path: str, sep: str = ".", strict: bool = True) -> Any:
+    """
+    Retrieve a value from a nested Mapping-like object using a key path.
+
+    If a container along the path supports item access (i.e., implements
+    `__getitem__`), its elements are accessed by key.
+    Otherwise, the path segment is resolved as an attribute of the object.
+
+    Parameters
+    ----------
+    d : Any
+        The object to traverse.
+    key_path : str
+        A string representing the path of keys, separated by `sep`.
+    sep : str, optional
+        The separator used to split the key path into individual keys.
+        Default is ".".
+    strict : bool, optional
+        If True, raise a KeyError when a key in the path does not exist.
+        If False, return None instead.
+        Default is True.
+
+    Returns
+    -------
+    Any
+        The value at the specified key path, or None if a key is missing
+        and `strict` is False. None is also returned as soon as a None
+        value is encountered along the path, even in strict mode.
+
+    Raises
+    ------
+    KeyError
+        If `strict` is True and any key in the path does not exist.
+        The missing key is the exception's argument.
+
+    Examples
+    --------
+    >>> d = {"foo": {"bar": {"baz": 42}}}
+    >>> traverse(d, "foo.bar.baz")
+    42
+
+    >>> traverse(d, "foo.bar.qux", strict=False) is None
+    True
+
+    >>> traverse(d, "foo.bar.qux", strict=True)
+    Traceback (most recent call last):
+    ...
+    KeyError: 'qux'
+    """
+
+    def has_item(container, key):
+        # Containers are probed for membership, plain objects for an
+        # attribute of that name.
+        if hasattr(container, "__contains__"):
+            return key in container
+        return hasattr(container, key)
+
+    def get_item(container, key, default=None):
+        # Item access wins whenever the container supports it; a failed
+        # lookup yields the default rather than falling back to attributes.
+        if hasattr(container, "__getitem__"):
+            try:
+                return container[key]
+            except (KeyError, IndexError, TypeError):
+                return default
+        return getattr(container, key, default)
+
+    for key in key_path.split(sep):
+        # Bail out on missing keys in strict mode, naming the offending key
+        if strict and not has_item(d, key):
+            raise KeyError(key)
+
+        d = get_item(d, key)
+        if d is None:
+            # Nothing left to descend into
+            return None
+    return d
diff --git a/backend/src/jobs_server/utils/kubernetes.py b/backend/src/jobs_server/utils/kubernetes.py
@@ -8,6 +8,7 @@
 from jobs.job import Job
 
 from jobs_server.models import SubmissionContext
+from jobs_server.utils.helpers import traverse
 
 
 def sanitize_rfc1123_domain_name(s: str) -> str:
@@ -70,3 +71,71 @@ def namespace(self) -> str:
         _, active_context = kubernetes.config.list_kube_config_contexts()
         current_namespace = active_context["context"].get("namespace")
         return self._namespace or current_namespace
+
+
+def filter_conditions(
+    obj: dict[str, Any],
+    typ: str | None = None,
+    reason: str | None = None,
+    message: str | None = None,
+) -> list[dict[str, Any]]:
+    """
+    Filters Kubernetes object conditions based on specified attributes.
+
+    This function filters the `status.conditions` field of a Kubernetes object
+    by matching conditions against the provided `type`, `reason`, and `message`
+    attributes. Only conditions that match all specified attributes are included
+    in the result.
+
+    Parameters
+    ----------
+    obj : dict[str, Any]
+        The Kubernetes object, typically a dictionary representing a Kubernetes
+        resource, containing a `status.conditions` field. The field is resolved
+        with `traverse`, so attribute-style objects work as well.
+    typ : str, optional
+        The type of condition to filter by. If `None`, this filter is not applied.
+    reason : str, optional
+        The reason attribute to filter by. If `None`, this filter is not applied.
+    message : str, optional
+        The message attribute to filter by. If `None`, this filter is not applied.
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        A list of conditions that match the specified filters. Each condition
+        is represented as a dictionary.
+
+    Raises
+    ------
+    KeyError
+        If the object has no `status.conditions` field.
+
+    Notes
+    -----
+    - The function assumes that the `status.conditions` field exists in the
+      provided object and that it is a list of condition dictionaries.
+    - A condition that lacks one of the filtered attributes simply does not
+      match; it does not raise.
+    - If no conditions match the specified filters (or the conditions value
+      is `None`), an empty list is returned.
+
+    Examples
+    --------
+    >>> obj = {
+    ...     "status": {
+    ...         "conditions": [
+    ...             {"type": "Ready", "reason": "DeploymentCompleted", "message": "Deployment successful."},
+    ...             {"type": "Failed", "reason": "DeploymentFailed", "message": "Deployment failed due to timeout."}
+    ...         ]
+    ...     }
+    ... }
+    >>> filter_conditions(obj, typ="Ready")
+    [{'type': 'Ready', 'reason': 'DeploymentCompleted', 'message': 'Deployment successful.'}]
+
+    >>> filter_conditions(obj, reason="DeploymentFailed")
+    [{'type': 'Failed', 'reason': 'DeploymentFailed', 'message': 'Deployment failed due to timeout.'}]
+    """
+
+    # Only attributes the caller actually constrained take part in matching.
+    requested = {"type": typ, "reason": reason, "message": message}
+    active = {field: value for field, value in requested.items() if value is not None}
+
+    def _match(cond):
+        # `.get` keeps conditions without an optional attribute from raising;
+        # active filter values are never None, so a missing attribute cannot
+        # match by accident.
+        return all(cond.get(field) == value for field, value in active.items())
+
+    # An explicit `None` conditions value is treated like an empty list.
+    conditions = traverse(obj, "status.conditions") or []
+    return [cond for cond in conditions if _match(cond)]
diff --git a/backend/src/jobs_server/utils/kueue.py b/backend/src/jobs_server/utils/kueue.py
@@ -1,9 +1,14 @@
 from collections.abc import Mapping
-from typing import cast
+from typing import Any, cast
 
 from jobs.job import Job
 from jobs.utils.helpers import remove_none_values
 from kubernetes import client
+from pydantic import BaseModel, ConfigDict
+
+from jobs_server.models import JobId, JobStatus
+from jobs_server.utils.helpers import traverse
+from jobs_server.utils.kubernetes import filter_conditions
 
 
 def assert_kueue_localqueue(namespace: str, name: str) -> bool:
@@ -63,3 +68,73 @@ def kueue_scheduling_labels(job: Job, namespace: str) -> Mapping[str, str]:
             ),
         }),
     )
+
+
+def workload_by_managed_uid(uid: JobId, namespace: str):
+    """Look up the Kueue Workload that manages the job with the given UID.
+
+    Workloads in ``namespace`` are listed with a label selector on
+    ``kueue.x-k8s.io/job-uid``. The first listed item is returned, or
+    ``None`` when the listing contains no items.
+    """
+
+    custom_objects = client.CustomObjectsApi()
+    selector = f"kueue.x-k8s.io/job-uid={uid}"
+    response = custom_objects.list_namespaced_custom_object(
+        "kueue.x-k8s.io",
+        "v1beta1",
+        namespace,
+        "workloads",
+        label_selector=selector,
+    )
+    workloads = response.get("items")
+
+    # Covers both a missing "items" entry and an empty result list.
+    return workloads[0] if workloads else None
+
+
+class WorkloadSpec(BaseModel):
+    """Subset of a Kueue Workload's ``spec`` section used by the server."""
+
+    podSets: list
+    queueName: str
+    # The priority-related fields are optional in the Kueue v1beta1 API and
+    # may be absent for workloads without a priority class, so they must not
+    # be required here (otherwise validation of such workloads fails).
+    priorityClassName: str | None = None
+    priority: int | None = None
+    priorityClassSource: str | None = None
+    active: bool
+
+
+class WorkloadAdmission(BaseModel):
+    """Admission details of a workload (the ``admission`` field of its status)."""
+
+    # Both fields are required whenever an admission is present.
+    clusterQueue: str
+    podSetAssignments: list
+
+
+class WorkloadStatus(BaseModel):
+    """Status section of a Kueue Workload.
+
+    Only ``conditions`` is required; every other field defaults to ``None``.
+    """
+
+    # Condition entries are kept as plain dictionaries (not modelled further).
+    conditions: list[dict[str, Any]]
+    # A non-None admission is what ``KueueWorkload.execution_status`` treats
+    # as "executing" once no terminal condition is found.
+    admission: WorkloadAdmission | None = None
+    requeueState: Any | None = None
+    reclaimablePods: list | None = None
+    admissionChecks: list | None = None
+
+
+class KueueWorkload(BaseModel):
+    """Typed view of a Kueue Workload resource.
+
+    See https://kueue.sigs.k8s.io/docs/reference/kueue.v1beta1/#kueue-x-k8s-io-v1beta1-Workload.
+    """
+
+    metadata: dict[str, Any]
+    spec: WorkloadSpec
+    status: WorkloadStatus
+
+    model_config = ConfigDict(arbitrary_types_allowed=False)
+
+    @classmethod
+    def for_managed_resource(cls, uid: str, namespace: str):
+        """Build the wrapper for the workload managing the job with UID ``uid``.
+
+        The raw object returned by ``workload_by_managed_uid`` is validated
+        against this model.
+
+        NOTE(review): that lookup returns ``None`` when nothing matches, and
+        the ``None`` is passed straight to validation -- callers should
+        presumably expect a validation error in that case; confirm.
+        """
+        raw_workload = workload_by_managed_uid(uid, namespace)
+        return cls.model_validate(raw_workload)
+
+    @property
+    def execution_status(self) -> JobStatus:
+        """Derive the high-level job status from the workload's status.
+
+        A condition with reason "Succeeded" wins, then one with reason
+        "Failed"; otherwise the presence of an admission distinguishes
+        executing from pending.
+        """
+        terminal_outcomes = (
+            ("Succeeded", JobStatus.SUCCEEDED),
+            ("Failed", JobStatus.FAILED),
+        )
+        for reason, outcome in terminal_outcomes:
+            if filter_conditions(self, reason=reason):
+                return outcome
+
+        admitted = traverse(self, "status.admission", strict=False) is not None
+        return JobStatus.EXECUTING if admitted else JobStatus.PENDING
diff --git a/backend/tests/unit/test_util.py b/backend/tests/unit/test_util.py
@@ -0,0 +1,30 @@
+from typing import Any
+
+import pytest
+
+from jobs_server.utils.helpers import traverse
+
+
+@pytest.mark.parametrize(
+    "d, key_path, sep, strict, expected",
+    [
+        # Non-strict mode: missing keys and None leaves both yield None
+        ({"foo": {"bar": {"baz": 42}}}, "foo.bar.baz", ".", False, 42),
+        ({"foo": {"bar": {"baz": 42}}}, "foo.bar.qux", ".", False, None),
+        ({"foo": {"bar": {"baz": 42}}}, "foo.qux.baz", ".", False, None),
+        ({"foo": {"bar": {"baz": None}}}, "foo.bar.baz", ".", False, None),
+        # Strict mode: missing keys raise, existing None leaves do not
+        ({"foo": {"bar": {"baz": 42}}}, "foo.bar.baz", ".", True, 42),
+        ({"foo": {"bar": {"baz": 42}}}, "foo.qux.baz", ".", True, KeyError),
+        ({"foo": {"bar": {"baz": 42}}}, "foo-bar-qux", "-", True, KeyError),
+        ({"foo": {"bar": {"baz": None}}}, "foo.bar.baz", ".", True, None),
+    ],
+)
+def test_path_dict(
+    d: dict[str, Any], key_path: str, sep: str, strict: bool, expected: Any
+):
+    """Check `traverse` results, or the raised exception when `expected` is a type."""
+    expects_exception = strict and isinstance(expected, type)
+    if not expects_exception:
+        assert traverse(d, key_path, sep, strict) == expected
+        return
+
+    with pytest.raises(expected):
+        traverse(d, key_path, sep, strict)