Skip to content

Kueue job priorities #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 24, 2024
11 changes: 8 additions & 3 deletions example_hello.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import time

from jobs import ImageOptions, JobOptions, ResourceOptions, job
from jobs import ImageOptions, JobOptions, ResourceOptions, SchedulingOptions, job
from jobs.cli import submit_job


Expand All @@ -9,14 +10,18 @@
# A job with explicit Dockerfile
image=ImageOptions(
dockerfile="Dockerfile",
name="hello-world-yaml",
name="hello-world-dev",
tag="latest",
),
resources=ResourceOptions(memory="2Gi", cpu="1", gpu=None),
resources=ResourceOptions(memory="256Mi", cpu="1"),
scheduling=SchedulingOptions(
priority_class="background",
),
)
)
def hello_world():
print("Hello, World!")
time.sleep(60)


if __name__ == "__main__":
Expand Down
27 changes: 27 additions & 0 deletions example_hello_prod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
import time

from jobs import ImageOptions, JobOptions, ResourceOptions, SchedulingOptions, job
from jobs.cli import submit_job


@job(
options=JobOptions(
# A job with explicit Dockerfile
image=ImageOptions(dockerfile="Dockerfile"),
resources=ResourceOptions(memory="256Mi", cpu="4"),
scheduling=SchedulingOptions(
priority_class="production",
),
)
)
def prod_training():
print("Hello, World!")
time.sleep(60)


if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("urllib3.connectionpool").setLevel(logging.INFO)

submit_job(prod_training)
92 changes: 0 additions & 92 deletions raycluster-manifest.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion raycluster-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
image:
tag: 2.9.0-py310
tag: 2.11.0-py311
worker:
maxReplicas: 1
resources:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ pyyaml==6.0.1
# via
# kubernetes
# ray
ray==2.10.0
ray==2.11.0
referencing==0.34.0
# via
# jsonschema
Expand Down
28 changes: 28 additions & 0 deletions single-clusterqueue-setup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ metadata:
name: "cluster-queue"
spec:
namespaceSelector: {} # match all.
# Allow preemption of workloads by higher priority ones
preemption:
reclaimWithinCohort: Any
borrowWithinCohort:
policy: LowerPriority
maxPriorityThreshold: 100
withinClusterQueue: LowerPriority
resourceGroups:
- coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
flavors:
Expand All @@ -28,3 +35,24 @@ metadata:
name: "user-queue"
spec:
clusterQueue: "cluster-queue"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
name: background
value: 1
description: "Background (=lowest) priority"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
name: development
value: 100
description: "Development priority"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
name: production
value: 1000
description: "Production priority"
10 changes: 9 additions & 1 deletion src/jobs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
from jobs import assembler
from jobs.image import Image
from jobs.job import ImageOptions, Job, JobOptions, ResourceOptions, job
from jobs.job import (
ImageOptions,
Job,
JobOptions,
ResourceOptions,
SchedulingOptions,
job,
)

__all__ = [
Image,
Job,
JobOptions,
ImageOptions,
ResourceOptions,
SchedulingOptions,
job,
assembler,
]
2 changes: 1 addition & 1 deletion src/jobs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def submit_job(job: Job) -> None:
logging.debug(f"Execution mode: {mode}")

image: Image | None = None
if mode in [ExecutionMode.DOCKER, ExecutionMode.KUEUE, ExecutionMode.RAYCLUSTER]:
if mode in [ExecutionMode.DOCKER, ExecutionMode.KUEUE]:
image = job.build_image()
if image is None:
raise RuntimeError("Could not build container image")
Expand Down
18 changes: 16 additions & 2 deletions src/jobs/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ def to_ray(self) -> dict[str, Any]:
)


@dataclass(frozen=True)
class SchedulingOptions:
priority_class: str | None = None
"""Kueue priority class name"""

queue_name: str | None = None
"""Kueue local queue name"""


@dataclass(frozen=True)
class ImageOptions:
name: str | None = None
Expand Down Expand Up @@ -118,15 +127,20 @@ def _is_yaml(path: AnyPath) -> bool:
if not self.build_context.is_dir():
raise ValueError(f"Build context must be a directory: {self.build_context}")

if self.dockerfile and not self.dockerfile.is_relative_to(self.build_context):
raise ValueError("Dockerfile must be relative to build context")
if self.dockerfile and not self.dockerfile.resolve().is_relative_to(
self.build_context.resolve()
):
raise ValueError(
f"Dockerfile must be relative to build context {self.build_context}"
)


@dataclass(frozen=True)
class JobOptions:
resources: ResourceOptions | None
"""Resource requests for this job in Kubernetes format (see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes)"""
image: ImageOptions | None
scheduling: SchedulingOptions | None


class Job:
Expand Down
16 changes: 12 additions & 4 deletions src/jobs/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import jobs
from jobs import Image, Job
from jobs.util import sanitize_rfc1123_domain_name
from jobs.util import remove_none_values, sanitize_rfc1123_domain_name

JOBS_EXECUTE_CMD = "jobs_execute"

Expand Down Expand Up @@ -70,11 +70,19 @@ def __init__(self, **kwargs: str) -> None:
config.load_kube_config()

def _make_job_crd(self, job: Job, image: Image) -> client.V1Job:
sched_opts = job.options.scheduling
metadata = client.V1ObjectMeta(
generate_name=sanitize_rfc1123_domain_name(job.name),
labels={
"kueue.x-k8s.io/queue-name": self._queue,
},
labels=remove_none_values(
{
"kueue.x-k8s.io/queue-name": sched_opts.queue_name
if sched_opts and sched_opts.queue_name
else self._queue,
"kueue.x-k8s.io/priority-class": sched_opts.priority_class
if sched_opts
else None,
}
),
)

# Job container
Expand Down