Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Job lifecycle notifications #30

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
9a6eb80
feat(watcher): Add rudimentary notification tooling
AdrianoKF Jul 16, 2024
47ec982
feat: Add metadata to job options
AdrianoKF Jul 16, 2024
45b0f20
feat(watcher): Handle jobs with failures
AdrianoKF Jul 16, 2024
450ba6f
feat(watcher): Include failed containers in message
AdrianoKF Jul 16, 2024
8e3855e
feat(watcher): Add lookup for Kueue Workload resources
AdrianoKF Jul 16, 2024
76c5f58
feat(watcher): Workload monitoring, code structure
AdrianoKF Jul 17, 2024
b7822d7
fix(watcher): Don't reimplement Kueue informer factory
AdrianoKF Jul 17, 2024
f516c40
fix(watcher): Remove Kueue kjobctl dependency
AdrianoKF Jul 18, 2024
12e3e65
feat: Add React dashboard (WIP)
AdrianoKF Jul 31, 2024
8991380
Merge remote-tracking branch 'origin/main' into monitoring
AdrianoKF Sep 11, 2024
ab1062a
feat: Add Slack notification settings to example job
AdrianoKF Sep 11, 2024
08b0fed
chore: Fix pre-commit excludes
AdrianoKF Sep 11, 2024
c1cbaf0
feat(watcher): Add README
AdrianoKF Sep 12, 2024
4d0e8b3
fix(watcher): Improve code structure and config handling
AdrianoKF Sep 12, 2024
89e300c
feat(ci): Add CI workflow for watcher component
AdrianoKF Sep 12, 2024
4823c72
fix(watcher): Improve deployment and docs
AdrianoKF Sep 12, 2024
ad61470
fix(ci): Move golangci-lint args to right place
AdrianoKF Sep 12, 2024
f009e82
Merge remote-tracking branch 'origin/main' into monitoring
AdrianoKF Sep 12, 2024
63f854a
feat(watcher): Add Helm chart for deployment
AdrianoKF Sep 12, 2024
c52c9dc
Merge remote-tracking branch 'origin/main' into monitoring
AdrianoKF Sep 17, 2024
957c71f
fix(ci): Record and upload test coverage for watcher
AdrianoKF Sep 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
.envrc
data
tests
watcher/
20 changes: 18 additions & 2 deletions example_hello.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import time
from pathlib import Path

from jobs import ImageOptions, JobOptions, ResourceOptions, SchedulingOptions, job
from jobs import (
ImageOptions,
JobOptions,
Metadata,
ResourceOptions,
SchedulingOptions,
job,
)


@job(
Expand All @@ -11,12 +18,21 @@
name="localhost:5000/hello-world-dev",
tag="latest",
),
metadata=Metadata(
annotations={
"x-jobby.io/notify-channel": "slack",
"x-jobby.io/slack-channel-ids": "mlops-test",
}
),
resources=ResourceOptions(memory="1Gi", cpu="1"),
scheduling=SchedulingOptions(
priority_class="background", queue_name="user-queue"
),
)
)
def hello_world():
start_ts = time.perf_counter()
print("Hello, World!")
time.sleep(10)
time.sleep(2)
end_ts = time.perf_counter()
print(f"Elapsed: {end_ts - start_ts:.2f} s")
4 changes: 3 additions & 1 deletion src/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
ImageOptions,
Job,
JobOptions,
Metadata,
ResourceOptions,
SchedulingOptions,
job,
)

__all__ = [
"Image",
"ImageOptions",
"Job",
"JobOptions",
"ImageOptions",
"Metadata",
"ResourceOptions",
"SchedulingOptions",
"job",
Expand Down
6 changes: 6 additions & 0 deletions src/jobs/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,16 @@ def _is_yaml(path: AnyPath) -> bool:
)


@dataclass
class Metadata:
annotations: dict[str, str]


@dataclass(frozen=True)
class JobOptions:
resources: ResourceOptions | None = None
"""Resource requests for this job in Kubernetes format (see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes)"""
metadata: Metadata | None = None
image: ImageOptions | None = None
scheduling: SchedulingOptions | None = None

Expand Down
8 changes: 7 additions & 1 deletion src/jobs/runner/kueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
from jobs import Image, Job
from jobs.runner.base import Runner, _make_executor_command
from jobs.types import K8sResourceKind
from jobs.utils.kubernetes import KubernetesNamespaceMixin, sanitize_rfc1123_domain_name
from jobs.utils.kubernetes import (
KubernetesNamespaceMixin,
k8s_annotations,
sanitize_rfc1123_domain_name,
)
from jobs.utils.kueue import kueue_scheduling_labels


Expand All @@ -20,8 +24,10 @@ def _make_job_crd(self, job: Job, image: Image, namespace: str) -> client.V1Job:
raise ValueError("Job options must be specified")

scheduling_labels = kueue_scheduling_labels(job, self.namespace)
annotations = k8s_annotations(job)

metadata = client.V1ObjectMeta(
annotations=annotations,
generate_name=sanitize_rfc1123_domain_name(job.name),
labels=scheduling_labels,
)
Expand Down
7 changes: 6 additions & 1 deletion src/jobs/runner/ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
from jobs.job import RayResourceOptions
from jobs.runner.base import Runner, _make_executor_command
from jobs.types import K8sResourceKind, NoOptions
from jobs.utils.kubernetes import KubernetesNamespaceMixin, sanitize_rfc1123_domain_name
from jobs.utils.kubernetes import (
KubernetesNamespaceMixin,
k8s_annotations,
sanitize_rfc1123_domain_name,
)
from jobs.utils.kueue import kueue_scheduling_labels


Expand Down Expand Up @@ -118,6 +122,7 @@ def _create_ray_job(self, job: Job, image: Image) -> dict:
"metadata": {
"name": sanitize_rfc1123_domain_name(job_id),
"labels": scheduling_labels,
"annotations": k8s_annotations(job),
},
"spec": {
"jobId": job_id,
Expand Down
10 changes: 10 additions & 0 deletions src/jobs/utils/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import kubernetes

from jobs.job import Job


def sanitize_rfc1123_domain_name(s: str) -> str:
"""Sanitize a string to be compliant with RFC 1123 domain name
Expand All @@ -12,6 +14,14 @@ def sanitize_rfc1123_domain_name(s: str) -> str:
return s.replace("_", "-")


def k8s_annotations(job: Job) -> dict[str, str]:
"""Determine the Kubernetes annotations for a Job"""

if not job.options or not job.options.metadata:
return {}
return job.options.metadata.annotations


class KubernetesNamespaceMixin:
"""Determine the desired or current Kubernetes namespace."""

Expand Down
2 changes: 2 additions & 0 deletions watcher/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Binary output
watcher
2 changes: 2 additions & 0 deletions watcher/.ko.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
defaultPlatforms:
- linux/arm64
58 changes: 58 additions & 0 deletions watcher/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module watcher

go 1.22.5

require (
github.com/nikoksr/notify v0.41.0
github.com/sirupsen/logrus v1.9.3
k8s.io/api v0.30.2
k8s.io/apimachinery v0.30.2
k8s.io/client-go v0.30.2
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.3 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/imdario/mergo v0.3.6 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/slack-go/slack v0.12.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/objx v0.5.0 // indirect
github.com/stretchr/testify v1.8.4 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/oauth2 v0.10.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/term v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.120.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
Loading