diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_gpu_install.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_gpu_install.yaml
new file mode 100644
index 000000000..e98c3ece4
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_gpu_install.yaml
@@ -0,0 +1,28 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-amd-gpu
+
+---
+
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: openshift-amd-gpu-operator-group
+  namespace: openshift-amd-gpu
+spec: {}
+
+---
+
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: amd-gpu-operator
+  namespace: openshift-amd-gpu
+spec:
+  channel: alpha
+  installPlanApproval: Automatic
+  name: amd-gpu-operator
+  source: community-operators
+  sourceNamespace: openshift-marketplace
+
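The manifest above creates the openshift-amd-gpu namespace, an OperatorGroup, and a Subscription to the community amd-gpu-operator (alpha channel). A quick manual check that OLM resolved the Subscription might look like the following sketch, assuming a logged-in cluster-admin oc session:

  oc get subscription amd-gpu-operator -n openshift-amd-gpu -o jsonpath='{.status.currentCSV}{"\n"}'
  oc get csv -n openshift-amd-gpu -o custom-columns=NAME:.metadata.name,PHASE:.status.phase
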
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_operator.sh b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_operator.sh
new file mode 100755
index 000000000..ec824fe4b
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_operator.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+set -e
+
+GPU_INSTALL_DIR="$(dirname "$0")"
+
+function create_registry_network() {
+    oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"storage":{"emptyDir":{}}}}'
+    oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"managementState":"Managed"}}'
+    echo "Internal registry network created."
+}
+
+function check_registry() {
+    registry_pod=$(oc get pod -l docker-registry=default -n openshift-image-registry --no-headers -o custom-columns=":metadata.name")
+    if [ -n "$registry_pod" ]; then
+        echo "Internal registry pod ($registry_pod) is present."
+        return 0 # Success
+    else
+        echo "Internal registry pod is not present."
+        create_registry_network
+        return 1 # Failure
+    fi
+}
+function wait_while {
+    local seconds timeout interval
+    interval=2
+    seconds=0
+    timeout=$1
+    shift
+    while eval "$*"; do
+        seconds=$(( seconds + interval ))
+        sleep $interval
+        echo -n '.'
+        [[ $seconds -gt $timeout ]] && echo "Time out of ${timeout} exceeded" && return 1
+    done
+    if [[ "$seconds" != '0' ]]; then
+        echo ''
+    fi
+    return 0
+}
+
+has_csv_succeeded() {
+    local ns=$1
+    local subscription=$2
+    local csv
+    csv=$(oc get subscriptions.operators.coreos.com "${subscription}" -n "${ns}" -o=custom-columns=CURRENT_CSV:.status.currentCSV --no-headers=true)
+    if [ x"$csv" != "x" ] && [ x"$csv" != x"" ]
+    then
+        phase=$(oc get clusterserviceversions.operators.coreos.com -n "${ns}" "${csv}" -o=custom-columns=PHASE:.status.phase --no-headers=true)
+        if [ "$phase" = "Succeeded" ]
+        then
+            return 0
+        fi
+    fi
+
+    return 1
+}
+
+function create_devconfig() {
+    oc create -f - </dev/null)"
+    daemon_status="$(oc get daemonset -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
+    if [[ -n "$daemon_status" || -n "$pod_status" ]] ; then
+        echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
+        echo "Pods status: '$pod_status'"
+        echo "Daemonset status: '$daemon_status'"
+        oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+            oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
+        break
+    fi
+    echo "Waiting for Pods or Daemonset with label app='$pod_label' in namespace '$namespace' to be present..."
+    sleep 5
+    done
+}
+
+function machineconfig_updates {
+    # There should be only "True" and there should be at least one
+    [ True = "$(oc get machineconfigpool --no-headers=true '-o=custom-columns=UPDATED:.status.conditions[?(@.type=="Updated")].status' | uniq)" ]
+}
+
+function monitor_logs() {
+    local pod_name=$1
+    local search_text=$2
+    local ns=$3
+    local c_name=$4
+    echo "Monitoring logs for pod $pod_name..."
+
+    # Use 'kubectl logs' command to fetch logs continuously
+
+    oc logs "$pod_name" -c "$c_name" -n "$ns" | while read -r line; do
+        if [[ $line == *"$search_text"* ]]; then
+            echo "Found \"$search_text\" in pod logs: $line"
+        fi
+    done
+}
+
+check_registry
+status=$?
+
+# Blacklist the inbox drivers with a MachineConfig if the registry check was successful
+if [ $status -eq 0 ]; then
+    oc apply -f "$GPU_INSTALL_DIR/blacklist_driver.yaml"
+else
+    return 1
+fi
+
+sleep 120
+wait_while 1800 ! machineconfig_updates
+
+echo "Installing NFD operator"
+oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
+wait_while 360 ! has_csv_succeeded openshift-nfd nfd
+oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
+echo "Installing KMM operator"
+oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
+wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
+echo "Installing AMD operator"
+oc apply -f "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
+wait_while 360 ! has_csv_succeeded openshift-amd-gpu amd-gpu-operator
+create_devconfig
+name=$(oc get pod -n openshift-amd-gpu -l openshift.io/build.name -oname)
+wait_while 1200 ! monitor_logs "$name" "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu" openshift-amd-gpu docker-build
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/blacklist_driver.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/blacklist_driver.yaml
new file mode 100644
index 000000000..36be6e2ff
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/blacklist_driver.yaml
@@ -0,0 +1,17 @@
+apiVersion: machineconfiguration.openshift.io/v1
+kind: MachineConfig
+metadata:
+  labels:
+    machineconfiguration.openshift.io/role: worker
+  name: amdgpu-module-blacklist
+spec:
+  config:
+    ignition:
+      version: 3.2.0
+    storage:
+      files:
+        - path: "/etc/modprobe.d/amdgpu-blacklist.conf"
+          mode: 420
+          overwrite: true
+          contents:
+            source: "data:text/plain;base64,YmxhY2tsaXN0IGFtZGdwdQo="
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/kmm_operator_install.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/kmm_operator_install.yaml
new file mode 100644
index 000000000..7e8dddc2f
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/kmm_operator_install.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-kmm
+
+---
+
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: openshift-kmm-operator-group
+  namespace: openshift-kmm
+spec: {}
+
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: kernel-module-management
+  namespace: openshift-kmm
+spec:
+  channel: stable
+  installPlanApproval: Automatic
+  name: kernel-module-management
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
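amd_operator.sh drives the whole installation: it enables the internal image registry, blacklists the in-tree amdgpu driver through a MachineConfig, then installs the NFD, KMM and AMD GPU operators in turn and polls each CSV via wait_while/has_csv_succeeded. A hand-run equivalent of that CSV check, shown here for KMM as a sketch using the namespace and Subscription name from the manifest above:

  csv=$(oc get subscription kernel-module-management -n openshift-kmm -o jsonpath='{.status.currentCSV}')
  [ -n "$csv" ] && oc get csv "$csv" -n openshift-kmm -o jsonpath='{.status.phase}{"\n"}'
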
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
index 817c508ef..d4b186d6b 100755
--- a/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
@@ -12,10 +12,10 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla
 sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"
 
 oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"
-
+oc apply -f "$GPU_INSTALL_DIR/nfd_operator.yaml"
 
 echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"
 
-oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub nfd
+oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
 
 oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml
index db6aa9e48..4cb56c3af 100644
--- a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/nfd_deploy.yaml
@@ -2,7 +2,7 @@ apiVersion: nfd.openshift.io/v1
 kind: NodeFeatureDiscovery
 metadata:
   name: nfd-instance
-  namespace: nvidia-gpu-operator
+  namespace: openshift-nfd
 spec:
   instance: "" # instance is empty by default
   topologyupdater: false # False by default
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml
new file mode 100644
index 000000000..764e11d16
--- /dev/null
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/nfd_operator.yaml
@@ -0,0 +1,29 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-nfd
+
+---
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: openshift-nfd-og
+  namespace: openshift-nfd
+spec:
+  targetNamespaces:
+  - openshift-nfd
+  upgradeStrategy: Default
+
+
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: nfd
+  namespace: openshift-nfd
+spec:
+  channel: "stable"
+  installPlanApproval: Automatic
+  name: nfd
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
diff --git a/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh b/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
index 8455ff405..42c37d014 100755
--- a/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
+++ b/ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
@@ -28,7 +28,6 @@ jq -r --arg INSTANCE_TYPE "$INSTANCE_TYPE" '.spec.template.spec.providerSpec.val
     | del(.metadata.uid)
     | del(.metadata.creationTimestamp)
     | del(.metadata.resourceVersion)
-    | .spec.template.spec.taints += [{"effect": "NoSchedule" , "key": "nvidia.com/gpu" , "value": "None"}]
     ' /tmp/source-machineset.json > /tmp/gpu-machineset.json
 
 # Change machineset name
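With these changes, gpu_deploy.sh installs NFD from its own openshift-nfd namespace and the AWS GPU machineset no longer receives the nvidia.com/gpu taint. One possible way to exercise the new AMD path end to end, assuming a logged-in cluster-admin session and the repository layout from this patch:

  bash ods_ci/tasks/Resources/Provisioning/GPU/amd_operator/amd_operator.sh
  # the worker MachineConfigPool should eventually report Updated=True,
  # and the build pod in openshift-amd-gpu should push the driver image
  oc get machineconfigpool worker
  oc get pods -n openshift-amd-gpu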