first commit

2025-12-16 17:56:13 +11:00
commit 2da0e4f030
70 changed files with 11317 additions and 0 deletions
--- a/values.yaml
+++ b/values.yaml
@@ -0,0 +1,312 @@
+# NFD related configs
+# schema reference: https://github.com/kubernetes-sigs/node-feature-discovery/blob/release-0.16/deployment/helm/node-feature-discovery/values.yaml
+node-feature-discovery:
+  # -- Set to true/false to enable/disable the installation of node feature discovery (NFD) operator
+  enabled: true
+  worker:
+    # -- Set tolerations for NFD worker daemonset
+    tolerations:
+      - key: "amd-dcm"
+        operator: "Equal"
+        value: "up"
+        effect: "NoExecute"
+    # -- Set nodeSelector for NFD worker daemonset
+    nodeSelector: {}
+# KMM related configs
+kmm:
+  # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator
+  enabled: true
+# -- Default NFD rule will detect amd gpu based on pci vendor ID
+installdefaultNFDRule: true
+# -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart
+upgradeCRD: true
+crds:
+  defaultCR:
+    # -- Deploy default DeviceConfig during helm chart installation
+    install: true
+    # -- Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. Your existing DeviceConfig may conflict with upgraded default DeviceConfig 
+    upgrade: false
+deviceConfig:
+  spec:
+    # -- Set node selector for the default DeviceConfig
+    selector:
+      feature.node.kubernetes.io/amd-gpu: "true"
+    driver:
+      # -- enable/disable out-of-tree driver management, set to false to use inbox driver
+      enable: false
+      # -- enable/disable putting a blacklist amdgpu entry in modprobe config, which requires node labeller to run
+      blacklist: false
+      # -- image repository to store out-of-tree driver image, DO NOT put image tag since operator automatically manage it for users
+      image: "docker.io/myUserName/driverImage"
+      # -- image pull secret for pull/push access of the driver image repository, input secret name like {"name": "mysecret"}
+      imageRegistrySecret: {}
+      imageRegistryTLS:
+        # -- set to true to use plain HTTP for driver image repository
+        insecure: false
+        # -- set to true to skip TLS validation for driver image repository
+        insecureSkipTLSVerify: false
+      # -- specify an out-of-tree driver version to install
+      version: "6.4"
+      # -- specify the secrets to sign the out-of-tree kernel module inside driver image for secure boot, e.g. input private / public key secret {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}}
+      imageSign: {}
+      # -- configure the out-of-tree driver image build within the cluster. e.g. {"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":"false","insecureSkipTLSVerify":"false"}}}
+      imageBuild: {}
+      # -- configure driver tolerations so that operator can manage out-of-tree drivers on tainted nodes
+      tolerations: []
+      upgradePolicy:
+        # -- enable/disable automatic driver upgrade feature 
+        enable: true
+        # -- how many nodes can be upgraded in parallel
+        maxParallelUpgrades: 3
+        # -- maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state
+        maxUnavailableNodes: 25%
+        # -- whether reboot each worker node or not during the driver upgrade
+        rebootRequired: true
+        nodeDrainPolicy:
+          # -- whether force draining is allowed or not
+          force: true
+          # -- the length of time in seconds to wait before giving up drain, zero means infinite
+          timeoutSeconds: 300
+          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, minus value means follow pod defined grace period
+          gracePeriodSeconds: -1
+        podDeletionPolicy:
+          # -- whether force deletion is allowed or not
+          force: true
+          # -- the length of time in seconds to wait before giving up on pod deletion, zero means infinite
+          timeoutSeconds: 300
+          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, minus value means follow pod defined grace period
+          gracePeriodSeconds: -1
+    commonConfig:
+      # -- init container image
+      initContainerImage: busybox:1.36
+      utilsContainer:
+        # -- gpu operator utility container image
+        image: docker.io/rocm/gpu-operator-utils:v1.4.0
+        # -- utility container image pull policy
+        imagePullPolicy: IfNotPresent
+        # -- utility container image pull secret, e.g. {"name": "mySecretName"}
+        imageRegistrySecret: {}
+    devicePlugin:
+      # -- device plugin image
+      devicePluginImage: rocm/k8s-device-plugin:latest
+      # -- device plugin image pull policy
+      devicePluginImagePullPolicy: IfNotPresent
+      # -- device plugin tolerations
+      devicePluginTolerations: []
+      # -- pass supported flags and their values while starting device plugin daemonset, e.g. {"resource_naming_strategy": "single"} or {"resource_naming_strategy": "mixed"}
+      devicePluginArguments: {}
+      # -- enable / disable node labeller
+      enableNodeLabeller: true
+      # -- node labeller image
+      nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest
+      # -- node labeller image pull policy
+      nodeLabellerImagePullPolicy: IfNotPresent
+      # -- node labeller tolerations
+      nodeLabellerTolerations: []
+      # -- pass supported labels while starting node labeller daemonset, default ["vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"], also support ["compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"]
+      nodeLabellerArguments: []
+      # -- image pull secret for device plugin and node labeller, e.g. {"name": "mySecretName"}
+      imageRegistrySecret: {}
+      upgradePolicy:
+        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
+        upgradeStrategy: RollingUpdate
+        # -- the maximum number of Pods that can be unavailable during the update process
+        maxUnavailable: 1
+    metricsExporter:
+      # -- enable / disable device metrics exporter
+      enable: true
+      # -- type of service for exposing metrics endpoint, ClusterIP or NodePort
+      serviceType: ClusterIP
+      # -- internal port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000).
+      port: 5000
+      # -- external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default)
+      nodePort: 32500
+      # -- metrics exporter image
+      image: docker.io/rocm/device-metrics-exporter:v1.4.0
+      # -- metrics exporter image pull policy
+      imagePullPolicy: "IfNotPresent"
+      # -- name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"}
+      config: {}
+      # -- metrics exporter tolerations
+      tolerations: []
+      # -- metrics exporter image pull secret, e.g. {"name": "pullSecretName"}
+      imageRegistrySecret: {}
+      # -- metrics exporter node selector, if not specified it will reuse spec.selector
+      selector: {}
+      upgradePolicy:
+        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
+        upgradeStrategy: RollingUpdate
+        # -- the maximum number of Pods that can be unavailable during the update process
+        maxUnavailable: 1
+      rbacConfig:
+        # -- enable/disable kube rbac proxy
+        enable: false
+        # -- kube rbac proxy side car container image
+        image: quay.io/brancz/kube-rbac-proxy:v0.18.1
+        # -- disable https protecting the proxy endpoint
+        disableHttps: false
+        # -- certificate secret to mount in kube-rbac container for TLS, self signed certificates will be generated by default, e.g. {"name": "secretName"}
+        secret: {}
+        # -- reference to a configmap containing the client CA (key: ca.crt) for mTLS client validation, e.g. {"name": "configMapName"}
+        clientCAConfigMap: {}
+        staticAuthorization:
+          # -- enables static authorization using client certificate CN
+          enable: false
+          # -- expected CN (Common Name) from client cert (e.g., Prometheus SA identity)
+          clientName: ""
+      prometheus:
+        serviceMonitor:
+          # -- enable or disable ServiceMonitor creation
+          enable: false
+          # -- frequency to scrape metrics. Accepts values with time unit suffix: "30s", "1m", "2h", "500ms"
+          interval: 30s
+          # -- define if Prometheus should attach node metadata to the target, e.g. {"node": "true"}
+          attachMetadata: {}
+          # -- choose the metric's labels on collisions with target labels
+          honorLabels: true
+          # -- control whether the scrape endpoints honor timestamps
+          honorTimestamps: false
+          # -- additional labels to add to the ServiceMonitor
+          labels: {}
+          # -- relabelConfigs to apply to samples before ingestion
+          relabelings: []
+          # -- relabeling rules applied to individual scraped metrics
+          metricRelabelings: []
+          # -- optional Prometheus authorization configuration for accessing the endpoint
+          authorization: {}
+          # -- TLS settings used by Prometheus to connect to the metrics endpoint
+          tlsConfig: {}
+    testRunner:
+      # -- enable / disable test runner
+      enable: false
+      # -- test runner image
+      image: docker.io/rocm/test-runner:v1.4.0
+      # -- test runner image pull policy
+      imagePullPolicy: "IfNotPresent"
+      # -- test runner config map, e.g. {"name": "myConfigMap"}
+      config: {}
+      logsLocation:
+        # -- test runner internal mounted directory to save test run logs
+        mountPath: "/var/log/amd-test-runner"
+        # -- host directory to save test run logs
+        hostPath: "/var/log/amd-test-runner"
+        # -- a list of secrets that contain connectivity info to multiple cloud providers
+        logsExportSecrets: []
+      upgradePolicy:
+        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
+        upgradeStrategy: RollingUpdate
+        # -- the maximum number of Pods that can be unavailable during the update process
+        maxUnavailable: 1
+      # -- test runner tolerations
+      tolerations: []
+      # -- test runner image pull secret
+      imageRegistrySecret: {}
+      # -- test runner node selector, if not specified it will reuse spec.selector
+      selector: {}
+    configManager:
+      # -- enable/disable the config manager 
+      enable: false
+      # -- config manager image
+      image: docker.io/rocm/device-config-manager:v1.4.0
+      # -- image pull policy for config manager image
+      imagePullPolicy: IfNotPresent
+      # -- image pull secret for config manager image, e.g. {"name": "myPullSecret"}
+      imageRegistrySecret: {}
+      # -- config map for config manager, e.g. {"name": "myConfigMap"}
+      config: {}
+      # -- node selector for config manager, if not specified it will reuse spec.selector
+      selector: {}
+      upgradePolicy:
+        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
+        upgradeStrategy: RollingUpdate
+        # -- the maximum number of Pods that can be unavailable during the update process
+        maxUnavailable: 1
+      # -- config manager tolerations
+      configManagerTolerations: []
+# AMD GPU operator controller related configs
+controllerManager:
+  manager:
+    args:
+      - --config=controller_manager_config.yaml
+    containerSecurityContext:
+      allowPrivilegeEscalation: false
+    image:
+      # -- AMD GPU operator controller manager image repository
+      repository: docker.io/rocm/gpu-operator
+      # -- AMD GPU operator controller manager image tag
+      tag: v1.4.0
+    # -- Image pull policy for AMD GPU operator controller manager pod
+    imagePullPolicy: Always
+    # -- Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image
+    imagePullSecrets: ""
+    tolerations:
+      - key: "node-role.kubernetes.io/master"
+        operator: "Equal"
+        value: ""
+        effect: "NoSchedule"
+      - key: "node-role.kubernetes.io/control-plane"
+        operator: "Equal"
+        value: ""
+        effect: "NoSchedule"
+    resources:
+      limits:
+        cpu: 1000m
+        memory: 1Gi
+      requests:
+        cpu: 100m
+        memory: 256Mi
+  # -- Node selector for AMD GPU operator controller manager deployment
+  nodeSelector: {}
+  # -- Deployment affinity configs for controller manager
+  affinity:
+    nodeAffinity:
+      preferredDuringSchedulingIgnoredDuringExecution:
+        - weight: 1
+          preference:
+            matchExpressions:
+              - key: node-role.kubernetes.io/control-plane
+                operator: Exists
+  replicas: 1
+  serviceAccount:
+    annotations: {}
+  env:
+    simEnable: false
+kmmDevicePlugin:
+  serviceAccount:
+    annotations: {}
+kmmModuleLoader:
+  serviceAccount:
+    annotations: {}
+kubernetesClusterDomain: cluster.local
+managerConfig:
+  controllerManagerConfigYaml: |-
+    healthProbeBindAddress: :8081
+    metricsBindAddress: 127.0.0.1:8080
+    leaderElection:
+      enabled: true
+      resourceID: gpu.amd.com
+metricsService:
+  ports:
+    - name: https
+      port: 8443
+      protocol: TCP
+      targetPort: https
+  type: ClusterIP
+nodeLabeller:
+  serviceAccount:
+    annotations: {}
+metricsExporter:
+  serviceAccount:
+    annotations: {}
+testRunner:
+  serviceAccount:
+    annotations: {}
+configManager:
+  serviceAccount:
+    annotations: {}
+utilsContainer:
+  serviceAccount:
+    annotations: {}
+global:
+  proxy:
+    env: {}